pycodamath 1.1.2__tar.gz → 1.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycodamath
3
- Version: 1.1.2
3
+ Version: 1.1.3
4
4
  Summary: Compositional data (CoDa) analysis tools for Python
5
5
  Author-email: Christian Brinch <cbri@food.dtu.dk>
6
6
  License: MIT
@@ -285,9 +285,9 @@ The keyword `palette` is a dict mapping each unique group value to a colour.
285
285
  import pandas as pd
286
286
  data = pd.read_csv('example/kilauea_iki_chem.csv')
287
287
  mypca = coda.pca.Biplot(data)
288
- mypca.removelabels()
288
+ mypca.remove("labels")
289
289
  mypca.plotloadings(cluster=True)
290
290
  print(mypca.clusterlegend)
291
- mypca.removelabels()
291
+ mypca.remove("labels")
292
292
  mypca.plotloadings(labels=['FeO', 'Al2O3', 'CaO'], cluster=False)
293
293
  mypca.adjustloadinglabels()
@@ -254,9 +254,9 @@ The keyword `palette` is a dict mapping each unique group value to a colour.
254
254
  import pandas as pd
255
255
  data = pd.read_csv('example/kilauea_iki_chem.csv')
256
256
  mypca = coda.pca.Biplot(data)
257
- mypca.removelabels()
257
+ mypca.remove("labels")
258
258
  mypca.plotloadings(cluster=True)
259
259
  print(mypca.clusterlegend)
260
- mypca.removelabels()
260
+ mypca.remove("labels")
261
261
  mypca.plotloadings(labels=['FeO', 'Al2O3', 'CaO'], cluster=False)
262
262
  mypca.adjustloadinglabels()
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "pycodamath"
7
- version = "1.1.2"
7
+ version = "1.1.3"
8
8
  authors = [
9
9
  { name="Christian Brinch", email="cbri@food.dtu.dk" },
10
10
  ]
@@ -45,9 +45,8 @@ def norm(balances):
45
45
  def check_basis(psi):
46
46
  ''' Check if basis is orthonormal '''
47
47
  ident = psi @ psi.T
48
- if np.trace(ident) != ident.shape[0]:
48
+ if np.abs(np.trace(ident) - ident.shape[0]) > 1e-6:
49
49
  raise AttributeError("Error: Basis is not normalized.")
50
- #if np.abs(np.sum(ident-np.diag(np.diagonal(ident)))) > 1e-6:
51
50
  if np.abs(ident.sum()-np.trace(ident)) > 1e-6:
52
51
  raise AttributeError("Error: Basis is not orthogonal.")
53
52
 
@@ -83,8 +82,8 @@ def get_covariance_ellipse(data, conf=95):
83
82
  ("Error: get_covariance_ellipse expects only two columns. " +
84
83
  "Got {0:d}.").format(len(data.columns)))
85
84
 
86
- lambda_, angle = np.linalg.eig(np.cov(data.loc[:, 0], data.loc[:, 1]))
87
- lambda_ = np.sqrt(lambda_)
85
+ lambda_, angle = np.linalg.eigh(np.cov(data.loc[:, 0], data.loc[:, 1]))
86
+ lambda_ = np.sqrt(np.maximum(lambda_, 0))
88
87
 
89
88
  chi2_scales = {90: 4.605, 95: 5.991, 99: 9.210}
90
89
  if conf not in chi2_scales:
@@ -93,12 +92,11 @@ def get_covariance_ellipse(data, conf=95):
93
92
 
94
93
 
95
94
  return {'shape': (lambda_[0]*np.sqrt(scale), lambda_[1]*np.sqrt(scale)),
96
- # 'angle': np.arccos(-angle[0, 0]),
97
95
  'angle': np.arctan(angle[1, 0]/angle[0, 0]),
98
96
  'center': (np.mean(data.loc[:, 0]), np.mean(data.loc[:, 1]))}
99
97
 
100
98
 
101
- def plot_covariance_ellipse(axis, ellipse, color=0):
99
+ def plot_covariance_ellipse(axis, ellipse, color=None):
102
100
  ''' plot covariance ellipse '''
103
101
  if color is None:
104
102
  color = 'black'
@@ -0,0 +1,531 @@
1
+ """Class and methods for making compositional biplots based on PCA"""
2
+
3
+ __author__ = "Christian Brinch"
4
+ __copyright__ = "Copyright 2019"
5
+ __credits__ = ["Christian Brinch"]
6
+ __license__ = "AFL 3.0"
7
+ __version__ = "1.1"
8
+ __maintainer__ = "Christian Brinch"
9
+ __email__ = "cbri@food.dtu.dk"
10
+
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ import webcolors as wc
14
+ from matplotlib.colors import ListedColormap
15
+ from matplotlib import cm
16
+ from matplotlib.transforms import TransformedBbox
17
+ import matplotlib.patches as mpatches
18
+ import pandas as pd
19
+ import scipy.stats as st
20
+ from scipy.cluster.hierarchy import linkage, cut_tree
21
+ from pycodamath import extra
22
+
23
+
24
class GeomObj:
    """A generic container of geometric objects.

    Every keyword argument is stored as an instance attribute.  One of them
    must be ``vertices``: a pair ``(x, y)`` of equally long coordinate
    sequences describing a polygon, because the polygon area is computed on
    construction and stored in ``area``.
    """

    def __init__(self, **kwargs):
        vars(self).update(kwargs)
        self.area = self.polyarea()

    def polyarea(self):
        """Return the area of the polygon stored in ``self.vertices``.

        Uses the shoelace formula on the two coordinate sequences; the
        absolute value makes the result independent of vertex orientation.
        """
        x, y = (np.asarray(coords, dtype=float) for coords in self.vertices)
        return 0.5 * np.abs(x @ np.roll(y, 1) - y @ np.roll(x, 1))
35
+
36
+
37
def scree_plot(axis, eig_val):
    """Make a scree plot from singular values.

    Draws one bar per component showing its share of the total squared
    singular values, and annotates up to the first five bars with the
    cumulative share in percent.

    Parameters
    ----------
    axis : object providing ``set_xlabel``, ``set_ylabel``, ``set_xlim``,
        ``bar`` and ``annotate`` (presumably a matplotlib Axes).
    eig_val : sequence of numbers, one per component.
    """
    eig_val = np.asarray(eig_val, dtype=float)
    # Fraction of the summed squares carried by each component; computed once
    # and reused for the bars, the cumulative sum and the label positions.
    explained = eig_val**2 / np.sum(eig_val**2)
    csum = np.cumsum(explained)

    axis.set_xlabel("Component")
    axis.set_ylabel("Explained variance")
    axis.set_xlim(0, min(len(eig_val) + 1, 20))
    axis.bar(np.arange(len(eig_val)) + 1, explained)
    for i in range(min(5, len(eig_val))):
        axis.annotate(
            str(np.round(csum[i] * 100)) + "%",
            (i + 1.2, explained[i]),
        )
49
+
50
+
51
def _get_palette(group):
    """Map every distinct value in ``group`` to a colour from the jet colormap.

    Distinct values are taken in order of first appearance, so the same input
    always yields the same value-to-colour assignment (iterating a ``set`` of
    strings can change order between interpreter runs).
    """
    items = list(dict.fromkeys(group))
    cspace = cm.jet(np.linspace(0, 1, len(items)))
    return dict(zip(items, cspace))
54
+
55
+
56
+ def _svd(clr):
57
+ """Internal SVD function"""
58
+ scores, eig_val, loadings = np.linalg.svd(clr)
59
+ scores = pd.DataFrame(scores.T[0:2, :], columns=clr.index, index=["pc1", "pc2"])
60
+ loadings = pd.DataFrame(
61
+ (eig_val[:,np.newaxis] * loadings)[0:2],
62
+ columns=clr.columns[0 : len(eig_val)],
63
+ index=["pc1", "pc2"],
64
+ )
65
+ return scores, eig_val, loadings
66
+
67
+
68
+ def _bundle_loadings(loadings, threshold=0.05):
69
+ """Reduce the number of loadings by hierachical clustering"""
70
+ tree = linkage(loadings.T, "ward")
71
+ stump = cut_tree(tree, height=tree[-1][2] * threshold)
72
+ clusters = pd.DataFrame([], index=loadings.index)
73
+ legend = {}
74
+ counter = 1
75
+ stump_flat = np.concatenate(stump)
76
+ for i in set(stump_flat):
77
+ features = np.where(stump_flat == i)
78
+ if len(features[0]) > 1:
79
+ clusters[counter] = loadings[
80
+ [loadings.columns[j] for j in features[0]]
81
+ ].mean(axis=1)
82
+ legend[counter] = [loadings.columns[j] for j in features[0]]
83
+ counter += 1
84
+ else:
85
+ clusters[loadings.columns[features[0]]] = loadings[
86
+ loadings.columns[features[0]]
87
+ ]
88
+
89
+ return legend, clusters
90
+
91
+
92
+ def _do_bbox_overlap(box1, box2):
93
+ # If one rectangle is on left side of other
94
+ return not (
95
+ box1[1][0] > box2[2][0]
96
+ or box2[1][0] > box1[2][0]
97
+ or box1[3][1] > box2[1][1]
98
+ or box2[2][1] > box1[1][1]
99
+ )
100
+
101
+
102
class Biplot:
    """A class to create a PCA biplot.

    The input data is centred, scaled and clr-transformed through its
    ``.coda`` accessor and decomposed with ``_svd``.  The two leading
    components are kept as ``self.scores`` (rescaled to the range of the
    loadings) and ``self.loadings``.  All drawing happens on ``self.axis``.

    Attributes set here:
        axis          -- the axes being drawn on
        scores        -- DataFrame, rows ``pc1``/``pc2``, one column per sample
        loadings      -- DataFrame, rows ``pc1``/``pc2``, one column per feature
        patches       -- legend handles collected by the plot methods
        geomobj       -- ``GeomObj`` per group from the last hull/contour plot
        clusterlegend -- cluster number -> member features from the last
                         clustered loading plot (empty until then)
    """

    def __init__(self, data, axis=None, default=True):
        """Set up the axes and, if ``default`` is true, plot loadings and scores.

        ``axis`` may be an existing axes object; when omitted a new
        7.8 x 7.8 figure is created.
        """
        if axis is None:
            _, self.axis = plt.subplots(figsize=(7.8, 7.8))
        else:
            self.axis = axis
        self.axis.set(adjustable="box", aspect="equal")
        self.scores, eig_val, self.loadings = _svd(
            data.coda.center().coda.scale().coda.clr()
        )

        # scales[0]: largest absolute loading; scales[1]: largest absolute
        # score along pc1 and pc2 respectively.
        scales = [
            np.max(np.abs(self.loadings.values)),
            [np.max(np.abs(self.scores.loc[idx].values)) for idx in ["pc1", "pc2"]],
        ]

        # Percentage carried by each component; formatted with one decimal
        # (rounding to an integer first would always print ".0").
        explained = eig_val**2 / np.sum(eig_val**2) * 100
        self.axis.set_xlabel(f"P.C. 1 ({explained[0]:.1f}% explained variation)")
        self.axis.set_ylabel(f"P.C. 2 ({explained[1]:.1f}% explained variation)")
        self.axis.set_xlim(-scales[0] * 1.1, scales[0] * 1.1)
        self.axis.set_ylim(-scales[0] * 1.1, scales[0] * 1.1)
        self.axis.axhline(0, linestyle="--", color="black", alpha=0.4)
        self.axis.axvline(0, linestyle="--", color="black", alpha=0.4)

        # Stretch each score component so its extreme matches the loadings.
        self.scores = (scales[0] * (self.scores.T / scales[1])).T

        self.patches = []
        self.geomobj = {}
        self.clusterlegend = {}
        plt.tight_layout()

        if default:
            self.plotloadings()
            self.plotscores()

    def plotloadings(self, cutoff=0, scale=None, labels=None, cluster=False):
        """Plot loadings as arrows from the origin and label them.

        cutoff  -- when positive, drop loadings whose length is not greater
                   than ``cutoff * scale``
        scale   -- reference length; defaults to the largest absolute loading
        labels  -- columns of ``self.loadings`` to draw; defaults to all
        cluster -- bundle similar loadings with ``_bundle_loadings`` and store
                   the resulting legend in ``self.clusterlegend``
        """
        if scale is None:
            scale = np.max(np.abs(self.loadings.values))

        if labels is None:
            labels = self.loadings.columns
        loadings = self.loadings[labels]

        if cutoff > 0:
            length = np.sqrt(loadings.loc["pc1"] ** 2 + loadings.loc["pc2"] ** 2)
            loadings = loadings.loc[:, length > cutoff * scale]

        if cluster:
            self.clusterlegend, loadings = _bundle_loadings(loadings)

        for column in loadings:
            self.axis.arrow(
                0,
                0,
                loadings.loc["pc1", column],
                loadings.loc["pc2", column],
                facecolor="black",
                alpha=0.5,
                linewidth=0.0,
                width=scale * 0.01,
                zorder=2000,
            )
        self.plotloadinglabels(loadings.columns, loadings, cutoff)

    def plotloadinglabels(self, labels=None, loadings=None, cutoff=0):
        """Add labels to the loadings.

        A label (truncated to 30 characters) is placed at the tip of every
        loading longer than ``cutoff``.  A label that sticks out past the
        right x-limit is shifted left.
        """
        if loadings is None:
            loadings = self.loadings
        if labels is None:
            labels = loadings.columns

        fig = self.axis.get_figure()
        renderer = fig.canvas.get_renderer()
        for column in labels:
            xpos = loadings.loc["pc1", column]
            ypos = loadings.loc["pc2", column]
            if not np.sqrt(xpos**2 + ypos**2) > cutoff:
                continue

            ann = self.axis.annotate(
                str(column)[:30],
                (xpos, ypos),
                ha="left",
                va="bottom",
                alpha=0.95,
                color="black",
                zorder=5001,
            )

            # The text extent is only known after drawing; convert it from
            # window to data coordinates to compare with the axis limits.
            fig.canvas.draw()
            bbox = ann.get_window_extent(renderer=renderer)
            transform = self.axis.transData.inverted()
            tbox = TransformedBbox(bbox, transform).corners()
            if tbox[2][0] > self.axis.get_xlim()[1]:
                ann._x -= tbox[2][0] - self.axis.get_xlim()[1]
                fig.canvas.draw()
                bbox = ann.get_window_extent(renderer=renderer)
                transform = self.axis.transData.inverted()
                tbox = TransformedBbox(bbox, transform).corners()
                # Second pass corrects whatever overshoot is left.
                ann._x += self.axis.get_xlim()[1] - tbox[2][0]

    def adjustloadinglabels(self):
        """Adjust loading label positions, so that labels don't overlap.

        Every pair of texts on the axes is compared; overlapping pairs are
        pushed apart vertically by a third of their respective heights.
        """
        fig = self.axis.get_figure()
        fig.canvas.draw()
        renderer = fig.canvas.get_renderer()
        texts = self.axis.texts
        for idx, label in enumerate(texts[:-1]):
            bbox = label.get_window_extent(renderer=renderer)
            transform = self.axis.transData.inverted()
            tbox = TransformedBbox(bbox, transform).corners()
            for other in texts[idx + 1 :]:
                lbox = other.get_window_extent(renderer=renderer)
                transform = self.axis.transData.inverted()
                ttbox = TransformedBbox(lbox, transform).corners()
                if _do_bbox_overlap(tbox, ttbox) or _do_bbox_overlap(ttbox, tbox):
                    height = (tbox[1][1] - tbox[0][1]) / 3.0
                    other_height = (ttbox[1][1] - ttbox[0][1]) / 3.0
                    # Move the lower label down and the upper label up.
                    if label._y < other._y:
                        label._y -= height
                        other._y += other_height
                    else:
                        label._y += height
                        other._y -= other_height
                    # NOTE(review): placement of this redraw inside the
                    # overlap branch is reconstructed -- confirm.
                    fig.canvas.draw()

    def plotscores(self, group=None, palette=None, legend=True, labels=None):
        """Plot scores as points.

        Without ``group`` the samples in ``labels`` (default: all) are drawn
        in the single colour ``palette`` (default ``"steelblue"``).  With
        ``group`` (a Series indexed by sample) every group is drawn in
        ``palette[group value]`` and, if ``legend`` is true, a legend handle
        is recorded in ``self.patches``.
        """
        if labels is None:
            labels = self.scores.columns

        if palette is None:
            palette = "steelblue" if group is None else _get_palette(group)

        if group is None:
            self.axis.plot(
                *self.scores[labels].values,
                "o",
                alpha=0.5,
                color=palette,
                zorder=7,
                markeredgewidth=0,
            )
            return

        for item in set(group):
            idx = group.loc[group == item].index
            self.axis.plot(
                *self.scores[idx].values,
                "o",
                alpha=0.5,
                zorder=7,
                label=item,
                color=palette[item],
                markeredgewidth=0,
            )
            if legend:
                self.patches.append(mpatches.Patch(color=palette[item], label=item))

    def plotscorelabels(self, labels=None):
        """Add labels to the scores (default: every sample)."""
        if labels is None:
            labels = self.scores.columns

        for label in labels:
            self.axis.annotate(
                label,
                (self.scores.loc["pc1", label], self.scores.loc["pc2", label]),
                ha="left",
                va="bottom",
                alpha=0.8,
                zorder=201,
                size=8,
            )

    def plotellipses(self, group, palette=None, legend=False):
        """Plot a 90% covariance ellipse for every group with more than three samples."""
        if palette is None:
            palette = _get_palette(group)

        for item in set(group):
            idx = group.loc[group == item].index
            if len(idx) > 3:
                ellipse = extra.get_covariance_ellipse(
                    pd.DataFrame(self.scores[idx].values.T), conf=90
                )
                extra.plot_covariance_ellipse(self.axis, ellipse, color=palette[item])
            # NOTE(review): nesting reconstructed -- the legend handle is added
            # for every group, as in plotscores/plotcentroids; confirm.
            if legend:
                self.patches.append(mpatches.Patch(color=palette[item], label=item))

    def plotcentroids(self, group, palette=None, legend=False):
        """Plot score group centroids as crosses."""
        if palette is None:
            palette = _get_palette(group)

        for item in set(group):
            idx = group.loc[group == item].index
            mean_x = self.scores.loc["pc1", idx].mean()
            mean_y = self.scores.loc["pc2", idx].mean()
            self.axis.plot(
                [mean_x],
                [mean_y],
                "x",
                alpha=0.7,
                label=item,
                color=palette[item],
                markersize=24,
            )
            if legend:
                self.patches.append(mpatches.Patch(color=palette[item], label=item))

    def plothulls(self, group, palette=None, legend=True):
        """Plot score group hulls.

        A hull is built for every group with more than three samples and
        stored in ``self.geomobj``.  Hulls are drawn largest first so that
        smaller ones end up on top.
        """
        if palette is None:
            palette = _get_palette(group)

        self.geomobj = {}
        for item in set(group):
            idx = group.loc[group == item].index
            if len(idx) > 3:
                members = self.scores[idx]
                xval = members.iloc[0].to_numpy()
                yval = members.iloc[1].to_numpy()
                # Start at the sample with the smallest pc1 value and wrap
                # around the point set until the start is reached again.
                start = members.columns.get_loc(
                    self.scores.loc["pc1", idx].idxmin()
                )
                j = start
                hull = [[xval[j], yval[j]]]
                while j != start or len(hull) == 1:
                    k = (j + 1) % len(idx)
                    for i in range(len(idx)):
                        # Sign of the cross product tells on which side of
                        # the edge j -> k the candidate point i lies.
                        turn = (yval[k] - yval[j]) * (xval[i] - xval[k]) - (
                            xval[k] - xval[j]
                        ) * (yval[i] - yval[k])
                        if turn < 0:
                            k = i
                    j = k
                    hull.append([xval[k], yval[k]])
                self.geomobj[item] = GeomObj(vertices=tuple(map(list, zip(*hull))))

        by_area = sorted(self.geomobj, key=lambda x: self.geomobj[x].area, reverse=True)
        for rank, item in enumerate(by_area):
            self.axis.fill(
                *self.geomobj[item].vertices,
                color=palette[item],
                alpha=0.7,
                zorder=10 + (2 * rank),
            )
            self.axis.fill(
                *self.geomobj[item].vertices,
                facecolor="none",
                edgecolor="black",
                alpha=0.9,
                linewidth=2.2,
                zorder=11 + (2 * rank),
            )

            if legend:
                self.patches.append(mpatches.Patch(color=palette[item], label=item))

    def plotcontours(
        self,
        group,
        palette=None,
        legend=True,
        plot_outliers=True,
        percent_outliers=0.1,
        linewidth=2.2,
    ):
        """Plot scores as filled density contours, one set per group.

        For every group a Gaussian kernel density is evaluated on a 300 x 300
        grid spanning the axis limits.  The lowest contour level is adjusted
        (at most 25 times) until the number of samples outside it equals
        ``round(percent_outliers * group size)``.

        palette          -- group value -> colour: a hex string, a colour
                            name, or a numeric RGB(A) sequence
        plot_outliers    -- draw the samples outside the lowest contour
        percent_outliers -- fraction (0..1) of samples to leave outside
        linewidth        -- width of the contour outlines

        Raises ValueError if ``percent_outliers`` is outside [0, 1].
        """
        if palette is None and group is not None:
            palette = _get_palette(group)
        if percent_outliers > 1 or percent_outliers < 0:
            raise ValueError("Percent_outliers has to be between 0 and 1")

        # Build color maps: four shades from near white to the group colour
        # with decreasing opacity.
        cmap = {}
        for item in set(group):
            entry = palette[item]
            if isinstance(entry, str):
                if "#" in entry:
                    rgb = wc.hex_to_rgb(entry)
                else:
                    rgb = wc.name_to_rgb(entry)
                color = [channel / 256.0 for channel in rgb[:3]]
            else:
                color = [float(channel) for channel in entry[:3]]
                # Numeric entries in the 0..1 range (such as the default
                # palette) are already fractions; only rescale byte values.
                if max(color) > 1:
                    color = [channel / 256.0 for channel in color]

            colorvalues = np.ones((4, 4))
            for i in range(3):
                colorvalues[:, i] = np.linspace(1, color[i], 5)[1:]
            colorvalues[:, 3] = np.linspace(0.95, 0.25, 4)
            cmap[item] = ListedColormap(colorvalues)

        self.geomobj = {}
        for item in set(group):
            minlevel = 0.2
            diff = 100
            k = 0
            xgrid, ygrid = np.mgrid[
                self.axis.get_xlim()[0] : self.axis.get_xlim()[1] : 300j,
                self.axis.get_ylim()[0] : self.axis.get_ylim()[1] : 300j,
            ]
            positions = np.vstack([xgrid.ravel(), ygrid.ravel()])

            # The density and the sample points do not depend on the level
            # being searched for, so they are evaluated once per group.
            idx = group.loc[group == item].index
            values = np.vstack(
                [self.scores.loc["pc1", idx], self.scores.loc["pc2", idx]]
            )
            kernel = st.gaussian_kde(values)
            density = np.reshape(kernel(positions).T, xgrid.shape)
            points = [
                [self.scores.loc["pc1", i], self.scores.loc["pc2", i]] for i in idx
            ]

            while abs(diff) > 0 and k < 25:
                levels = np.arange(5) * (1.0 - minlevel) / 4.0 + minlevel
                vals = np.max(density) * levels
                # NOTE(review): the indexing below expects the contour call to
                # append one entry per level (five) to axis.collections --
                # confirm against the matplotlib version in use.
                self.axis.contour(xgrid, ygrid, density, vals)
                vertices = self.axis.collections[-4].get_paths()[0].vertices.T
                contained = [False] * len(idx)
                for path in self.axis.collections[-5].get_paths():
                    contained = np.logical_or(contained, path.contains_points(points))
                # Drop the five temporary contour collections again.
                for _ in range(5):
                    self.axis.collections[-1].remove()
                outside = [a for a, b in zip(list(idx), contained) if not b]

                diff = round(percent_outliers * len(idx)) - len(outside)
                minlevel = minlevel + diff / 1000.0
                k += 1

            self.geomobj[item] = GeomObj(
                vertices=vertices,
                grid=(xgrid, ygrid),
                density=density,
                values=vals,
                outside=outside,
            )

        by_area = sorted(self.geomobj, key=lambda x: self.geomobj[x].area, reverse=True)
        for rank, item in enumerate(by_area):
            self.axis.contourf(
                *self.geomobj[item].grid,
                self.geomobj[item].density,
                self.geomobj[item].values,
                antialiased=True,
                cmap=cmap[item],
                alpha=0.9,
                zorder=10 + (2 * rank),
            )
            self.axis.contour(
                *self.geomobj[item].grid,
                self.geomobj[item].density,
                self.geomobj[item].values,
                antialiased=True,
                colors="black",
                alpha=0.5,
                linewidths=linewidth,
                zorder=11 + (2 * rank),
            )
            self.axis.collections[-1].remove()

            if plot_outliers:
                self.plotscores(None, palette[item], False, self.geomobj[item].outside)

            if legend:
                self.patches.append(mpatches.Patch(color=palette[item], label=item))

    def labeloutliers(self, group, conf=3.0):
        """Print labels on scores that are more than conf away from centroid.

        For every group the distance of each sample to the group centroid is
        computed; samples whose distance exceeds ``conf`` times the standard
        deviation of those distances are labelled.
        """
        for item in set(group):
            idx = group.loc[group == item].index
            xval = self.scores.loc["pc1", idx]
            yval = self.scores.loc["pc2", idx]
            dist = np.sqrt((xval - xval.mean()) ** 2 + (yval - yval.mean()) ** 2)
            # Work on a plain array: numpy cannot reduce a dict view.
            std = np.std(dist.to_numpy())

            outliers = [label for label, value in dist.items() if value > conf * std]
            self.plotscorelabels(outliers)

    def displaylegend(self, loc=2):
        """Display the item legend at location loc.

        If any text sits in the upper-left part of the axes, ``loc`` is
        increased by one before the legend is drawn.
        """
        patches = sorted(self.patches, key=lambda x: x.get_label())
        for text in self.axis.texts:
            xpos, ypos = text.get_position()
            if (
                xpos < self.axis.get_xlim()[0] * 1 / 3.0
                and ypos > self.axis.get_ylim()[1] * 2 / 3.0
            ):
                loc += 1
                break
        self.axis.legend(handles=patches, fontsize=9, frameon=False, loc=loc)

    def remove(self, item):
        """Remove elements from plot.

        ``item`` is one of ``"patches"``, ``"labels"``, ``"scores"`` or
        ``"contours"`` and selects the axes' patches, texts, lines or
        collections respectively.  Raises ValueError for anything else.
        """
        table = {
            "patches": self.axis.patches,
            "labels": self.axis.texts,
            "scores": self.axis.lines,
            "contours": self.axis.collections,
        }
        if item not in table:
            raise ValueError(f"Unknown item '{item}'. Choose from: {list(table)}")
        # Iterate over a snapshot, since removing an artist changes the
        # container being read.
        for artist in list(table[item]):
            artist.remove()
@@ -5,7 +5,7 @@ __author__ = "Christian Brinch"
5
5
  __copyright__ = "Copyright 2019-2026"
6
6
  __credits__ = ["Christian Brinch"]
7
7
  __license__ = "AFL 3.0"
8
- __version__ = "1.1.2"
8
+ __version__ = "1.1"
9
9
  __maintainer__ = "Christian Brinch"
10
10
  __email__ = "cbri@food.dtu.dk"
11
11
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycodamath
3
- Version: 1.1.2
3
+ Version: 1.1.3
4
4
  Summary: Compositional data (CoDa) analysis tools for Python
5
5
  Author-email: Christian Brinch <cbri@food.dtu.dk>
6
6
  License: MIT
@@ -285,9 +285,9 @@ The keyword `palette` is a dict mapping each unique group value to a colour.
285
285
  import pandas as pd
286
286
  data = pd.read_csv('example/kilauea_iki_chem.csv')
287
287
  mypca = coda.pca.Biplot(data)
288
- mypca.removelabels()
288
+ mypca.remove("labels")
289
289
  mypca.plotloadings(cluster=True)
290
290
  print(mypca.clusterlegend)
291
- mypca.removelabels()
291
+ mypca.remove("labels")
292
292
  mypca.plotloadings(labels=['FeO', 'Al2O3', 'CaO'], cluster=False)
293
293
  mypca.adjustloadinglabels()
@@ -1,456 +0,0 @@
1
- ''' Class and methods for making compositional biplots based on PCA '''
2
-
3
- __author__ = "Christian Brinch"
4
- __copyright__ = "Copyright 2019"
5
- __credits__ = ["Christian Brinch"]
6
- __license__ = "AFL 3.0"
7
- __version__ = "1.1"
8
- __maintainer__ = "Christian Brinch"
9
- __email__ = "cbri@food.dtu.dk"
10
-
11
- import numpy as np
12
- import matplotlib.pyplot as plt
13
- import webcolors as wc
14
- from matplotlib.colors import ListedColormap
15
- from matplotlib import cm
16
- from matplotlib.transforms import TransformedBbox
17
- import matplotlib.patches as mpatches
18
- import pandas as pd
19
- import scipy.stats as st
20
- from scipy.cluster.hierarchy import linkage, cut_tree
21
- from pycodamath import extra
22
-
23
-
24
- class GeomObj():
25
- ''' A generic container of geometric objects '''
26
-
27
- def __init__(self, **kwargs):
28
- vars(self).update(kwargs)
29
- self.area = self.polyarea()
30
-
31
- def polyarea(self):
32
- ''' Calculate the area of a polygon given two lists of vertices '''
33
- x, y = self.vertices
34
- return 0.5*np.abs(np.dot(x, np.roll(y, 1))-np.dot(y, np.roll(x, 1)))
35
-
36
-
37
- def scree_plot(axis, eig_val):
38
- ''' Make scree plot from eigen values'''
39
- axis.set_xlabel('Component')
40
- axis.set_ylabel('Explained variance')
41
- axis.set_xlim(0, min(len(eig_val)+1, 20))
42
- axis.bar(np.arange(len(eig_val))+1, (eig_val/np.sum(eig_val))**2)
43
- csum = np.cumsum(eig_val**2/np.sum(eig_val**2))
44
- for i in range(min(5, len(eig_val))):
45
- axis.annotate(str(np.round(csum[i]*100))+'%',
46
- (i+1.2, (eig_val[i]/np.sum(eig_val))**2))
47
-
48
-
49
- def _get_palette(group):
50
- cspace = cm.jet(np.linspace(0, 1, len(set(group))))
51
- return {item: cspace[idx] for idx, item in enumerate(set(group))}
52
-
53
-
54
- def _svd(clr):
55
- ''' Internal SVD function '''
56
- scores, eig_val, loadings = np.linalg.svd(clr)
57
- scores = pd.DataFrame(
58
- scores.T[0:2, :], columns=clr.index, index=['pc1', 'pc2'])
59
- loadings = pd.DataFrame(np.inner(eig_val*np.identity(len(eig_val)),
60
- loadings.T[0:len(eig_val), 0:len(eig_val)])[0:2],
61
- columns=clr.columns[0:len(eig_val)], index=['pc1', 'pc2'])
62
- return scores, eig_val, loadings
63
-
64
-
65
- def _bundle_loadings(loadings, threshold=0.05):
66
- ''' Reduce the number of loadings by hierachical clustering '''
67
- tree = linkage(loadings.T, 'ward')
68
- stump = cut_tree(tree, height=tree[-1][2]*threshold)
69
- clusters = pd.DataFrame([], index=loadings.index)
70
- legend = {}
71
- counter = 1
72
- stump_flat = np.concatenate(stump)
73
- for i in set(stump_flat):
74
- features = np.where(stump_flat == i)
75
- if len(features[0]) > 1:
76
- clusters[counter] = loadings[[loadings.columns[j]
77
- for j in features[0]]].mean(axis=1)
78
- legend[counter] = [loadings.columns[j] for j in features[0]]
79
- counter += 1
80
- else:
81
- clusters[loadings.columns[features[0]]
82
- ] = loadings[loadings.columns[features[0]]]
83
-
84
- return legend, clusters
85
-
86
-
87
- def _do_bbox_overlap(box1, box2):
88
- # If one rectangle is on left side of other
89
- return not(box1[1][0] > box2[2][0] or box2[1][0] > box1[2][0]
90
- or box1[3][1] > box2[1][1] or box2[2][1] > box1[1][1])
91
-
92
-
93
- class Biplot():
94
- ''' A class to create and a PCA biplot '''
95
-
96
- def __init__(self, data, axis=None, default=True):
97
- if axis is None:
98
- _, self.axis = plt.subplots(figsize=(7.8, 7.8))
99
- else:
100
- self.axis = axis
101
- self.axis.set(adjustable='box', aspect='equal')
102
- self.scores, eig_val, self.loadings = _svd(
103
- data.coda.center().coda.scale().coda.clr())
104
-
105
- scales = [np.max(np.abs(self.loadings.values)),
106
- [np.max(np.abs(self.scores.loc[idx].values)) for idx in ['pc1', 'pc2']]]
107
-
108
- self.axis.set_xlabel(
109
- f'P.C. 1 ({np.round(eig_val[0]**2 / np.sum(eig_val**2)*100):.1f}% explained variation)')
110
- self.axis.set_ylabel(
111
- f'P.C. 2 ({np.round(eig_val[1]**2 / np.sum(eig_val**2)*100):.1f}% explained variation)')
112
- self.axis.set_xlim(-scales[0]*1.1, scales[0]*1.1)
113
- self.axis.set_ylim(-scales[0]*1.1, scales[0]*1.1)
114
- self.axis.plot([self.axis.get_xlim()[0], self.axis.get_xlim()[1]],
115
- [0.0, 0.0], '--', color='black', alpha=0.4)
116
- self.axis.plot([0.0, 0.0], [self.axis.get_ylim()[0], self.axis.get_ylim()[1]],
117
- '--', color='black', alpha=0.4)
118
-
119
- self.scores = (scales[0]*(self.scores.T/scales[1])).T
120
-
121
- self.patches = []
122
- self.geomobj = {}
123
- plt.tight_layout()
124
-
125
- if default:
126
- self.plotloadings()
127
- self.plotscores()
128
-
129
- def plotloadings(self, cutoff=0, scale=None, labels=None, cluster=False):
130
- ''' Plot loadings '''
131
- if scale is None:
132
- scale = np.max(np.abs(self.loadings.values))
133
-
134
- if labels is None:
135
- labels = self.loadings.columns
136
- loadings = self.loadings[labels]
137
-
138
- if cutoff > 0:
139
- loadings.loc['len'] = np.sqrt(loadings.loc['pc1']**2 + loadings.loc['pc2']**2)
140
- loadings = loadings.T[loadings.T['len'] > cutoff*scale].T
141
- loadings = loadings.drop('len')
142
-
143
- if cluster:
144
- legend, loadings = _bundle_loadings(loadings)
145
-
146
- # Uncomment this part to print the cluster legend in the figure
147
- # x = self.axis.get_xlim()[1]*1.05
148
- # y = self.axis.get_ylim()[1]*0.95
149
- # for key in legend.keys():
150
- # # plt.text(x, y, str(key)+":", fontsize=10)
151
- # print(str(key)+":", end="")
152
- # for item in legend[key]:
153
- # if len(item) > 22:
154
- # item = item[:22]+'...'
155
- # # plt.text(x, y, " "+str(item), fontsize=10)
156
- # # y -= (self.axis.get_ylim()[1]-self.axis.get_ylim()[0])/50.
157
- # print(" "+str(item))
158
-
159
- for column in loadings:
160
- self.axis.arrow(0, 0,
161
- loadings.loc['pc1', column],
162
- loadings.loc['pc2', column],
163
- facecolor='black',
164
- alpha=0.5,
165
- linewidth=0.,
166
- width=scale*0.01,
167
- zorder=2000)
168
- self.plotloadinglabels(loadings.columns, loadings, cutoff)
169
- if cluster:
170
- self.clusterlegend = legend
171
-
172
- def plotloadinglabels(self, labels=None, loadings=None, cutoff=0):
173
- ''' Add labels to the loadings '''
174
- if loadings is None:
175
- loadings = self.loadings
176
- if labels is None:
177
- labels = loadings.columns
178
-
179
- for column in labels:
180
- if np.sqrt(loadings.loc['pc1', column]**2 +
181
- loadings.loc['pc2', column]**2) > cutoff:
182
-
183
- ann = self.axis.annotate(str(column)[:30], (loadings.loc['pc1', column],
184
- loadings.loc['pc2', column]),
185
- ha='left',
186
- va='bottom',
187
- alpha=0.95,
188
- color='black',
189
- zorder=5001
190
- )
191
- fig = self.axis.get_figure()
192
- fig.canvas.draw()
193
- bbox = ann.get_window_extent(
194
- renderer=fig.canvas.get_renderer())
195
- transform = self.axis.transData.inverted()
196
- tbox = TransformedBbox(bbox, transform).corners()
197
- if tbox[2][0] > self.axis.get_xlim()[1]:
198
- self.axis.texts[-1]._x -= (tbox[2]
199
- [0]-self.axis.get_xlim()[1])
200
- fig.canvas.draw()
201
- bbox = self.axis.texts[-1].get_window_extent(
202
- renderer=fig.canvas.get_renderer())
203
- transform = self.axis.transData.inverted()
204
- tbox = TransformedBbox(bbox, transform).corners()
205
- self.axis.texts[-1]._x += (self.axis.get_xlim()
206
- [1]-tbox[2][0])
207
-
208
- def adjustloadinglabels(self):
209
- ''' Adjust loading label positions, so that labels don't overlap. '''
210
- fig = self.axis.get_figure()
211
- fig.canvas.draw()
212
- for idx, label in enumerate(self.axis.texts[:-1]):
213
- bbox = label.get_window_extent(renderer=fig.canvas.get_renderer())
214
- transform = self.axis.transData.inverted()
215
- tbox = TransformedBbox(bbox, transform).corners()
216
- for tidx, testlabel in enumerate(self.axis.texts[idx+1:]):
217
- lbox = testlabel.get_window_extent(
218
- renderer=fig.canvas.get_renderer())
219
- transform = self.axis.transData.inverted()
220
- ttbox = TransformedBbox(lbox, transform).corners()
221
- if _do_bbox_overlap(tbox, ttbox) or _do_bbox_overlap(ttbox, tbox):
222
- if self.axis.texts[idx]._y < self.axis.texts[idx+tidx+1]._y:
223
- self.axis.texts[idx]._y -= (tbox[1][1]-tbox[0][1])/3.
224
- self.axis.texts[idx+tidx+1]._y += (ttbox[1]
225
- [1]-ttbox[0][1])/3.
226
- else:
227
- self.axis.texts[idx]._y += (tbox[1][1]-tbox[0][1])/3.
228
- self.axis.texts[idx+tidx+1]._y -= (ttbox[1]
229
- [1]-ttbox[0][1])/3.
230
- fig.canvas.draw()
231
-
232
- def plotscores(self, group=None, palette=None, legend=True, labels=None):
233
- ''' Plot scores as points '''
234
- if labels is None:
235
- labels = self.scores.columns
236
-
237
- if palette is None:
238
- if group is not None:
239
- palette = _get_palette(group)
240
- else:
241
- palette = 'steelblue'
242
-
243
- if group is None:
244
- self.axis.plot(*self.scores[labels].values, 'o', alpha=0.5,
245
- color=palette, zorder=7, markeredgewidth=0)
246
- else:
247
- for item in set(group):
248
- idx = group.loc[group == item].index
249
- self.axis.plot(*self.scores[idx].values, 'o', alpha=0.5, zorder=7,
250
- label=item, color=palette[item], markeredgewidth=0)
251
- if legend:
252
- self.patches.append(mpatches.Patch(
253
- color=palette[item], label=item))
254
-
255
- def plotscorelabels(self, labels=None):
256
- ''' Add labels to the scores '''
257
- if labels is None:
258
- labels = self.scores.columns
259
-
260
- for label in labels:
261
- self.axis.annotate(label, (self.scores.loc['pc1', label],
262
- self.scores.loc['pc2', label]),
263
- ha='left',
264
- va='bottom',
265
- alpha=0.8,
266
- zorder=201,
267
- size=8
268
- )
269
-
270
- def plotellipses(self, group, palette=None, legend=False):
271
- ''' Plot confidence ellipses '''
272
- if palette is None:
273
- palette = _get_palette(group)
274
-
275
- for item in set(group):
276
- idx = group.loc[group == item].index
277
- if len(idx) > 3:
278
- ellipse = extra.get_covariance_ellipse(pd.DataFrame(self.scores[idx].values.T),
279
- conf=90)
280
- extra.plot_covariance_ellipse(
281
- self.axis, ellipse, color=palette[item])
282
- if legend:
283
- self.patches.append(mpatches.Patch(
284
- color=palette[item], label=item))
285
-
286
def plotcentroids(self, group, palette=None, legend=False):
    ''' Mark the centroid of each group of scores with a cross

    group:   pandas Series mapping score column labels to a group value.
    palette: dict-like mapping each group value to a colour; built with
             _get_palette(group) when omitted.
    legend:  when True, append a legend patch per group to self.patches.
    '''
    colours = _get_palette(group) if palette is None else palette

    for member in set(group):
        columns = group.loc[group == member].index

        # Centroid = mean of the member scores along each component.
        centre_x = self.scores.loc['pc1', columns].mean()
        centre_y = self.scores.loc['pc2', columns].mean()

        self.axis.plot([centre_x], [centre_y], 'x', alpha=0.7,
                       label=member, color=colours[member], markersize=24)

        if legend:
            entry = mpatches.Patch(color=colours[member], label=member)
            self.patches.append(entry)
300
-
301
def plothulls(self, group, palette=None, legend=True):
    ''' Draw a filled hull around each group of scores

    group:   pandas Series mapping score column labels to a group value.
    palette: dict-like mapping each group value to a colour; built with
             _get_palette(group) when omitted.
    legend:  when True, append a legend patch per drawn hull to self.patches.

    Hulls are stored as GeomObj instances in self.geomobj, keyed by group
    value. Groups with three or fewer scores get no hull.
    '''
    colours = _get_palette(group) if palette is None else palette

    self.geomobj = {}
    for member in set(group):
        columns = group.loc[group == member].index
        if len(columns) <= 3:
            continue

        points = self.scores[columns]     # this group's scores only
        count = len(columns)

        # Start from the score with the smallest pc1 value.
        first_label = self.scores.loc['pc1', columns].idxmin()
        first = points.columns.get_loc(first_label)
        outline = [list(self.scores[first_label])]

        # Walk from vertex to vertex until we are back at the start.
        current = first
        while True:
            candidate = (current + 1) % count
            for probe in range(count):
                # Sign of the cross product tells on which side of the
                # current->candidate edge the probed score lies.
                turn = ((points.iloc[1, candidate] - points.iloc[1, current])
                        * (points.iloc[0, probe] - points.iloc[0, candidate])
                        - (points.iloc[0, candidate] - points.iloc[0, current])
                        * (points.iloc[1, probe] - points.iloc[1, candidate]))
                if turn < 0:
                    candidate = probe
            current = candidate
            outline.append(list(self.scores[points.columns[candidate]]))
            if current == first:
                break

        # Transpose the list of points into one list per component.
        transposed = tuple(list(component) for component in zip(*outline))
        self.geomobj[member] = GeomObj(vertices=transposed)

    # Largest hulls are drawn first so that smaller ones end up on top.
    by_area = sorted(self.geomobj,
                     key=lambda name: self.geomobj[name].area, reverse=True)
    for layer, member in enumerate(by_area):
        base = 10 + (2 * layer)
        self.axis.fill(*self.geomobj[member].vertices,
                       color=colours[member], alpha=0.7, zorder=base)
        self.axis.fill(*self.geomobj[member].vertices, facecolor='none',
                       edgecolor='black', alpha=0.9, linewidth=2.2,
                       zorder=base + 1)

        if legend:
            entry = mpatches.Patch(color=colours[member], label=member)
            self.patches.append(entry)
337
-
338
def plotcontours(self, group, palette=None, legend=True,
                 plot_outliers=True, percent_outliers=0.1, linewidth=2.2):
    ''' Plot scores as contours

    group:            pandas Series mapping score column labels to a group
                      value.
    palette:          dict-like mapping each group value to a colour (hex
                      string, colour name, or a sequence); built with
                      _get_palette(group) when omitted.
    legend:           when True, append a legend patch per group to
                      self.patches.
    plot_outliers:    when True, scores recorded as outside the contours
                      are drawn individually via self.plotscores.
    percent_outliers: fraction (0..1) of a group's scores that the level
                      search tries to leave outside the lowest contour.
    linewidth:        width of the black contour outlines.

    Raises ValueError when percent_outliers is not within [0, 1].
    '''
    # Fail fast on a bad fraction before any palette or density work is done.
    if percent_outliers > 1 or percent_outliers < 0:
        raise ValueError('Percent_outliers has to be between 0 and 1')
    if palette is None and group is not None:
        palette = _get_palette(group)

    # Build color maps: four shades per group fading from white towards the
    # group colour, with decreasing opacity.
    cmap = {}
    for item in set(group):
        colorvalues = np.ones((4, 4))
        if '#' in str(palette[item]):
            color = wc.hex_to_rgb(palette[item])
        elif palette[item][-1] != 1:
            color = wc.name_to_rgb(palette[item])
        else:
            # NOTE(review): the value is used as-is and still divided by
            # 256 below -- confirm the expected range of such entries.
            color = palette[item]

        for i in range(3):
            colorvalues[:, i] = np.linspace(1, color[i]/256., 5)[1:]
        colorvalues[:, 3] = np.linspace(.95, .25, 4)
        cmap[item] = ListedColormap(colorvalues)

    self.geomobj = {}
    for item in set(group):
        idx = group.loc[group == item].index
        minlevel = 0.2
        diff = 100
        k = 0
        # Nudge the lowest contour level until the number of scores outside
        # it matches the requested fraction, giving up after 25 attempts.
        while abs(diff) > 0 and k < 25:
            levels = np.arange(5)*(1.-minlevel)/4.+minlevel
            xgrid, ygrid = np.mgrid[self.axis.get_xlim()[0]: self.axis.get_xlim()[1]: 300j,
                                    self.axis.get_ylim()[0]: self.axis.get_ylim()[1]: 300j]
            positions = np.vstack([xgrid.ravel(), ygrid.ravel()])
            values = np.vstack(
                [self.scores.loc['pc1', idx], self.scores.loc['pc2', idx]])
            kernel = st.gaussian_kde(values)
            density = np.reshape(kernel(positions).T, xgrid.shape)
            vals = np.max(density)*levels

            # Temporary contour, used only to read back its paths.
            # NOTE(review): the [-4]/[-5] indexing and the five removals
            # below assume contour() appends one collection per level to
            # axis.collections -- confirm against the matplotlib version in
            # use.
            self.axis.contour(xgrid, ygrid, density, vals)
            vertices = self.axis.collections[-4].get_paths()[0].vertices.T
            contained = [False] * len(idx)
            for j in range(len(self.axis.collections[-5].get_paths())):
                contained = np.logical_or(contained,
                                          self.axis.collections[-5].get_paths()[j].contains_points(
                                              [[self.scores.loc['pc1', i],
                                                self.scores.loc['pc2', i]] for i in idx]))
            for _ in range(5):
                self.axis.collections[-1].remove()
            outside = [a for a, b in zip(list(idx), contained) if not b]

            diff = round(percent_outliers*len(idx))-len(outside)
            minlevel = minlevel+diff/1000.
            k += 1

        self.geomobj[item] = GeomObj(vertices=vertices, grid=(
            xgrid, ygrid), density=density, values=vals, outside=outside)

    # Largest areas are drawn first so that smaller ones end up on top.
    for layer, item in enumerate(sorted(self.geomobj,
                                        key=lambda x: self.geomobj[x].area, reverse=True)):
        self.axis.contourf(*self.geomobj[item].grid, self.geomobj[item].density,
                           self.geomobj[item].values, antialiased=True,
                           cmap=cmap[item], alpha=0.9, zorder=10+(2*layer))
        self.axis.contour(*self.geomobj[item].grid, self.geomobj[item].density,
                          self.geomobj[item].values, antialiased=True,
                          colors='black', alpha=0.5, linewidths=linewidth, zorder=11+(2*layer))
        self.axis.collections[-1].remove()

        if plot_outliers:
            self.plotscores(
                None, palette[item], False, self.geomobj[item].outside)

        if legend:
            self.patches.append(mpatches.Patch(
                color=palette[item], label=item))
413
-
414
def labeloutliers(self, group, conf=3.):
    ''' Print labels on scores that are more than conf away from centroid

    group: pandas Series mapping score column labels to a group value.
    conf:  threshold multiplier. A score is labelled when its Euclidean
           distance to its group centroid exceeds conf times the
           (population) standard deviation of the group's distances.

    Labelling is delegated to self.plotscorelabels, once per group.
    '''
    for item in set(group):
        idx = group.loc[group == item].index
        pc1 = self.scores.loc['pc1', idx]
        pc2 = self.scores.loc['pc2', idx]

        # Euclidean distance of every score to the group centroid. The
        # deviations must be squared before summing, otherwise the radicand
        # can turn negative and the distance becomes NaN.
        pdist = np.sqrt((pc1 - pc1.mean())**2 + (pc2 - pc2.mean())**2)

        # np.std needs an array; ddof=0 is numpy's default.
        std = np.std(pdist.to_numpy())

        outliers = [label for label, dist in pdist.items()
                    if dist > conf*std]
        self.plotscorelabels(outliers)
428
-
429
def displaylegend(self, loc=2):
    ''' Show the legend for the collected patches at location loc

    loc: matplotlib legend location code. It is bumped by one when any text
         on the axis sits left of a third of the lower x-limit and above
         two thirds of the upper y-limit.
    '''
    handles = sorted(self.patches, key=lambda patch: patch._label)

    # A single text in that corner region is enough to shift the legend.
    corner_taken = any(
        text._x < self.axis.get_xlim()[0]*1/3. and text._y > self.axis.get_ylim()[1]*2/3.
        for text in self.axis.texts)
    if corner_taken:
        loc += 1

    self.axis.legend(handles=handles, fontsize=9, frameon=False, loc=loc)
437
-
438
def removepatches(self):
    ''' Strip all patches (arrows and polygons) from the plot '''
    # Work on a snapshot, last artist first: remove() shrinks axis.patches.
    for artist in reversed(list(self.axis.patches)):
        artist.remove()
442
-
443
def removelabels(self):
    ''' Strip all text labels from the plot '''
    # Work on a snapshot, last artist first: remove() shrinks axis.texts.
    for artist in reversed(list(self.axis.texts)):
        artist.remove()
447
-
448
def removescores(self):
    ''' Strip all line artists (the plotted points) from the plot '''
    # Work on a snapshot, last artist first: remove() shrinks axis.lines.
    for artist in reversed(list(self.axis.lines)):
        artist.remove()
452
-
453
def removecontours(self):
    ''' Strip all collections (e.g. contours) from the plot '''
    # Work on a snapshot, last artist first: remove() shrinks
    # axis.collections.
    for artist in reversed(list(self.axis.collections)):
        artist.remove()
File without changes
File without changes