pyCoDaMath 1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyCoDaMath-1.0/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2018 The Python Packaging Authority
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,180 @@
1
+ Metadata-Version: 2.1
2
+ Name: pyCoDaMath
3
+ Version: 1.0
4
+ Summary: Compositional data (CoDa) analysis tools for Python
5
+ Author-email: Christian Brinch <cbri@food.dtu.dk>
6
+ Project-URL: Homepage, https://bitbucket.org/genomicepidemiology/pycodamath
7
+ Project-URL: Bug Tracker, https://bitbucket.org/genomicepidemiology/pycodamath/issues?status=new&status=open&is_spam=!spam
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+
15
+ # pyCoDaMath
16
+
17
+ [![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/)
18
+
19
+
20
+ pyCoDaMath provides compositional data (CoDa) analysis tools for Python
21
+
22
+ - **Source code:** https://bitbucket.org/genomicepidemiology/pycoda
23
+
24
+ ## Getting Started
25
+
26
+ This package extends the Pandas dataframe object with various CoDa tools. It also provides a set of plotting functions for CoDa figures.
27
+
28
+ ### Installation
29
+
30
+ Clone the git repo to your local hard drive:
31
+
32
+ git clone https://brinch@bitbucket.org/genomicepidemiology/pycoda.git
33
+
34
+ Enter pycoda directory and type
35
+
36
+ pip install ./
37
+
38
+ ### Usage
39
+
40
+ The pyCoDaMath module is loaded as
41
+
42
+ import pycodamath
43
+
44
+ At this point, in order to get CLR values from a Pandas DataFrame df, do
45
+
46
+ df.coda.clr()
47
+
48
+
49
+ ## Documentation
50
+
51
+ ### CLR transformation - point estimate
52
+ df.coda.clr()
53
+
54
+ Returns centered logratio coefficients. If the data frame contains zeros, values
55
+ will be replaced by the Aitchison mean point estimate.
56
+
57
+ ### CLR transformation - standard deviation
58
+ df.coda.clr_std(n_samples=5000)
59
+
60
+ Returns the standard deviation of n_samples random draws in CLR space.
61
+
62
+ **Parameters**
63
+
64
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
65
+
66
+
67
+ ### ALR transformation - point estimate
68
+ df.coda.alr(part=None)
69
+
70
+ Same as clr() but returning additive logratio values. If part is None, then the last part of the composition is used, otherwise part is used as denominator.
71
+
72
+ **Parameters**
73
+
74
+ - part (str) - Name of the part to be used as denominator.
75
+
76
+
77
+ ### ALR transformation - standard deviation
78
+ df.coda.alr_std(part=None, n_samples=5000)
79
+
80
+ Same as clr_std, but in ALR space.
81
+
82
+ **Parameters**
83
+
84
+ - part (str) - Name of the part to be used as denominator.
85
+
86
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
87
+
88
+
89
+ ### ILR transformation - point estimate
90
+ df.coda.ilr(psi=None)
91
+
92
+ Same as clr() but for isometric logratio transform. An orthonormal basis can be
93
+ provided as psi. If no basis is given, a default sequential binary partition basis will be used.
94
+
95
+ **Parameters**
96
+
97
+ - psi (array_like) - Orthonormal basis.
98
+
99
+ ### ILR transformation - standard deviation
100
+ df.coda.ilr_std(psi=None, n_samples=5000)
101
+
102
+ This method does not exist (yet).
103
+
104
+
105
+ ### Bayesian zero replacement
106
+ df.coda.zero_replacement(n_samples=5000)
107
+
108
+ Returns a count table with zero values replaced by finite values using Bayesian inference.
109
+
110
+ **Parameters**
111
+
112
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
113
+
114
+
115
+ ### Closure
116
+ df.coda.closure(N)
117
+
118
+ Apply closure to constant N to the composition.
119
+
120
+ **Parameters**
121
+
122
+ - N (int) - Closure constant.
123
+
124
+ ### Total variance
125
+ df.coda.totvar()
126
+
127
+ Calculates the total variance of a set of compositions.
128
+
129
+ ### Geometric mean
130
+ df.coda.gmean()
131
+
132
+ Calculates the geometric mean of a set of compositions.
133
+
134
+ ### Centering
135
+ df.coda.center()
136
+
137
+ Centers (and scales) the composition by dividing by the geometric mean and powering by the reciprocal variance.
138
+
139
+
140
+
141
+ ## Plotting functions
142
+
143
+ ### PCA biplot
144
+ class pycoda.pca.Biplot(data, default=True)
145
+
146
+ Plots a PCA biplot. Set default to False for an empty plot.
147
+ The parameter data (DataFrame) is the data to be analyzed. Use counts, not CLR values.
148
+
149
+ A number of methods are available for customizing the biplot:
150
+
151
+ - plotloadings(cutoff=0, scale=None, labels=None)
152
+ - plotloadinglabels(labels=None)
153
+ - plotscores(group=None, palette=None, legend=True, labels=None)
154
+ - plotscorelabels(labels=None)
155
+ - plotellipses(group=None, palette=None)
156
+ - plotcentroids(group=None, palette=None)
157
+ - plothulls(group=None, palette=None)
158
+ - plotcontours(group=None, palette=None, size=None, levels=None)
159
+ - removepatches()
160
+ - removescores()
161
+ - removelabels()
162
+
163
+ The keyword labels is a list of label names. If labels is None, all labels are plotted. Use labels=[] for no labels.
164
+
165
+ The keyword group is a Pandas dataframe with index equal to the index of data.
166
+
167
+ The keyword palette is a dict with colors to use for each unique member of group.
168
+
169
+ Example
170
+ import pycoda as coda
171
+ import pandas as pd
172
+
173
+ data = pd.read_csv('example/kilauea_iki_chem.csv')
174
+ mypca = coda.pca.Biplot(data)
175
+ mypca.plothulls()
176
+ mypca.removelabels()
177
+ mypca.plotloadinglabels(['FeO'])
178
+
179
+ ### Ternary diagram
180
+ pycoda.plot.ternary()
@@ -0,0 +1,166 @@
1
+ # pyCoDaMath
2
+
3
+ [![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/)
4
+
5
+
6
+ pyCoDaMath provides compositional data (CoDa) analysis tools for Python
7
+
8
+ - **Source code:** https://bitbucket.org/genomicepidemiology/pycoda
9
+
10
+ ## Getting Started
11
+
12
+ This package extends the Pandas dataframe object with various CoDa tools. It also provides a set of plotting functions for CoDa figures.
13
+
14
+ ### Installation
15
+
16
+ Clone the git repo to your local hard drive:
17
+
18
+ git clone https://brinch@bitbucket.org/genomicepidemiology/pycoda.git
19
+
20
+ Enter pycoda directory and type
21
+
22
+ pip install ./
23
+
24
+ ### Usage
25
+
26
+ The pyCoDaMath module is loaded as
27
+
28
+ import pycodamath
29
+
30
+ At this point, in order to get CLR values from a Pandas DataFrame df, do
31
+
32
+ df.coda.clr()
33
+
34
+
35
+ ## Documentation
36
+
37
+ ### CLR transformation - point estimate
38
+ df.coda.clr()
39
+
40
+ Returns centered logratio coefficients. If the data frame contains zeros, values
41
+ will be replaced by the Aitchison mean point estimate.
42
+
43
+ ### CLR transformation - standard deviation
44
+ df.coda.clr_std(n_samples=5000)
45
+
46
+ Returns the standard deviation of n_samples random draws in CLR space.
47
+
48
+ **Parameters**
49
+
50
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
51
+
52
+
53
+ ### ALR transformation - point estimate
54
+ df.coda.alr(part=None)
55
+
56
+ Same as clr() but returning additive logratio values. If part is None, then the last part of the composition is used, otherwise part is used as denominator.
57
+
58
+ **Parameters**
59
+
60
+ - part (str) - Name of the part to be used as denominator.
61
+
62
+
63
+ ### ALR transformation - standard deviation
64
+ df.coda.alr_std(part=None, n_samples=5000)
65
+
66
+ Same as clr_std, but in ALR space.
67
+
68
+ **Parameters**
69
+
70
+ - part (str) - Name of the part to be used as denominator.
71
+
72
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
73
+
74
+
75
+ ### ILR transformation - point estimate
76
+ df.coda.ilr(psi=None)
77
+
78
+ Same as clr() but for isometric logratio transform. An orthonormal basis can be
79
+ provided as psi. If no basis is given, a default sequential binary partition basis will be used.
80
+
81
+ **Parameters**
82
+
83
+ - psi (array_like) - Orthonormal basis.
84
+
85
+ ### ILR transformation - standard deviation
86
+ df.coda.ilr_std(psi=None, n_samples=5000)
87
+
88
+ This method does not exist (yet).
89
+
90
+
91
+ ### Bayesian zero replacement
92
+ df.coda.zero_replacement(n_samples=5000)
93
+
94
+ Returns a count table with zero values replaced by finite values using Bayesian inference.
95
+
96
+ **Parameters**
97
+
98
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
99
+
100
+
101
+ ### Closure
102
+ df.coda.closure(N)
103
+
104
+ Apply closure to constant N to the composition.
105
+
106
+ **Parameters**
107
+
108
+ - N (int) - Closure constant.
109
+
110
+ ### Total variance
111
+ df.coda.totvar()
112
+
113
+ Calculates the total variance of a set of compositions.
114
+
115
+ ### Geometric mean
116
+ df.coda.gmean()
117
+
118
+ Calculates the geometric mean of a set of compositions.
119
+
120
+ ### Centering
121
+ df.coda.center()
122
+
123
+ Centers (and scales) the composition by dividing by the geometric mean and powering by the reciprocal variance.
124
+
125
+
126
+
127
+ ## Plotting functions
128
+
129
+ ### PCA biplot
130
+ class pycoda.pca.Biplot(data, default=True)
131
+
132
+ Plots a PCA biplot. Set default to False for an empty plot.
133
+ The parameter data (DataFrame) is the data to be analyzed. Use counts, not CLR values.
134
+
135
+ A number of methods are available for customizing the biplot:
136
+
137
+ - plotloadings(cutoff=0, scale=None, labels=None)
138
+ - plotloadinglabels(labels=None)
139
+ - plotscores(group=None, palette=None, legend=True, labels=None)
140
+ - plotscorelables(labels=None)
141
+ - plotellipses(group=None, palette=None)
142
+ - plotcentroids(group=None, palette=None)
143
+ - plothulls(group=None, palette=None)
144
+ - plotcontours(group=None, palette=None, size=None, levels=None)
145
+ - removepatches()
146
+ - removescores()
147
+ - removelabels()
148
+
149
+ The keyword labels is a list of labelnames. If labels is None, all labels are plottet. Use labels=[] for no labels.
150
+
151
+ The keyword group is a Pandas dataframe with index equal to the index of data.
152
+
153
+ The keyword palette is a dict with colors to use to each unique member of group.
154
+
155
+ Example
156
+ import pycoda as coda
157
+ import pandas as pd
158
+
159
+ data = pd.read_csv('example/kilauea_iki_chem.csv')
160
+ mypca = coda.pca.Biplot(data)
161
+ mypca.plothulls()
162
+ mypca.removelabels()
163
+ mypca.plotloadinglabels(['FeO'])
164
+
165
+ ### Ternary diagram
166
+ pycoda.plot.ternary()
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pyCoDaMath"
7
+ version = "1.0"
8
+ authors = [
9
+ { name="Christian Brinch", email="cbri@food.dtu.dk" },
10
+ ]
11
+ description = "Compositional data (CoDa) analysis tools for Python"
12
+ readme = "README.md"
13
+ requires-python = ">=3.7"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ ]
19
+ dependencies = ['matplotlib>=3.1.1',
20
+ 'numpy>=1.17.2',
21
+ 'pandas>=0.25.1',
22
+ 'python-ternary>=1.0.6',
23
+ 'scipy>=1.3.1',
24
+ 'webcolors>=1.13',
25
+ 'adjustText==0.7.3',
26
+ ]
27
+
28
+ [project.urls]
29
+ "Homepage" = "https://bitbucket.org/genomicepidemiology/pycodamath"
30
+ "Bug Tracker" = "https://bitbucket.org/genomicepidemiology/pycodamath/issues?status=new&status=open&is_spam=!spam"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,180 @@
1
+ Metadata-Version: 2.1
2
+ Name: pyCoDaMath
3
+ Version: 1.0
4
+ Summary: Compositional data (CoDa) analysis tools for Python
5
+ Author-email: Christian Brinch <cbri@food.dtu.dk>
6
+ Project-URL: Homepage, https://bitbucket.org/genomicepidemiology/pycodamath
7
+ Project-URL: Bug Tracker, https://bitbucket.org/genomicepidemiology/pycodamath/issues?status=new&status=open&is_spam=!spam
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+
15
+ # pyCoDaMath
16
+
17
+ [![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/)
18
+
19
+
20
+ pyCoDaMath provides compositional data (CoDa) analysis tools for Python
21
+
22
+ - **Source code:** https://bitbucket.org/genomicepidemiology/pycoda
23
+
24
+ ## Getting Started
25
+
26
+ This package extends the Pandas dataframe object with various CoDa tools. It also provides a set of plotting functions for CoDa figures.
27
+
28
+ ### Installation
29
+
30
+ Clone the git repo to your local hard drive:
31
+
32
+ git clone https://brinch@bitbucket.org/genomicepidemiology/pycoda.git
33
+
34
+ Enter pycoda directory and type
35
+
36
+ pip install ./
37
+
38
+ ### Usage
39
+
40
+ The pyCoDaMath module is loaded as
41
+
42
+ import pycodamath
43
+
44
+ At this point, in order to get CLR values from a Pandas DataFrame df, do
45
+
46
+ df.coda.clr()
47
+
48
+
49
+ ## Documentation
50
+
51
+ ### CLR transformation - point estimate
52
+ df.coda.clr()
53
+
54
+ Returns centered logratio coefficients. If the data frame contains zeros, values
55
+ will be replaced by the Aitchison mean point estimate.
56
+
57
+ ### CLR transformation - standard deviation
58
+ df.coda.clr_std(n_samples=5000)
59
+
60
+ Returns the standard deviation of n_samples random draws in CLR space.
61
+
62
+ **Parameters**
63
+
64
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
65
+
66
+
67
+ ### ALR transformation - point estimate
68
+ df.coda.alr(part=None)
69
+
70
+ Same as clr() but returning additive logratio values. If part is None, then the last part of the composition is used, otherwise part is used as denominator.
71
+
72
+ **Parameters**
73
+
74
+ - part (str) - Name of the part to be used as denominator.
75
+
76
+
77
+ ### ALR transformation - standard deviation
78
+ df.coda.alr_std(part=None, n_samples=5000)
79
+
80
+ Same as clr_std, but in ALR space.
81
+
82
+ **Parameters**
83
+
84
+ - part (str) - Name of the part to be used as denominator.
85
+
86
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
87
+
88
+
89
+ ### ILR transformation - point estimate
90
+ df.coda.ilr(psi=None)
91
+
92
+ Same as clr() but for isometric logratio transform. An orthonormal basis can be
93
+ provided as psi. If no basis is given, a default sequential binary partition basis will be used.
94
+
95
+ **Parameters**
96
+
97
+ - psi (array_like) - Orthonormal basis.
98
+
99
+ ### ILR transformation - standard deviation
100
+ df.coda.ilr_std(psi=None, n_samples=5000)
101
+
102
+ This method does not exist (yet).
103
+
104
+
105
+ ### Bayesian zero replacement
106
+ df.coda.zero_replacement(n_samples=5000)
107
+
108
+ Returns a count table with zero values replaced by finite values using Bayesian inference.
109
+
110
+ **Parameters**
111
+
112
+ - n_samples (int) - Number of random draws from a Dirichlet distribution.
113
+
114
+
115
+ ### Closure
116
+ df.coda.closure(N)
117
+
118
+ Apply closure to constant N to the composition.
119
+
120
+ **Parameters**
121
+
122
+ - N (int) - Closure constant.
123
+
124
+ ### Total variance
125
+ df.coda.totvar()
126
+
127
+ Calculates the total variance of a set of compositions.
128
+
129
+ ### Geometric mean
130
+ df.coda.gmean()
131
+
132
+ Calculates the geometric mean of a set of compositions.
133
+
134
+ ### Centering
135
+ df.coda.center()
136
+
137
+ Centers (and scales) the composition by dividing by the geometric mean and powering by the reciprocal variance.
138
+
139
+
140
+
141
+ ## Plotting functions
142
+
143
+ ### PCA biplot
144
+ class pycoda.pca.Biplot(data, default=True)
145
+
146
+ Plots a PCA biplot. Set default to False for an empty plot.
147
+ The parameter data (DataFrame) is the data to be analyzed. Use counts, not CLR values.
148
+
149
+ A number of methods are available for customizing the biplot:
150
+
151
+ - plotloadings(cutoff=0, scale=None, labels=None)
152
+ - plotloadinglabels(labels=None)
153
+ - plotscores(group=None, palette=None, legend=True, labels=None)
154
+ - plotscorelabels(labels=None)
155
+ - plotellipses(group=None, palette=None)
156
+ - plotcentroids(group=None, palette=None)
157
+ - plothulls(group=None, palette=None)
158
+ - plotcontours(group=None, palette=None, size=None, levels=None)
159
+ - removepatches()
160
+ - removescores()
161
+ - removelabels()
162
+
163
+ The keyword labels is a list of label names. If labels is None, all labels are plotted. Use labels=[] for no labels.
164
+
165
+ The keyword group is a Pandas dataframe with index equal to the index of data.
166
+
167
+ The keyword palette is a dict with colors to use for each unique member of group.
168
+
169
+ Example
170
+ import pycoda as coda
171
+ import pandas as pd
172
+
173
+ data = pd.read_csv('example/kilauea_iki_chem.csv')
174
+ mypca = coda.pca.Biplot(data)
175
+ mypca.plothulls()
176
+ mypca.removelabels()
177
+ mypca.plotloadinglabels(['FeO'])
178
+
179
+ ### Ternary diagram
180
+ pycoda.plot.ternary()
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/pyCoDaMath.egg-info/PKG-INFO
5
+ src/pyCoDaMath.egg-info/SOURCES.txt
6
+ src/pyCoDaMath.egg-info/dependency_links.txt
7
+ src/pyCoDaMath.egg-info/requires.txt
8
+ src/pyCoDaMath.egg-info/top_level.txt
9
+ src/pycodamath/__init__.py
10
+ src/pycodamath/extra.py
11
+ src/pycodamath/pca.py
12
+ src/pycodamath/plot.py
13
+ src/pycodamath/pycoda.py
@@ -0,0 +1,7 @@
1
+ matplotlib>=3.1.1
2
+ numpy>=1.17.2
3
+ pandas>=0.25.1
4
+ python-ternary>=1.0.6
5
+ scipy>=1.3.1
6
+ webcolors>=1.13
7
+ adjustText==0.7.3
@@ -0,0 +1 @@
1
+ pycodamath
@@ -0,0 +1,13 @@
1
'''
pyCoDa init script
'''

__title__ = "pyCoDaMath"
__author__ = "Christian Brinch"
__email__ = "cbri@food.dtu.dk"
__copyright__ = "Copyright 2019 C. Brinch"
# Fix: __version__ must be a string (PEP 396 convention); the original
# assigned the float 1.0, which breaks any consumer doing string
# operations such as __version__.split('.') and disagrees with the
# string version "1.0" declared in pyproject.toml / PKG-INFO.
__version__ = "1.0"
__all__ = ['pycoda', 'extra', 'plot', 'pca']

from . import pycoda, pca
pycoda.init()
@@ -0,0 +1,128 @@
1
+ # -*- coding: utf-8 -*-
2
+ ''' Auxiliary functions for pyCoDaMath
3
+ '''
4
+
5
+ __author__ = "Christian Brinch"
6
+ __copyright__ = "Copyright 2019"
7
+ __credits__ = ["Christian Brinch"]
8
+ __license__ = "AFL 3.0"
9
+ __version__ = "1.0"
10
+ __maintainer__ = "Christian Brinch"
11
+ __email__ = "cbri@food.dtu.dk"
12
+
13
+ import math
14
+ import numpy as np
15
+ from matplotlib.patches import Ellipse
16
+
17
+
18
def sbp_basis(obj):
    ''' Build the default sequential binary partition (SBP) basis used by
    the ILR transformation.

    obj : array-like whose second dimension is the number of parts D.
    Returns a (D-1, D) orthonormal contrast matrix; raises via
    check_basis() if the constructed basis is not orthonormal.
    '''
    n_parts = np.shape(obj)[1]
    psi = np.zeros([n_parts - 1, n_parts])
    for row in range(n_parts - 1):
        # Balance `row` contrasts the first (group-1) parts against part
        # number `group`, where group shrinks as row grows.
        group = n_parts - row
        plus_weight = np.sqrt(1. / ((group - 1) * group))
        for col in range(n_parts):
            if col + 1 <= group - 1:
                psi[row, col] = plus_weight
            elif col + 1 == group:
                psi[row, col] = -np.sqrt((group - 1) / group)

    check_basis(psi)
    return psi
31
+
32
+
33
def norm(balances):
    ''' Normalize a matrix of sign-coded balances into an orthonormal basis.

    balances : iterable of rows whose entries are positive, negative or
    zero, marking the two groups of each balance.  Positive entries are
    replaced by +weight/n_pos and negative ones by -weight/n_neg so each
    row becomes a unit-norm contrast; zeros stay zero.  The result is
    validated with check_basis() before being returned.
    '''
    rows = []
    for balance in balances:
        n_neg = sum(1 for entry in balance if entry < 0)
        n_pos = sum(1 for entry in balance if entry > 0)
        normalized = []
        for entry in balance:
            if entry > 0:
                normalized.append(1 / n_pos * np.sqrt(n_pos * n_neg / (n_pos + n_neg)))
            elif entry < 0:
                normalized.append(-1 / n_neg * np.sqrt(n_pos * n_neg / (n_pos + n_neg)))
            else:
                normalized.append(0)
        rows.append(normalized)

    psi = np.array(rows)
    check_basis(psi)
    return psi
45
+
46
+
47
def check_basis(psi):
    ''' Verify that *psi* is an orthonormal basis.

    psi : array of shape (k, D) whose rows are the basis vectors.
    Raises AttributeError when the rows are not unit-norm ("not
    normalized") or not mutually orthogonal.
    '''
    ident = np.matmul(psi, psi.T)
    # Fix: the normalization test used exact float inequality
    # (np.trace(ident) != k), which spuriously rejected valid bases whose
    # unit norms carry rounding error.  Use the same 1e-6 tolerance the
    # orthogonality test below already uses.
    if np.abs(np.trace(ident) - np.shape(ident)[0]) > 1e-6:
        raise AttributeError("Error: Basis is not normalized.")
    if np.abs(np.sum(ident - np.diag(np.diagonal(ident)))) > 1e-6:
        raise AttributeError("Error: Basis is not orthogonal.")
54
+
55
+
56
def points_in_ellipse(ellipse, npoints):
    ''' Sample *npoints* evenly spaced points along the edge of an ellipse.

    ellipse : dict with keys 'shape' (the two semi-axes), 'angle'
    (orientation in radians) and 'center' (x, y).
    Returns a list of (x, y) tuples tracing the rotated, translated edge.
    '''
    semi_a = ellipse['shape'][0]
    semi_b = ellipse['shape'][1]
    # Rotation by -angle maps axis-aligned edge points into data space.
    cos_rot = math.cos(-ellipse['angle'])
    sin_rot = math.sin(-ellipse['angle'])
    center_x = ellipse['center'][0]
    center_y = ellipse['center'][1]

    edge = []
    for t in np.linspace(0, 2 * np.pi, npoints):
        px = semi_a * math.cos(t) * cos_rot - semi_b * math.sin(t) * sin_rot + center_x
        py = semi_a * math.cos(t) * sin_rot + semi_b * math.sin(t) * cos_rot + center_y
        edge.append((px, py))
    return edge
63
+
64
+
65
def check_point_in_ellipse(scores, ellipse):
    ''' Decide whether the point *scores* = (x, y) lies outside the ellipse.

    The point is translated to the ellipse center and rotated by -angle so
    the ellipse axes align with the coordinate axes, then the canonical
    ellipse equation is evaluated with a 25% slack.

    Returns True when the point is outside (an outlier), False otherwise.
    '''
    dx = scores[0] - ellipse['center'][0]
    dy = scores[1] - ellipse['center'][1]

    cos_rot = np.cos(-ellipse['angle'])
    sin_rot = np.sin(-ellipse['angle'])
    x_aligned = dx * cos_rot - dy * sin_rot
    y_aligned = dx * sin_rot + dy * cos_rot

    radius_sq = (x_aligned ** 2 / ellipse['shape'][0] ** 2
                 + y_aligned ** 2 / ellipse['shape'][1] ** 2)
    return bool(radius_sq > 1.25)
78
+
79
+
80
def get_covariance_ellipse(data, conf=95):
    ''' Return a covariance ellipse object.

    data : pandas DataFrame with exactly two integer-labelled columns
        (0 and 1) holding the x and y coordinates of the points.
    conf : confidence level in percent; one of {90, 95, 99}.

    Returns a dict with keys 'shape' (the two semi-axis lengths),
    'angle' (orientation in radians) and 'center' (mean point).
    Raises AttributeError on a wrong column count or conf value.
    '''
    if len(data.columns) > 2:
        raise AttributeError(
            ("Error: get_covariance_ellipse expects only two columns. " +
             "Got {0:d}.").format(len(data.columns)))

    # Eigen-decomposition of the 2x2 covariance matrix: the square roots
    # of the eigenvalues are the unscaled semi-axes; `angle` is actually
    # the eigenvector matrix, from which the orientation is derived below.
    lambda_, angle = np.linalg.eig(np.cov(data.loc[:, 0], data.loc[:, 1]))
    lambda_ = np.sqrt(lambda_)

    # Scale factors are chi-square quantiles with 2 degrees of freedom
    # for the requested confidence level.
    if conf == 95:
        scale = 5.991  # 95% confidence interval
    elif conf == 90:
        scale = 4.605  # 90%
    elif conf == 99:
        scale = 9.210  # 99%
    else:
        raise AttributeError(
            "Error: get_covariance_ellipse parameter conf can only accept values {90, 95, 99}.")

    # Orientation from the first eigenvector; arctan (not arctan2) folds
    # the result into (-pi/2, pi/2), which suffices for an ellipse.
    return {'shape': (lambda_[0]*np.sqrt(scale), lambda_[1]*np.sqrt(scale)),
            # 'angle': np.arccos(-angle[0, 0]),
            'angle': np.arctan(angle[1, 0]/angle[0, 0]),
            'center': (np.mean(data.loc[:, 0]), np.mean(data.loc[:, 1]))}
104
+
105
+
106
def plot_covariance_ellipse(axis, ellipse, color=None):
    ''' Draw a covariance ellipse on a matplotlib axis.

    axis : matplotlib Axes to draw on.
    ellipse : dict with 'center', 'shape' (semi-axes) and 'angle'
        (radians), as produced by get_covariance_ellipse().
    color : matplotlib color for outline and fill; defaults to black.

    Fix: the default used to be ``color=0`` — not a valid matplotlib
    color — so the ``None -> 'black'`` fallback below could never fire
    when the argument was omitted.  The default is now None.
    '''
    if color is None:
        color = 'black'
    # Outline: unfilled ellipse with a solid, semi-opaque edge.
    ell = Ellipse(xy=ellipse['center'],
                  width=2*ellipse['shape'][0],
                  height=2*ellipse['shape'][1],
                  angle=np.rad2deg(ellipse['angle']),
                  alpha=0.5,
                  edgecolor=color,
                  fill=False,
                  lw=1.5,
                  ls='-')
    axis.add_artist(ell)
    # Interior: a second, strongly translucent filled ellipse of the
    # same geometry, giving the washed-out fill effect.
    ell = Ellipse(xy=ellipse['center'],
                  width=2*ellipse['shape'][0],
                  height=2*ellipse['shape'][1],
                  angle=np.rad2deg(ellipse['angle']),
                  alpha=0.15,
                  edgecolor=None,
                  fill=True,
                  color=color)
    axis.add_artist(ell)
@@ -0,0 +1,341 @@
1
+ ''' Class and methods for making compositional biplots based on PCA '''
2
+
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import webcolors as wc
6
+ from matplotlib.colors import ListedColormap
7
+ from matplotlib import cm
8
+ import matplotlib.patches as mpatches
9
+ import pandas as pd
10
+ import scipy.stats as st
11
+ from pycodamath import extra
12
+
13
+
14
class GeomObj():
    ''' A generic bag of geometric attributes.

    Every keyword argument becomes an instance attribute.  A polygon
    passed as ``vertices=(xs, ys)`` additionally gets its enclosed area
    computed and stored as ``self.area``.
    '''

    def __init__(self, **kwargs):
        for name, value in kwargs.items():
            setattr(self, name, value)
        self.area = self.polyarea()

    def polyarea(self):
        ''' Area of the polygon described by the two coordinate lists in
        self.vertices, via the shoelace formula. '''
        xs, ys = self.vertices
        cross = np.dot(xs, np.roll(ys, 1)) - np.dot(ys, np.roll(xs, 1))
        return 0.5 * np.abs(cross)
25
+
26
+
27
def scree_plot(axis, eig_val):
    ''' Make a scree plot from eigenvalues.

    axis : matplotlib Axes to draw the bar chart on.
    eig_val : 1-D array of singular values; each bar shows that
        component's share of the total squared variance, and the first
        (up to five) bars are annotated with the cumulative percentage.
    '''
    axis.set_xlabel('Component')
    # Fix: the y-label previously read 'Explained varaince' (typo).
    axis.set_ylabel('Explained variance')
    axis.set_xlim(0, min(len(eig_val)+1, 20))
    axis.bar(np.arange(len(eig_val))+1, (eig_val/np.sum(eig_val))**2)
    csum = np.cumsum(eig_val**2/np.sum(eig_val**2))
    for i in range(min(5, len(eig_val))):
        axis.annotate(str(np.round(csum[i]*100))+'%',
                      (i+1.2, (eig_val[i]/np.sum(eig_val))**2))
37
+
38
+
39
def _get_palette(group):
    ''' Assign a distinct color to each unique value in *group*.

    Colors are sampled evenly from the jet colormap; returns a dict
    mapping each unique group member to an RGBA tuple.
    '''
    unique_items = set(group)
    colors = cm.jet(np.linspace(0, 1, len(unique_items)))
    return {item: colors[pos] for pos, item in enumerate(unique_items)}
45
+
46
+
47
+ def _svd(clr):
48
+ ''' Internal SVD function '''
49
+ scores, eig_val, loadings = np.linalg.svd(clr)
50
+ scores = pd.DataFrame(scores.T[0:2, :], columns=clr.index, index=['pc1', 'pc2'])
51
+ loadings = pd.DataFrame(np.inner(eig_val*np.identity(len(eig_val)),
52
+ loadings.T[0:len(eig_val), 0:len(eig_val)])[0:2],
53
+ columns=clr.columns[0:len(eig_val)], index=['pc1', 'pc2'])
54
+ return scores, eig_val, loadings
55
+
56
+
57
+ class Biplot():
58
+ ''' A class to create and a PCA biplot '''
59
+
60
+ def __init__(self, data, axis=None, default=True):
61
+ if axis is None:
62
+ _, self.axis = plt.subplots(figsize=(7.8, 7.8))
63
+ else:
64
+ self.axis = axis
65
+ self.axis.set(adjustable='box', aspect='equal')
66
+ self.scores, eig_val, self.loadings = _svd(data.coda.center().coda.scale().coda.clr())
67
+ scales = [np.max(np.abs(self.loadings.values)),
68
+ [np.max(np.abs(self.scores.loc[idx].values)) for idx in ['pc1', 'pc2']]]
69
+
70
+
71
+ self.axis.set_xlabel(f'P.C. 1 ({np.round(eig_val[0]**2 / np.sum(eig_val**2)*100):.1f}% explained variation)')
72
+ self.axis.set_ylabel(f'P.C. 2 ({np.round(eig_val[1]**2 / np.sum(eig_val**2)*100):.1f}% explained variation)')
73
+ self.axis.set_xlim(-scales[0]*1.1, scales[0]*1.1)
74
+ self.axis.set_ylim(-scales[0]*1.1, scales[0]*1.1)
75
+ self.axis.plot([self.axis.get_xlim()[0], self.axis.get_xlim()[1]],
76
+ [0.0, 0.0], '--', color='black', alpha=0.4)
77
+ self.axis.plot([0.0, 0.0], [self.axis.get_ylim()[0], self.axis.get_ylim()[1]],
78
+ '--', color='black', alpha=0.4)
79
+
80
+ self.scores = (scales[0]*(self.scores.T/scales[1])).T
81
+
82
+ self.patches = []
83
+ self.geomobj = {}
84
+ plt.tight_layout()
85
+
86
+ if default:
87
+ self.plotloadings()
88
+ self.plotscores()
89
+
90
+ def plotloadings(self, cutoff=0, scale=None, labels=None):
91
+ ''' Plot loadings '''
92
+ if scale is None:
93
+ scale = np.max(np.abs(self.loadings.values))
94
+
95
+ for column in self.loadings:
96
+ if np.sqrt(pow(self.loadings.loc['pc1', column], 2) +
97
+ pow(self.loadings.loc['pc2', column], 2)) > cutoff:
98
+ self.axis.arrow(0, 0,
99
+ self.loadings.loc['pc1', column],
100
+ self.loadings.loc['pc2', column],
101
+ facecolor='black',
102
+ alpha=0.5,
103
+ linewidth=0.,
104
+ width=scale*0.01,
105
+ zorder=2000)
106
+ self.plotloadinglabels(labels, cutoff)
107
+
108
+ def plotloadinglabels(self, labels=None, cutoff=0):
109
+ ''' Add labels to the loadings '''
110
+ if labels is None:
111
+ labels = self.loadings.columns
112
+
113
+ for column in labels:
114
+ if np.sqrt(pow(self.loadings.loc['pc1', column], 2) +
115
+ pow(self.loadings.loc['pc2', column], 2)) > cutoff:
116
+ yoff = 0.
117
+ if self.loadings.loc['pc1', column] > 0.9*self.axis.get_xlim()[1]:
118
+ xoff = -1.2
119
+ else:
120
+ xoff = 0
121
+ self.axis.annotate(column, (self.loadings.loc['pc1', column]+xoff,
122
+ self.loadings.loc['pc2', column]+yoff),
123
+ ha='left',
124
+ va='bottom',
125
+ alpha=1.0,
126
+ zorder=2001
127
+ )
128
+
129
+ def plotscores(self, group=None, palette=None, legend=True, labels=None):
130
+ ''' Plot scores as points '''
131
+ if labels is None:
132
+ labels = self.scores.columns
133
+
134
+ if palette is None:
135
+ if group is not None:
136
+ palette = _get_palette(group)
137
+ else:
138
+ palette = 'steelblue'
139
+
140
+ if group is None:
141
+ self.axis.plot(*self.scores[labels].values, 'o', alpha=0.5,
142
+ color=palette, zorder=7, markeredgewidth=0)
143
+ else:
144
+ for item in set(group):
145
+ idx = group.loc[group == item].index
146
+ self.axis.plot(*self.scores[idx].values, 'o', alpha=0.5, zorder=7,
147
+ label=item, color=palette[item], markeredgewidth=0)
148
+ if legend:
149
+ self.patches.append(mpatches.Patch(color=palette[item], label=item))
150
+
151
+ def plotscorelabels(self, labels=None):
152
+ ''' Add labels to the scores '''
153
+ if labels is None:
154
+ labels = self.scores.columns
155
+
156
+ for label in labels:
157
+ self.axis.annotate(label, (self.scores.loc['pc1', label],
158
+ self.scores.loc['pc2', label]),
159
+ ha='left',
160
+ va='bottom',
161
+ alpha=0.8,
162
+ zorder=201,
163
+ size=8
164
+ )
165
+
166
+ def plotellipses(self, group, palette=None, legend=False):
167
+ ''' Plot confidence ellipses '''
168
+ if palette is None:
169
+ palette = _get_palette(group)
170
+
171
+ for item in set(group):
172
+ idx = group.loc[group == item].index
173
+ if len(idx) > 3:
174
+ ellipse = extra.get_covariance_ellipse(pd.DataFrame(self.scores[idx].values.T),
175
+ conf=90)
176
+ extra.plot_covariance_ellipse(self.axis, ellipse, color=palette[item])
177
+ if legend:
178
+ self.patches.append(mpatches.Patch(color=palette[item], label=item))
179
+
180
+ def plotcentroids(self, group, palette=None, legend=False):
181
+ ''' Plot score group centroids '''
182
+ if palette is None:
183
+ palette = _get_palette(group)
184
+
185
+ for item in set(group):
186
+ idx = group.loc[group == item].index
187
+ length = len(self.scores[idx].T)
188
+ sum_x = np.sum(self.scores.loc['pc1', idx])
189
+ sum_y = np.sum(self.scores.loc['pc2', idx])
190
+ self.axis.plot([sum_x/length], [sum_y/length], 'x', alpha=0.7,
191
+ label=item, color=palette[item], markersize=24)
192
+ if legend:
193
+ self.patches.append(mpatches.Patch(color=palette[item], label=item))
194
+
195
    def plothulls(self, group, palette=None, legend=True):
        ''' Plot a filled convex hull around the scores of each group.

        group : pandas Series of group labels indexed by sample name.
        palette : optional mapping of group label -> colour; defaults to
                  _get_palette(group).
        legend : if True, append a legend patch per group.
        '''
        if palette is None:
            palette = _get_palette(group)

        self.geomobj = {}
        for item in set(group):
            idx = group.loc[group == item].index
            # A hull is only constructed for groups with more than 3 points.
            if len(idx) > 3:
                # My secret hull construction algorithm
                # (appears to be a gift-wrapping style walk: start at the
                # point with minimal pc1 and repeatedly pick the next hull
                # vertex via the orientation test below — TODO confirm).
                idxmin = self.scores.loc['pc1', idx].idxmin()
                j = self.scores[idx].columns.get_loc(idxmin)
                hull = [list(self.scores[idxmin])]
                # Walk until the wrap returns to the starting vertex.
                while (j != self.scores[idx].columns.get_loc(idxmin) or len(hull) == 1):
                    k = (j + 1) % len(idx)
                    for i in range(len(idx)):
                        # 2-D cross-product orientation test on the
                        # (pc1, pc2) rows; point i replaces candidate k
                        # when the signed area is negative.
                        if (self.scores[idx].iloc[1, k]-self.scores[idx].iloc[1, j]) * \
                                (self.scores[idx].iloc[0, i]-self.scores[idx].iloc[0, k]) - \
                                (self.scores[idx].iloc[0, k]-self.scores[idx].iloc[0, j]) * \
                                (self.scores[idx].iloc[1, i]-self.scores[idx].iloc[1, k]) < 0:
                            k = i
                    j = k
                    hull.append(list(self.scores[self.scores[idx].columns[k]]))
                # Store vertices transposed as ([x0, x1, ...], [y0, y1, ...]).
                self.geomobj[item] = GeomObj(vertices=tuple(map(list, zip(*hull))))

        # Draw hulls largest-area first so smaller hulls stay visible on top.
        for idx, item in enumerate(sorted(self.geomobj,
                                          key=lambda x: self.geomobj[x].area, reverse=True)):
            self.axis.fill(*self.geomobj[item].vertices,
                           color=palette[item], alpha=0.7, zorder=10+(2*idx))
            # Second fill draws only the black outline, one z-level above.
            self.axis.fill(*self.geomobj[item].vertices, facecolor='none',
                           edgecolor='black', alpha=0.9, linewidth=2.2, zorder=11+(2*idx))

            if legend:
                self.patches.append(mpatches.Patch(color=palette[item], label=item))
229
+
230
    def plotcontours(self, group, palette=None, legend=True,
                     plot_outliers=True, percent_outliers=0.1, linewidth=2.2):
        ''' Plot scores as filled KDE density contours per group.

        The lowest contour level is tuned iteratively so that roughly
        percent_outliers of each group's points fall outside the outer
        contour; those leftover points can optionally be plotted
        individually.

        group : pandas Series of group labels indexed by sample name.
        palette : optional mapping of group label -> colour.
        legend : if True, append a legend patch per group.
        plot_outliers : if True, scatter the points left outside.
        percent_outliers : target fraction (0-1) of points outside.
        linewidth : width of the black contour outlines.
        '''
        if palette is None and group is not None:
            palette = _get_palette(group)
        if percent_outliers > 1 or percent_outliers < 0:
            raise Exception('Percent_outliers has to be between 0 and 1')

        # Build color maps
        # One 4-level white-to-group-colour colormap per group.
        cmap = {}
        for item in set(group):
            colorvalues = np.ones((4, 4))
            if '#' in str(palette[item]):
                # Hex string colour.
                color = wc.hex_to_rgb(palette[item])
            elif palette[item][-1] != 1:
                # Presumably a named colour (e.g. 'red') — TODO confirm.
                color = wc.name_to_rgb(palette[item])
            else:
                # Assumed to already be an RGB(A) tuple.
                color = palette[item]

            for i in range(3):
                # Ramp each RGB channel from white towards the group colour.
                colorvalues[:, i] = np.linspace(1, color[i]/256., 5)[1:]
            # Alpha decreases towards the innermost level.
            colorvalues[:, 3] = np.linspace(.95, .25, 4)
            cmap[item] = ListedColormap(colorvalues)

        self.geomobj = {}
        for item in set(group):
            minlevel = 0.2
            diff = 100
            k = 0
            # Iterate (at most 25 times) until the number of points outside
            # the outer contour matches percent_outliers of the group size.
            while abs(diff) > 0 and k < 25:
                levels = np.arange(5)*(1.-minlevel)/4.+minlevel
                idx = group.loc[group == item].index
                # 300x300 evaluation grid spanning the current axis limits.
                xgrid, ygrid = np.mgrid[self.axis.get_xlim()[0]: self.axis.get_xlim()[1]: 300j,
                                        self.axis.get_ylim()[0]: self.axis.get_ylim()[1]: 300j]
                positions = np.vstack([xgrid.ravel(), ygrid.ravel()])
                values = np.vstack([self.scores.loc['pc1', idx], self.scores.loc['pc2', idx]])
                # Gaussian kernel density estimate of this group's scores.
                kernel = st.gaussian_kde(values)
                density = np.reshape(kernel(positions).T, xgrid.shape)
                vals = np.max(density)*levels
                # Draw temporary contours only to obtain their paths.
                self.axis.contour(xgrid, ygrid, density, vals)
                # NOTE(review): the [-4]/[-5] indices assume the contour call
                # appended exactly five level collections — fragile against
                # matplotlib version changes; verify on upgrade.
                vertices = self.axis.collections[-4].get_paths()[0].vertices.T
                contained = [False for _ in range(len(idx))]
                for j in range(len(self.axis.collections[-5].get_paths())):
                    contained = np.logical_or(contained,
                                              self.axis.collections[-5].get_paths()[j].contains_points(
                                                  [[self.scores.loc['pc1', i],
                                                    self.scores.loc['pc2', i]] for i in idx]))
                # Remove the five temporary contour collections again.
                _ = [self.axis.collections[-1].remove() for _ in np.arange(5)]
                outside = [a for a, b in zip(list(idx), contained) if not b]

                # Nudge the lowest level towards the target outlier count.
                diff = round(percent_outliers*len(idx))-len(outside)
                minlevel = minlevel+diff/1000.
                k += 1

            self.geomobj[item] = GeomObj(vertices=vertices, grid=(
                xgrid, ygrid), density=density, values=vals, outside=outside)

        # Draw the largest contour sets first so smaller ones sit on top.
        for idx, item in enumerate(sorted(self.geomobj,
                                          key=lambda x: self.geomobj[x].area, reverse=True)):
            self.axis.contourf(*self.geomobj[item].grid, self.geomobj[item].density,
                               self.geomobj[item].values, antialiased=True,
                               cmap=cmap[item], alpha=0.9, zorder=10+(2*idx))
            self.axis.contour(*self.geomobj[item].grid, self.geomobj[item].density,
                              self.geomobj[item].values, antialiased=True,
                              colors='black', alpha=0.9, linewidths=linewidth, zorder=11+(2*idx))
            # Drop the last black outline collection again.
            self.axis.collections[-1].remove()

            if plot_outliers:
                # Scatter the points that were left outside the contours.
                self.plotscores(None, palette[item], False, self.geomobj[item].outside)

            if legend:
                self.patches.append(mpatches.Patch(color=palette[item], label=item))
302
+
303
+ def labeloutliers(self, group, conf=3.):
304
+ ''' Print labels on scores that are more than conf away from centroid '''
305
+ for item in set(group):
306
+ idx = group.loc[group == item].index
307
+ length = len(self.scores[idx].T)
308
+ sum_x = np.sum(self.scores.loc['pc1', idx])
309
+ sum_y = np.sum(self.scores.loc['pc2', idx])
310
+
311
+ pdist = {i: np.sqrt(pow(self.scores.loc['pc1', i]-sum_x/length, 2) +
312
+ pow(self.scores.loc['pc2', i]-sum_y/length, 2)) for i in idx}
313
+ std = np.std(pdist.values())
314
+
315
+ outliers = [i for i in pdist.keys() if pdist[i] > conf*std]
316
+ self.plotscorelabels(outliers)
317
+
318
+ def displaylegend(self, loc=2):
319
+ ''' Display the item legend at location loc '''
320
+ patches = sorted(self.patches, key=lambda x: x._label)
321
+ self.axis.legend(handles=patches, fontsize=9, frameon=False, loc=loc)
322
+
323
+ def removepatches(self):
324
+ ''' remove arrows and polygons from plot '''
325
+ for _ in range(len(self.axis.patches)):
326
+ self.axis.patches[-1].remove()
327
+
328
+ def removelabels(self):
329
+ ''' remove labels from plot '''
330
+ for _ in range(len(self.axis.texts)):
331
+ self.axis.texts[-1].remove()
332
+
333
+ def removescores(self):
334
+ ''' remove points from plot '''
335
+ for _ in range(len(self.axis.lines)):
336
+ self.axis.lines[-1].remove()
337
+
338
+ def removecontours(self):
339
+ ''' remove points from plot '''
340
+ for _ in range(len(self.axis.collections)):
341
+ self.axis.collections[-1].remove()
@@ -0,0 +1,71 @@
1
+ # -*- coding: utf-8 -*-
2
+ ''' Compositional plot
3
+ '''
4
+
5
+ __author__ = "Christian Brinch"
6
+ __copyright__ = "Copyright 2019"
7
+ __credits__ = ["Christian Brinch"]
8
+ __license__ = "AFL 3.0"
9
+ __version__ = "1.0"
10
+ __maintainer__ = "Christian Brinch"
11
+ __email__ = "cbri@food.dtu.dk"
12
+
13
+ import ternary as td
14
+ import numpy as np
15
+ import pandas as pd
16
+ from pycodamath import extra
17
+
18
+
19
def ternary(data, descr=None, center=False, conf=False):
    ''' Plot a ternary diagram of a three-part composition.

    Parameters
    ----------
    data : pandas.DataFrame
        Composition with three columns (parts); every row (sample) must
        sum to 100.
    descr : pandas.Series, optional
        Group labels indexed like data's rows; when given, each group is
        scattered with a separate call.
    center : bool
        If True, divide by the geometric mean and re-close to 100 before
        plotting (compositional centering).
    conf : bool
        If True, draw a covariance ellipse computed in ILR coordinates
        and mapped back onto the simplex.

    Returns
    -------
    The ternary axis object (tax).

    Raises
    ------
    AttributeError
        If data has more than three parts, or any sample does not close
        to 100.
    '''
    if np.shape(data)[1] > 3:
        raise AttributeError("Error: Too many parts in composition (max. 3).")
    # data.T's columns are the samples; each must sum to 100.
    for column in data.T:
        if np.abs(data.T[column].sum()-100.) > 1e-6:
            raise AttributeError("Error: Composition is not closed to 100.")

    # Set up the triangular axes; scale 100 means percentage coordinates.
    _, tax = td.figure(scale=100)
    tax.boundary(linewidth=1.5)
    tax.gridlines(color="blue", multiple=10, linewidth=0.5, alpha=0.5)
    tax.left_axis_label(f"% {data.columns[0]:s}", fontsize=16, offset=0.14)
    tax.right_axis_label(f"% {data.columns[1]:s}", fontsize=16, offset=0.14)
    tax.bottom_axis_label(f"% {data.columns[2]:s}", fontsize=16, offset=0.12)
    tax.ticks(axis='lbr', linewidth=1, multiple=10, offset=0.03)
    tax.clear_matplotlib_ticks()
    tax.get_axes().axis('off')

    if center:
        # Perturb by the inverse geometric mean, then re-close to 100.
        sdata = (data/data.coda.gmean()).coda.closure(100)
    else:
        sdata = data

    if descr is not None:
        # One scatter call per group.
        for group in set(descr):
            idx = descr.loc[descr == group].index
            # Columns are passed in reversed order (matching the axis
            # ordering expected by the ternary library — TODO confirm).
            tax.scatter(sdata.loc[idx, [sdata.columns[2], sdata.columns[1],
                                        sdata.columns[0]]].values, alpha=0.7)
    else:
        tax.scatter(sdata.loc[:, [sdata.columns[2], sdata.columns[1],
                                  sdata.columns[0]]].values, alpha=0.7,
                    color='steelblue')

    if conf:
        # Covariance ellipse in the first two ILR coordinates.
        ilr = sdata.coda.ilr().loc[:, [0, 1]]
        par = extra.get_covariance_ellipse(ilr)

        # Parametric trace of the rotated ellipse in ILR space.
        # NOTE(review): the second coordinate mixes cos/sin(angle) in an
        # unusual order for a standard rotation matrix — confirm against
        # extra.get_covariance_ellipse's angle convention.
        points = [[par['center'][0] +
                   par['shape'][0]*np.cos(par['angle'])*np.cos(a) -
                   par['shape'][1]*np.sin(par['angle'])*np.sin(a),
                   par['center'][1] +
                   par['shape'][0]*np.cos(par['angle'])*np.sin(a) +
                   par['shape'][1]*np.sin(par['angle'])*np.cos(a)]
                  for a in np.linspace(0, 2*np.pi, 100)]

        psi = extra.sbp_basis(sdata)

        # Map the ellipse back to the simplex (inverse ILR) and close to 100.
        ellipse = pd.DataFrame(np.exp(np.matmul(points, psi))).coda.closure(100)
        ellipse = ellipse.loc[:, [ellipse.columns[1], ellipse.columns[0],
                                  ellipse.columns[2]]]
        tax.plot(ellipse.values, color='black', lw=0.5, ls='-')

    return tax
@@ -0,0 +1,188 @@
1
+ # -*- coding: utf-8 -*-
2
+ ''' CoDa related extensions to pandas dataframes
3
+ '''
4
+
5
+ __author__ = "Christian Brinch"
6
+ __copyright__ = "Copyright 2019"
7
+ __credits__ = ["Christian Brinch"]
8
+ __license__ = "AFL 3.0"
9
+ __version__ = "1.0"
10
+ __maintainer__ = "Christian Brinch"
11
+ __email__ = "cbri@food.dtu.dk"
12
+
13
+ import pandas as pd
14
+ import numpy as np
15
+ import scipy.stats as ss
16
+ import scipy.special as sp
17
+ from pycodamath import extra
18
+
19
+
20
+ def _clr_internal(obj):
21
+ return (np.log(obj.T) - np.mean(np.log(obj.T), axis=0)).T
22
+
23
+
24
+ def _alr_internal(obj):
25
+ return pd.DataFrame(np.log(obj.T/obj.T.loc[obj.columns[-1]])).T.iloc[:, :-1]
26
+
27
+
28
def _ilr_internal(obj, psi):
    ''' Isometric log-ratio: project the CLR-transformed data onto the
    orthonormal basis psi, keeping the original sample index. '''
    clr_coords = _clr_internal(obj)
    balances = np.dot(clr_coords, psi.T)
    return pd.DataFrame(balances, index=obj.index)
30
+
31
+
32
+ def _ilr_inv_internal(obj, psi):
33
+ return pd.DataFrame(np.exp(np.matmul(obj.values, psi)))
34
+
35
+
36
def init():
    ''' Register the "coda" accessor on pandas DataFrames.

    After calling init(), every DataFrame exposes compositional data
    methods through ``df.coda``. Rows are treated as samples and columns
    as the parts of the composition.
    '''
    @pd.api.extensions.register_dataframe_accessor("coda")
    class _:
        ''' A CoDa extension to pandas objects containing counts '''

        def __init__(self, pandas_obj):
            # The wrapped DataFrame (samples as rows, parts as columns).
            self._obj = pandas_obj

        def _check_for_zeros(self):
            ''' Return True (and warn) if the data contain any zeros.

            Zeros make log-ratios undefined, so callers fall back to the
            Bayesian zero-replaced point estimate (aitchison_mean). '''
            if not self._obj.values.all():
                print("Dataframe contains zeros. Using Bayesian inference to replace zeros.")
                return True
            return False

        def clr(self):
            ''' Centered log-ratio (CLR) transform of the composition. '''
            if self._check_for_zeros():
                return _clr_internal(self.aitchison_mean())

            return _clr_internal(self._obj)

        def clr_std(self, n_samples=5000):
            ''' Monte Carlo (Dirichlet) standard error of the CLR
            coordinates, estimated from n_samples posterior draws per
            sample. '''
            logratio = pd.DataFrame(index=self._obj.columns)
            for column in self._obj.T:
                # Posterior draws of the composition for this sample
                # (+0.5 prior pseudo-count on the observed counts).
                p_matrix = ss.dirichlet.rvs(self._obj.T[column]+0.5, n_samples)
                c_matrix = _clr_internal(p_matrix)
                logratio[column] = [np.std(i) for i in zip(*c_matrix)]
            return logratio.T

        def alr(self, part=None):
            ''' Additive log-ratio (ALR) transform.

            part : optional column to use as the denominator; it is
                   moved to the last column position. Defaults to the
                   current last column. '''
            if part:
                parts = self._obj.T.index.tolist()
                parts.remove(part)
                self._obj = self._obj.T.reindex(parts+[part]).T

            print("Using "+self._obj.columns[-1] + " as denominator.")
            if self._check_for_zeros():
                return _alr_internal(self.aitchison_mean())

            return _alr_internal(self._obj)

        def alr_std(self, part=None, n_samples=5000):
            ''' Monte Carlo (Dirichlet) standard error of the ALR
            coordinates.

            part : optional column to use as the denominator; it is
                   moved to the last column position, mirroring alr(). '''
            if part:
                parts = self._obj.T.index.tolist()
                parts.remove(part)
                # BUG FIX: the reindexed frame was previously discarded
                # (and rows rather than columns were reindexed), so
                # 'part' never actually became the denominator. Apply
                # the same column shuffle as alr().
                self._obj = self._obj.T.reindex(parts+[part]).T

            logratio = pd.DataFrame(index=self._obj.columns)
            for column in self._obj.T:
                p_matrix = ss.dirichlet.rvs(self._obj.T[column]+0.5, n_samples)
                # ALR against the last part for every posterior draw.
                c_matrix = [np.log(i/i[-1]) for i in p_matrix]
                logratio[column] = [np.std(i) for i in zip(*c_matrix)]
            # The last column is log(x/x) == 0 by construction; drop it.
            return logratio.T.iloc[:, :-1]

        def ilr(self, psi=None):
            ''' Isometric log-ratio (ILR) transform.

            psi : optional orthonormal basis; defaults to a sequential
                  binary partition basis built from the data. '''
            if psi is None:
                psi = extra.sbp_basis(self._obj)
            else:
                extra.check_basis(psi)

            if self._check_for_zeros():
                return _ilr_internal(self.aitchison_mean(), psi)

            return _ilr_internal(self._obj, psi)

        def ilr_inv(self, psi=None):
            ''' Inverse ILR transform back to the simplex (unclosed).

            psi : optional orthonormal basis; defaults to a sequential
                  binary partition basis built from the data. '''
            if psi is None:
                psi = extra.sbp_basis(self._obj)
            else:
                extra.check_basis(psi)

            return _ilr_inv_internal(self._obj, psi)

        def zero_replacement(self, n_samples=5000):
            ''' Replace zero values using Dirichlet-multinomial Bayesian inference.

            Returns the posterior mean composition per sample, estimated
            from n_samples Dirichlet draws. '''
            counts = pd.DataFrame(index=self._obj.columns)
            for column in self._obj.T:
                p_matrix = ss.dirichlet.rvs(self._obj.T[column]+0.5, n_samples)
                counts[column] = [np.mean(i) for i in zip(*p_matrix)]
            return counts.T

        def aitchison_mean(self):
            ''' Aitchison mean point estimate of the composition
            (digamma of counts+1, exponentiated and closed to one). '''
            return np.exp(sp.digamma(self._obj+1.0)).coda.closure(1.0)

        def closure(self, cls_const):
            ''' Re-close each row (sample) so it sums to cls_const. '''
            return cls_const*self._obj.divide(self._obj.sum(axis=1), axis=0)

        def varmatrix(self, nmp=False):
            '''
            Calculate the variation matrix of the composition:
            var(log(x_i/x_j)) over samples, for every pair of parts.

            nmp : if True return a raw numpy array, otherwise a labelled
                  DataFrame.

            TODO: for large datasets, this function blows up the memory.
            This could be overcome by using a clever running variance
            algorithm, alas I am lazy, so we estimate the variance by only
            using a maximum of 500 entries. This can still be a problem if
            dim[0] is large, so something needs to be done here. -- C.B.
            '''
            if self._check_for_zeros():
                comp = self.aitchison_mean()
            else:
                comp = self._obj

            # Quick fix: Estimate variance from at most 500 entries.
            reduc = np.array(comp)[:min(500, np.shape(comp)[0]), :]

            # Vectorized pairwise log-ratio variance over samples.
            vrmtrx = np.var(np.log(reduc[:, :, None]*1./reduc[:, None]), axis=0)
            if nmp:
                return vrmtrx
            return pd.DataFrame(vrmtrx, columns=self._obj.columns, index=self._obj.columns)

        def totvar(self):
            '''
            Calculate the total variance from the variation matrix:
            1/(2D) times the sum of all its entries, D = number of parts.
            '''
            var_matrix = self.varmatrix(True)
            return 1./(2*np.shape(var_matrix)[0]) * np.sum(var_matrix)

        def gmean(self):
            ''' Per-part geometric mean across samples, closed to 100. '''
            if self._check_for_zeros():
                gmean = ss.mstats.gmean(self.aitchison_mean())
            else:
                gmean = ss.mstats.gmean(self._obj)
            return np.array([100 * i / np.sum(gmean) for i in gmean])

        def power(self, alpha):
            ''' Compositional scalar multiplication (powering by alpha). '''
            if self._check_for_zeros():
                return pow(self.aitchison_mean(), alpha)
            return pow(self._obj, alpha)

        def perturbation(self, comp):
            ''' Compositional addition: element-wise perturbation by comp. '''
            if self._check_for_zeros():
                return self.aitchison_mean()*np.array(comp)
            return self._obj*np.array(comp)

        def scale(self):
            ''' Scale the composition by 1/sqrt of its total variance. '''
            return self.power(1./np.sqrt(self.totvar()))

        def center(self):
            ''' Center the composition by perturbing with the inverse
            geometric mean. '''
            return self.perturbation(1./self.gmean())
188
+