lyubishchev 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.4
2
+ Name: lyubishchev
3
+ Version: 0.2.0
4
+ Summary: Quantitative taxonomy methods of A.A. Lyubishchev (1943) — continuous multivariate classification for biological systematics.
5
+ Author-email: Akzhan Berdeyev <akzhan.berdeyev@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://baddogdata.com
8
+ Project-URL: Repository, https://github.com/AkzhanBerdi/lyubishchev
9
+ Project-URL: Bug Tracker, https://github.com/AkzhanBerdi/lyubishchev/issues
10
+ Project-URL: Primary Source, http://www.zin.ru/animalia/coleoptera/rus/lyubis05.htm
11
+ Keywords: taxonomy,biometrics,classification,systematics,unsupervised learning,multivariate statistics,discriminant analysis,numerical taxonomy
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
15
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: numpy>=1.24
25
+ Requires-Dist: scipy>=1.10
26
+ Requires-Dist: scikit-learn>=1.2
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0; extra == "dev"
29
+ Requires-Dist: pytest-cov; extra == "dev"
30
+ Provides-Extra: plot
31
+ Requires-Dist: matplotlib>=3.6; extra == "plot"
32
+
33
+ @misc{lyubishchev1943,
34
+ author = {Lyubishchev, Alexander Alexandrovich},
35
+ title = {Programma obshchey sistematiki
36
+ [Program of General Systematics]},
37
+ year = {1943},
38
+ note = {Manuscript, 22 November 1943.
39
+ Digitized by ZIN RAS Coleoptera Laboratory.
40
+ Available at: http://www.zin.ru/animalia/coleoptera/rus/lyubis05.htm}
41
+ }
42
+
43
+ @article{lubischew1962,
44
+ author = {Lubischew, A.A.},
45
+ title = {On the use of discriminant functions in taxonomy},
46
+ journal = {Biometrics},
47
+ year = {1962},
48
+ volume = {18},
49
+ number = {4},
50
+ pages = {455--477},
51
+ }
52
+
53
+ @software{lyubishchev_python,
54
+ author = {Berdeyev, Akzhan},
55
+ title = {lyubishchev: Quantitative taxonomy methods of A.A. Lyubishchev},
56
+ year = {2026},
57
+ url = {https://github.com/AkzhanBerdi/lyubishchev},
58
+ }
@@ -0,0 +1,26 @@
1
+ @misc{lyubishchev1943,
2
+ author = {Lyubishchev, Alexander Alexandrovich},
3
+ title = {Programma obshchey sistematiki
4
+ [Program of General Systematics]},
5
+ year = {1943},
6
+ note = {Manuscript, 22 November 1943.
7
+ Digitized by ZIN RAS Coleoptera Laboratory.
8
+ Available at: http://www.zin.ru/animalia/coleoptera/rus/lyubis05.htm}
9
+ }
10
+
11
+ @article{lubischew1962,
12
+ author = {Lubischew, A.A.},
13
+ title = {On the use of discriminant functions in taxonomy},
14
+ journal = {Biometrics},
15
+ year = {1962},
16
+ volume = {18},
17
+ number = {4},
18
+ pages = {455--477},
19
+ }
20
+
21
+ @software{lyubishchev_python,
22
+ author = {Berdeyev, Akzhan},
23
+ title = {lyubishchev: Quantitative taxonomy methods of A.A. Lyubishchev},
24
+ year = {2026},
25
+ url = {https://github.com/AkzhanBerdi/lyubishchev},
26
+ }
@@ -0,0 +1,43 @@
1
+ """
2
+ lyubishchev
3
+ ~~~~~~~~~~~
4
+
5
+ Quantitative taxonomy methods of Alexander Alexandrovich Lyubishchev
6
+ (1890–1972), implemented for the modern Python scientific stack.
7
+
8
+ Lyubishchev described multivariate classification by covariance
9
+ structure in his 1943 manuscript *Programma obshchey sistematiki*
10
+ (Program of General Systematics) — twenty years before Sokal &
11
+ Sneath's *Principles of Numerical Taxonomy* (1963), whose binary
12
+ similarity coefficients are memorialized in scipy.spatial.distance
13
+ as ``sokalsneath`` and ``sokalmichener``. This package puts
14
+ Lyubishchev's name into the same ecosystem.
15
+
16
+ Primary source:
17
+ Lyubishchev, A.A. (1943). Programma obshchey sistematiki.
18
+ Manuscript, 22 November 1943. Digitized by ZIN RAS Coleoptera
19
+ Laboratory. http://www.zin.ru/animalia/coleoptera/rus/lyubis05.htm
20
+
21
+ Western publication:
22
+ Lubischew, A.A. (1962). On the use of discriminant functions in
23
+ taxonomy. Biometrics, 18(4), 455–477.
24
+ """
25
+
26
+ from lyubishchev.core import (
27
+ classify,
28
+ divergence_coefficient,
29
+ scatter_ellipse,
30
+ transgression,
31
+ )
32
+ from lyubishchev.estimator import LyubishchevClassifier
33
+
34
# Release identifier exposed as ``lyubishchev.__version__``.
__version__ = "0.2.0"

# Names exported by ``from lyubishchev import *``: the four functional
# helpers, the estimator class, and the version string.
__all__ = [
    "divergence_coefficient", "scatter_ellipse", "transgression", "classify",
    "LyubishchevClassifier",
    "__version__",
]
@@ -0,0 +1,291 @@
1
+ """
2
+ lyubishchev.core
3
+ ~~~~~~~~~~~~~~~~
4
+
5
+ Implementations of Alexander Alexandrovich Lyubishchev's (1890–1972)
6
+ quantitative taxonomy methods, as described in his 1943 manuscript:
7
+
8
+ Lyubishchev, A.A. (1943). Programma obshchey sistematiki
9
+ [Program of General Systematics]. Manuscript, 22 November 1943.
10
+ Digitized by ZIN RAS Coleoptera Laboratory.
11
+ http://www.zin.ru/animalia/coleoptera/rus/lyubis05.htm
12
+
13
+ These methods predate and are mathematically more general than the
14
+ binary-character similarity coefficients of Sokal & Sneath (1963),
15
+ which are memorialized in scipy.spatial.distance as sokalsneath and
16
+ sokalmichener. Lyubishchev worked with continuous measurements and
17
+ full covariance structure — the formulation now standard in
18
+ multivariate statistics and machine learning.
19
+ """
20
+
21
+ import numpy as np
22
+ from scipy.spatial.distance import mahalanobis
23
+ from scipy.stats import chi2
24
+
25
+
26
def divergence_coefficient(a, b):
    """
    Compute Lyubishchev's divergence coefficient D between two groups.

    For a single character the coefficient is::

        D = (M1 - M2)**2 / (s1**2 + s2**2)

    where M1, M2 are the group means and s1**2, s2**2 the group sample
    variances (``ddof=1``). For multivariate data, D is computed per
    feature and summed. Features whose pooled variance is not strictly
    positive (constant or undefined) are left out of the sum.

    When D is large, the groups are cleanly separated in measurement
    space. When D is small, the groups show transgression.

    Parameters
    ----------
    a : array-like, shape (n_samples,) or (n_samples, n_features)
        Measurements for group A (e.g. one species). A 1-D input is
        read as one feature measured on ``n_samples`` specimens; a 2-D
        input is always read as rows = specimens, columns = features,
        even when it has a single row.
    b : array-like, shape (n_samples,) or (n_samples, n_features)
        Measurements for group B (e.g. another species).

    Returns
    -------
    D : float
        Divergence coefficient. Values above 1.0 indicate clean
        separation. Values below 0.5 indicate strong transgression.
        ``0.0`` is returned when no feature has a positive pooled
        variance, or when either group has fewer than two specimens
        (the sample variance is undefined in that case).

    Raises
    ------
    ValueError
        If the two groups do not have the same number of features.

    Examples
    --------
    >>> from lyubishchev import divergence_coefficient
    >>> divergence_coefficient([1.0, 2.0, 3.0], [4.0, 5.0, 6.0])
    4.5
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # Decide the orientation from the number of dimensions, not from the
    # shape: a (1, k) array is one specimen with k features and must not be
    # flipped into k specimens of a single feature.
    if a.ndim < 2:
        a = a.reshape(-1, 1)
    if b.ndim < 2:
        b = b.reshape(-1, 1)

    # Refuse silent broadcasting between groups measured on different
    # feature sets.
    if a.shape[1:] != b.shape[1:]:
        raise ValueError(
            "a and b must have the same number of features, got "
            f"{a.shape[1:]} and {b.shape[1:]}"
        )

    # With fewer than two specimens the ddof=1 variance is undefined (NaN);
    # return the degenerate value directly instead of triggering NumPy
    # warnings on the way to the same result.
    if a.shape[0] < 2 or b.shape[0] < 2:
        return 0.0

    mean_gap = a.mean(axis=0) - b.mean(axis=0)
    pooled_var = a.var(axis=0, ddof=1) + b.var(axis=0, ddof=1)

    # Avoid division by zero for constant features (NaN also compares False).
    usable = pooled_var > 0
    if not usable.any():
        return 0.0

    return float(np.sum(mean_gap[usable] ** 2 / pooled_var[usable]))
84
+
85
+
86
def scatter_ellipse(X, y):
    """
    Fit one covariance ellipse per class, as Lyubishchev did graphically
    in his 1943 manuscript (Fig. 1).

    Each class is summarised by its centroid and sample covariance
    matrix, which together define an ellipse of equal probability
    density in measurement space.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features) or (n_samples,)
        Measurement matrix. Each row is a specimen, each column a
        morphological measurement. A 1-D input is treated as a single
        measurement per specimen.
    y : array-like, shape (n_samples,)
        Class labels (taxon names or integer codes).

    Returns
    -------
    ellipses : dict
        Keys are the unique class labels. Values are dicts with:
            'mean' : ndarray, shape (n_features,)
            'cov' : ndarray, shape (n_features, n_features)
                Sample covariance of the class. Always 2-D, including
                for single-feature data. A class with a single specimen
                gets the identity matrix as a placeholder.
            'n_samples' : int

    Examples
    --------
    >>> from lyubishchev import scatter_ellipse
    >>> X = [[1.0, 2.0], [2.0, 4.0], [3.0, 3.0],
    ...      [7.0, 8.0], [8.0, 9.0], [9.0, 7.0]]
    >>> y = ['oleracea'] * 3 + ['carduorum'] * 3
    >>> ellipses = scatter_ellipse(X, y)
    >>> ellipses['oleracea']['mean'].tolist()
    [2.0, 3.0]
    >>> ellipses['oleracea']['n_samples']
    3
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        # One measurement per specimen: make it an explicit column.
        X = X.reshape(-1, 1)
    y = np.asarray(y)
    n_features = X.shape[1]

    ellipses = {}
    for cls in np.unique(y):
        X_cls = X[y == cls]
        n_cls = X_cls.shape[0]
        if n_cls > 1:
            # np.cov squeezes its result, so a single feature comes back as
            # a 0-d array; restore the documented (n_features, n_features)
            # shape so the matrix can be inverted downstream.
            cov = np.atleast_2d(np.cov(X_cls, rowvar=False))
        else:
            # A single specimen has no spread; use identity as a placeholder.
            cov = np.eye(n_features)
        ellipses[cls] = {
            'mean': X_cls.mean(axis=0),
            'cov': cov,
            'n_samples': int(n_cls),
        }
    return ellipses
140
+
141
+
142
def transgression(ellipses, class_a, class_b, confidence=0.95):
    """
    Estimate the transgression (overlap) between two scatter ellipses.

    The Mahalanobis distance between the two class centroids is computed
    under a pooled covariance matrix and compared with the square root
    of the chi-squared quantile for ``confidence`` with ``n_features``
    degrees of freedom.

    Parameters
    ----------
    ellipses : dict
        Output of scatter_ellipse(): per-class dicts with 'mean', 'cov'
        and 'n_samples'. Scalar or 0-d covariances (single-feature
        models) are accepted and treated as 1x1 matrices.
    class_a : label
        First class label.
    class_b : label
        Second class label.
    confidence : float, default 0.95
        Confidence level for the ellipse boundary.

    Returns
    -------
    result : dict
        'mahalanobis_distance' : float
            Distance between centroids in Mahalanobis units. If the
            pooled covariance cannot be inverted, the Euclidean
            distance between the centroids is reported instead.
        'threshold' : float
            Square root of the chi-squared quantile at the given
            confidence level.
        'transgression' : bool
            True if the centroid distance is below the threshold.
        'separation_ratio' : float
            mahalanobis_distance / threshold (0.0 when the threshold is
            not positive). Values > 1.0 mean clean separation. Values
            < 1.0 mean transgression.

    Raises
    ------
    KeyError
        If ``class_a`` or ``class_b`` is not a key of ``ellipses``.

    Examples
    --------
    >>> import numpy as np
    >>> from lyubishchev import transgression
    >>> ellipses = {
    ...     'oleracea': {'mean': np.array([0.0, 0.0]),
    ...                  'cov': np.eye(2), 'n_samples': 20},
    ...     'carduorum': {'mean': np.array([3.0, 4.0]),
    ...                   'cov': np.eye(2), 'n_samples': 20},
    ... }
    >>> result = transgression(ellipses, 'oleracea', 'carduorum')
    >>> result['mahalanobis_distance']
    5.0
    >>> result['transgression']
    False
    """
    ea = ellipses[class_a]
    eb = ellipses[class_b]

    # Normalise shapes so single-feature models (scalar mean, 0-d
    # covariance) use the Mahalanobis path instead of failing inside
    # np.linalg.inv and silently dropping to the Euclidean fallback.
    mean_a = np.atleast_1d(np.asarray(ea['mean'], dtype=float))
    mean_b = np.atleast_1d(np.asarray(eb['mean'], dtype=float))
    cov_a = np.atleast_2d(np.asarray(ea['cov'], dtype=float))
    cov_b = np.atleast_2d(np.asarray(eb['cov'], dtype=float))

    n_features = mean_a.shape[0]
    threshold = np.sqrt(chi2.ppf(confidence, df=n_features))

    # Pooled covariance, weighted by the class sample counts.
    # NOTE(review): unbiased covariance estimates are usually pooled by
    # degrees of freedom (n - 1); the count weighting is kept so results do
    # not change -- confirm which one is intended.
    na, nb = ea['n_samples'], eb['n_samples']
    pooled_cov = (na * cov_a + nb * cov_b) / (na + nb)

    try:
        dist = mahalanobis(mean_a, mean_b, np.linalg.inv(pooled_cov))
    except np.linalg.LinAlgError:
        # Singular pooled covariance: fall back to plain Euclidean distance.
        dist = np.linalg.norm(mean_a - mean_b)

    return {
        'mahalanobis_distance': float(dist),
        'threshold': float(threshold),
        'transgression': bool(dist < threshold),
        'separation_ratio': float(dist / threshold) if threshold > 0 else 0.0,
    }
203
+
204
+
205
def classify(specimen, ellipses):
    """
    Classify a specimen with per-class multivariate normal densities
    (the Edgeworth-Pearson multivariate probability function referred
    to by Lyubishchev in his 1943 manuscript, Fig. 3).

    Given a specimen's measurements and a set of reference groups with
    known means and covariance matrices, return the posterior
    probability of belonging to each group, assuming equal priors.

    Parameters
    ----------
    specimen : array-like, shape (n_features,)
        Measurements of the specimen to classify.
    ellipses : dict
        Output of scatter_ellipse(). Each entry must have 'mean' and
        'cov'. Scalar or 0-d covariances (single-feature models) are
        accepted and treated as 1x1 matrices.

    Returns
    -------
    result : dict
        Keys are class labels. Values are dicts with:
            'mahalanobis_distance' : float
                Distance from specimen to class centroid.
            'log_likelihood' : float
            'posterior' : float
                Posterior probability (sums to 1.0 across classes).

    Raises
    ------
    ValueError
        If ``ellipses`` is empty.

    Notes
    -----
    If a class covariance is singular or has a non-positive determinant,
    the identity matrix is used in its place (log-determinant 0), so the
    distance for that class is Euclidean.

    Examples
    --------
    >>> import numpy as np
    >>> from lyubishchev import classify
    >>> ellipses = {
    ...     'oleracea': {'mean': np.array([0.0, 0.0]),
    ...                  'cov': np.eye(2), 'n_samples': 20},
    ...     'carduorum': {'mean': np.array([4.0, 4.0]),
    ...                   'cov': np.eye(2), 'n_samples': 20},
    ... }
    >>> result = classify([3.8, 3.9], ellipses)
    >>> max(result, key=lambda k: result[k]['posterior'])
    'carduorum'
    """
    if not ellipses:
        raise ValueError("ellipses must contain at least one class")

    specimen = np.atleast_1d(np.asarray(specimen, dtype=float))
    scored = {}

    for cls, params in ellipses.items():
        # Normalise shapes so a single-feature model (0-d covariance) is
        # inverted properly instead of tripping the identity fallback.
        mean = np.atleast_1d(np.asarray(params['mean'], dtype=float))
        cov = np.atleast_2d(np.asarray(params['cov'], dtype=float))
        k = mean.shape[0]

        try:
            cov_inv = np.linalg.inv(cov)
            sign, log_det = np.linalg.slogdet(cov)
            if sign <= 0:
                raise np.linalg.LinAlgError("Non-positive definite covariance")
        except np.linalg.LinAlgError:
            # Unusable covariance: degrade to a unit-variance spherical model.
            cov_inv = np.eye(k)
            log_det = 0.0

        diff = specimen - mean
        # Squared Mahalanobis distance, shared by the distance and the
        # Gaussian log-density below.
        quad = diff @ cov_inv @ diff
        log_ll = -0.5 * (k * np.log(2 * np.pi) + log_det + quad)

        scored[cls] = {
            'mahalanobis_distance': float(np.sqrt(quad)),
            'log_likelihood': float(log_ll),
        }

    # Softmax over the log-likelihoods; subtracting the maximum first keeps
    # the exponentials from underflowing to zero for every class.
    peak = max(entry['log_likelihood'] for entry in scored.values())
    weights = {
        cls: np.exp(entry['log_likelihood'] - peak)
        for cls, entry in scored.items()
    }
    total = sum(weights.values())

    return {
        cls: {**entry, 'posterior': float(weights[cls] / total)}
        for cls, entry in scored.items()
    }