snplib 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snplib/__init__.py +8 -8
- snplib/finalreport/__init__.py +7 -7
- snplib/finalreport/_finalreport.py +251 -251
- snplib/format/__init__.py +19 -19
- snplib/format/__settings.py +7 -7
- snplib/format/_plink.py +291 -305
- snplib/format/_snp.py +113 -113
- snplib/parentage/__init__.py +15 -15
- snplib/parentage/_discov.py +102 -102
- snplib/parentage/_isagmark.py +15 -15
- snplib/parentage/_verif.py +91 -91
- snplib/parentage/isag_disc.pl +0 -0
- snplib/parentage/isag_verif.pl +0 -0
- snplib/statistics/__init__.py +16 -16
- snplib/statistics/_callrate.py +60 -59
- snplib/statistics/_freq.py +67 -67
- snplib/statistics/_snphwe.py +132 -132
- {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/LICENSE +674 -674
- {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/METADATA +80 -97
- snplib-1.0.9.dist-info/RECORD +22 -0
- snplib/finalreport/tests/__init__.py +0 -7
- snplib/finalreport/tests/test_finalreport.py +0 -215
- snplib/format/tests/__init__.py +0 -7
- snplib/format/tests/test_plink_fam.py +0 -121
- snplib/format/tests/test_plink_lgen.py +0 -106
- snplib/format/tests/test_plink_map.py +0 -42
- snplib/format/tests/test_plink_ped.py +0 -136
- snplib/format/tests/test_snp.py +0 -128
- snplib/parentage/tests/__init__.py +0 -7
- snplib/parentage/tests/test_discov.py +0 -164
- snplib/parentage/tests/test_verif.py +0 -160
- snplib/statistics/tests/__init__.py +0 -7
- snplib/statistics/tests/test_callrate.py +0 -171
- snplib/statistics/tests/test_freq_allele.py +0 -87
- snplib/statistics/tests/test_freq_maf.py +0 -17
- snplib/statistics/tests/test_hwe_t.py +0 -41
- snplib/statistics/tests/test_snphwe.py +0 -41
- snplib-1.0.7.dist-info/RECORD +0 -37
- {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/WHEEL +0 -0
- {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/top_level.txt +0 -0
snplib/statistics/_snphwe.py
CHANGED
@@ -1,132 +1,132 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
# coding: utf-8
|
3
|
-
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
-
|
5
|
-
import numpy as np
|
6
|
-
import pandas as pd
|
7
|
-
|
8
|
-
|
9
|
-
def hwe(
|
10
|
-
obs_hets: int | float, obs_hom1: int | float, obs_hom2: int | float
|
11
|
-
) -> float:
|
12
|
-
""" Python interpretation hwe - https://github.com/jeremymcrae/snphwe
|
13
|
-
|
14
|
-
:param obs_hets: Number of observed heterozygotes (AB, BA)
|
15
|
-
:param obs_hom1: Number of observed homozygotes1 (AA)
|
16
|
-
:param obs_hom2: Number of observed homozygotes2 (BB)
|
17
|
-
:return: This is where the p-value is returned
|
18
|
-
"""
|
19
|
-
|
20
|
-
obs_hets = round(obs_hets)
|
21
|
-
obs_hom1 = round(obs_hom1)
|
22
|
-
obs_hom2 = round(obs_hom2)
|
23
|
-
|
24
|
-
if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
|
25
|
-
raise ValueError("snphwe: negative allele count")
|
26
|
-
|
27
|
-
obs_homr = min(obs_hom1, obs_hom2)
|
28
|
-
obs_homc = max(obs_hom1, obs_hom2)
|
29
|
-
|
30
|
-
rare = 2 * obs_homr + obs_hets
|
31
|
-
genotypes = obs_hets + obs_homc + obs_homr
|
32
|
-
|
33
|
-
if genotypes == 0:
|
34
|
-
raise ValueError("snphwe: zero genotypes")
|
35
|
-
|
36
|
-
probs = np.zeros(round(rare) + 1)
|
37
|
-
|
38
|
-
# get distribution midpoint, but ensure midpoint and rare alleles have
|
39
|
-
# same parity
|
40
|
-
mid = int(rare * (2 * genotypes - rare) / (2 * genotypes))
|
41
|
-
if mid % 2 != rare % 2:
|
42
|
-
mid += 1
|
43
|
-
|
44
|
-
probs[mid] = 1.0
|
45
|
-
_sum = probs[mid]
|
46
|
-
|
47
|
-
curr_homr = (rare - mid) / 2
|
48
|
-
curr_homc = genotypes - mid - curr_homr
|
49
|
-
curr_hets = mid
|
50
|
-
while curr_hets > 1:
|
51
|
-
probs[curr_hets - 2] = (
|
52
|
-
probs[curr_hets] * curr_hets * (curr_hets - 1.0)
|
53
|
-
/ (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
|
54
|
-
)
|
55
|
-
_sum += probs[curr_hets - 2]
|
56
|
-
|
57
|
-
# fewer heterozygotes -> add one rare, one common homozygote
|
58
|
-
curr_homr += 1
|
59
|
-
curr_homc += 1
|
60
|
-
curr_hets -= 2
|
61
|
-
|
62
|
-
# calculate probabilities from midpoint up
|
63
|
-
curr_homr = (rare - mid) / 2
|
64
|
-
curr_homc = genotypes - mid - curr_homr
|
65
|
-
|
66
|
-
curr_hets = mid
|
67
|
-
while curr_hets <= rare - 2:
|
68
|
-
probs[curr_hets + 2] = \
|
69
|
-
(probs[curr_hets] * 4.0 * curr_homr * curr_homc
|
70
|
-
/ ((curr_hets + 2.0) * (curr_hets + 1.0)))
|
71
|
-
_sum += probs[curr_hets + 2]
|
72
|
-
|
73
|
-
# add 2 heterozygotes -> subtract one rare, one common homozygote
|
74
|
-
curr_homr -= 1
|
75
|
-
curr_homc -= 1
|
76
|
-
curr_hets += 2
|
77
|
-
|
78
|
-
# p-value calculation for p_hwe
|
79
|
-
target = probs[obs_hets]
|
80
|
-
p_hwe = 0.0
|
81
|
-
|
82
|
-
for p in probs:
|
83
|
-
if p <= target:
|
84
|
-
p_hwe += p / _sum
|
85
|
-
|
86
|
-
return min(1.0, p_hwe)
|
87
|
-
|
88
|
-
|
89
|
-
def hwe_test(
        seq_snp: pd.Series, freq: float, crit_chi2: float = 3.841
) -> bool:
    """ Chi-square check of a SNP against Hardy-Weinberg proportions.

    The Hardy-Weinberg equilibrium is a principle stating that the genetic
    variation in a population will remain constant from one generation to the
    next in the absence of disturbing factors.
    https://www.nature.com/scitable/definition/hardy-weinberg-equilibrium-122/

    Genotypes are read as the codes 0, 1 and 2; the code 5 is treated as a
    missing call and ignored. Expected counts are (1 - freq)^2, 2 * (1 - freq)
    * freq and freq^2 of the called genotypes for codes 0, 1 and 2.

    :param seq_snp: SNP sequence
    :param freq: Allele frequency
    :param crit_chi2: The critical value for a test ("either / or":
        observed and expected values are either one way or the other),
        therefore with degrees of freedom = df = 1 is 3.84 at p = 0.05
    :return: A decision is returned to exclude or retain the inspected snp:
        True to retain it, False when the statistic exceeds crit_chi2
    """

    _seq = seq_snp.replace(5, np.nan)

    # A monomorphic SNP, or one without any called genotype, gives nothing
    # to test and is retained.
    if _seq.nunique() <= 1:
        return True

    n_genotypes = _seq.count()

    expected = {
        0: ((1 - freq) ** 2) * n_genotypes,
        1: (2 * ((1 - freq) * freq)) * n_genotypes,
        2: (freq ** 2) * n_genotypes
    }

    chi = 0.0
    for code, exp in expected.items():
        obs = int((_seq == code).sum())

        if exp == 0:
            # A genotype class that cannot occur under `freq`: observing it
            # anyway is an unbounded deviation, not observing it adds
            # nothing. Handled here to avoid dividing by zero.
            if obs:
                return False
            continue

        chi += ((obs - exp) ** 2) / exp

    # written as a negated ">" so an undefined statistic keeps the snp
    return not chi > crit_chi2
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
|
9
|
+
def hwe(
|
10
|
+
obs_hets: int | float, obs_hom1: int | float, obs_hom2: int | float
|
11
|
+
) -> float:
|
12
|
+
""" Python interpretation hwe - https://github.com/jeremymcrae/snphwe
|
13
|
+
|
14
|
+
:param obs_hets: Number of observed heterozygotes (AB, BA)
|
15
|
+
:param obs_hom1: Number of observed homozygotes1 (AA)
|
16
|
+
:param obs_hom2: Number of observed homozygotes2 (BB)
|
17
|
+
:return: This is where the p-value is returned
|
18
|
+
"""
|
19
|
+
|
20
|
+
obs_hets = round(obs_hets)
|
21
|
+
obs_hom1 = round(obs_hom1)
|
22
|
+
obs_hom2 = round(obs_hom2)
|
23
|
+
|
24
|
+
if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
|
25
|
+
raise ValueError("snphwe: negative allele count")
|
26
|
+
|
27
|
+
obs_homr = min(obs_hom1, obs_hom2)
|
28
|
+
obs_homc = max(obs_hom1, obs_hom2)
|
29
|
+
|
30
|
+
rare = 2 * obs_homr + obs_hets
|
31
|
+
genotypes = obs_hets + obs_homc + obs_homr
|
32
|
+
|
33
|
+
if genotypes == 0:
|
34
|
+
raise ValueError("snphwe: zero genotypes")
|
35
|
+
|
36
|
+
probs = np.zeros(round(rare) + 1)
|
37
|
+
|
38
|
+
# get distribution midpoint, but ensure midpoint and rare alleles have
|
39
|
+
# same parity
|
40
|
+
mid = int(rare * (2 * genotypes - rare) / (2 * genotypes))
|
41
|
+
if mid % 2 != rare % 2:
|
42
|
+
mid += 1
|
43
|
+
|
44
|
+
probs[mid] = 1.0
|
45
|
+
_sum = probs[mid]
|
46
|
+
|
47
|
+
curr_homr = (rare - mid) / 2
|
48
|
+
curr_homc = genotypes - mid - curr_homr
|
49
|
+
curr_hets = mid
|
50
|
+
while curr_hets > 1:
|
51
|
+
probs[curr_hets - 2] = (
|
52
|
+
probs[curr_hets] * curr_hets * (curr_hets - 1.0)
|
53
|
+
/ (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
|
54
|
+
)
|
55
|
+
_sum += probs[curr_hets - 2]
|
56
|
+
|
57
|
+
# fewer heterozygotes -> add one rare, one common homozygote
|
58
|
+
curr_homr += 1
|
59
|
+
curr_homc += 1
|
60
|
+
curr_hets -= 2
|
61
|
+
|
62
|
+
# calculate probabilities from midpoint up
|
63
|
+
curr_homr = (rare - mid) / 2
|
64
|
+
curr_homc = genotypes - mid - curr_homr
|
65
|
+
|
66
|
+
curr_hets = mid
|
67
|
+
while curr_hets <= rare - 2:
|
68
|
+
probs[curr_hets + 2] = \
|
69
|
+
(probs[curr_hets] * 4.0 * curr_homr * curr_homc
|
70
|
+
/ ((curr_hets + 2.0) * (curr_hets + 1.0)))
|
71
|
+
_sum += probs[curr_hets + 2]
|
72
|
+
|
73
|
+
# add 2 heterozygotes -> subtract one rare, one common homozygote
|
74
|
+
curr_homr -= 1
|
75
|
+
curr_homc -= 1
|
76
|
+
curr_hets += 2
|
77
|
+
|
78
|
+
# p-value calculation for p_hwe
|
79
|
+
target = probs[obs_hets]
|
80
|
+
p_hwe = 0.0
|
81
|
+
|
82
|
+
for p in probs:
|
83
|
+
if p <= target:
|
84
|
+
p_hwe += p / _sum
|
85
|
+
|
86
|
+
return min(1.0, p_hwe)
|
87
|
+
|
88
|
+
|
89
|
+
def hwe_test(
        seq_snp: pd.Series, freq: float, crit_chi2: float = 3.841
) -> bool:
    """ Chi-square check of a SNP against Hardy-Weinberg proportions.

    The Hardy-Weinberg equilibrium is a principle stating that the genetic
    variation in a population will remain constant from one generation to the
    next in the absence of disturbing factors.
    https://www.nature.com/scitable/definition/hardy-weinberg-equilibrium-122/

    Genotypes are read as the codes 0, 1 and 2; the code 5 is treated as a
    missing call and ignored. Expected counts are (1 - freq)^2, 2 * (1 - freq)
    * freq and freq^2 of the called genotypes for codes 0, 1 and 2.

    :param seq_snp: SNP sequence
    :param freq: Allele frequency
    :param crit_chi2: The critical value for a test ("either / or":
        observed and expected values are either one way or the other),
        therefore with degrees of freedom = df = 1 is 3.84 at p = 0.05
    :return: A decision is returned to exclude or retain the inspected snp:
        True to retain it, False when the statistic exceeds crit_chi2
    """

    _seq = seq_snp.replace(5, np.nan)

    # A monomorphic SNP, or one without any called genotype, gives nothing
    # to test and is retained.
    if _seq.nunique() <= 1:
        return True

    n_genotypes = _seq.count()

    expected = {
        0: ((1 - freq) ** 2) * n_genotypes,
        1: (2 * ((1 - freq) * freq)) * n_genotypes,
        2: (freq ** 2) * n_genotypes
    }

    chi = 0.0
    for code, exp in expected.items():
        obs = int((_seq == code).sum())

        if exp == 0:
            # A genotype class that cannot occur under `freq`: observing it
            # anyway is an unbounded deviation, not observing it adds
            # nothing. Handled here to avoid dividing by zero.
            if obs:
                return False
            continue

        chi += ((obs - exp) ** 2) / exp

    # written as a negated ">" so an undefined statistic keeps the snp
    return not chi > crit_chi2
|