snplib 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. snplib/__init__.py +8 -8
  2. snplib/finalreport/__init__.py +7 -7
  3. snplib/finalreport/_finalreport.py +251 -251
  4. snplib/format/__init__.py +19 -19
  5. snplib/format/__settings.py +7 -7
  6. snplib/format/_plink.py +291 -305
  7. snplib/format/_snp.py +113 -113
  8. snplib/parentage/__init__.py +15 -15
  9. snplib/parentage/_discov.py +102 -102
  10. snplib/parentage/_isagmark.py +15 -15
  11. snplib/parentage/_verif.py +91 -91
  12. snplib/parentage/isag_disc.pl +0 -0
  13. snplib/parentage/isag_verif.pl +0 -0
  14. snplib/statistics/__init__.py +16 -16
  15. snplib/statistics/_callrate.py +60 -59
  16. snplib/statistics/_freq.py +67 -67
  17. snplib/statistics/_snphwe.py +132 -132
  18. {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/LICENSE +674 -674
  19. {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/METADATA +80 -97
  20. snplib-1.0.9.dist-info/RECORD +22 -0
  21. snplib/finalreport/tests/__init__.py +0 -7
  22. snplib/finalreport/tests/test_finalreport.py +0 -215
  23. snplib/format/tests/__init__.py +0 -7
  24. snplib/format/tests/test_plink_fam.py +0 -121
  25. snplib/format/tests/test_plink_lgen.py +0 -106
  26. snplib/format/tests/test_plink_map.py +0 -42
  27. snplib/format/tests/test_plink_ped.py +0 -136
  28. snplib/format/tests/test_snp.py +0 -128
  29. snplib/parentage/tests/__init__.py +0 -7
  30. snplib/parentage/tests/test_discov.py +0 -164
  31. snplib/parentage/tests/test_verif.py +0 -160
  32. snplib/statistics/tests/__init__.py +0 -7
  33. snplib/statistics/tests/test_callrate.py +0 -171
  34. snplib/statistics/tests/test_freq_allele.py +0 -87
  35. snplib/statistics/tests/test_freq_maf.py +0 -17
  36. snplib/statistics/tests/test_hwe_t.py +0 -41
  37. snplib/statistics/tests/test_snphwe.py +0 -41
  38. snplib-1.0.7.dist-info/RECORD +0 -37
  39. {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/WHEEL +0 -0
  40. {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/top_level.txt +0 -0
@@ -1,132 +1,132 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- import numpy as np
6
- import pandas as pd
7
-
8
-
9
- def hwe(
10
- obs_hets: int | float, obs_hom1: int | float, obs_hom2: int | float
11
- ) -> float:
12
- """ Python interpretation hwe - https://github.com/jeremymcrae/snphwe
13
-
14
- :param obs_hets: Number of observed heterozygotes (AB, BA)
15
- :param obs_hom1: Number of observed homozygotes1 (AA)
16
- :param obs_hom2: Number of observed homozygotes2 (BB)
17
- :return: This is where the p-value is returned
18
- """
19
-
20
- obs_hets = round(obs_hets)
21
- obs_hom1 = round(obs_hom1)
22
- obs_hom2 = round(obs_hom2)
23
-
24
- if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
25
- raise ValueError("snphwe: negative allele count")
26
-
27
- obs_homr = min(obs_hom1, obs_hom2)
28
- obs_homc = max(obs_hom1, obs_hom2)
29
-
30
- rare = 2 * obs_homr + obs_hets
31
- genotypes = obs_hets + obs_homc + obs_homr
32
-
33
- if genotypes == 0:
34
- raise ValueError("snphwe: zero genotypes")
35
-
36
- probs = np.zeros(round(rare) + 1)
37
-
38
- # get distribution midpoint, but ensure midpoint and rare alleles have
39
- # same parity
40
- mid = int(rare * (2 * genotypes - rare) / (2 * genotypes))
41
- if mid % 2 != rare % 2:
42
- mid += 1
43
-
44
- probs[mid] = 1.0
45
- _sum = probs[mid]
46
-
47
- curr_homr = (rare - mid) / 2
48
- curr_homc = genotypes - mid - curr_homr
49
- curr_hets = mid
50
- while curr_hets > 1:
51
- probs[curr_hets - 2] = (
52
- probs[curr_hets] * curr_hets * (curr_hets - 1.0)
53
- / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
54
- )
55
- _sum += probs[curr_hets - 2]
56
-
57
- # fewer heterozygotes -> add one rare, one common homozygote
58
- curr_homr += 1
59
- curr_homc += 1
60
- curr_hets -= 2
61
-
62
- # calculate probabilities from midpoint up
63
- curr_homr = (rare - mid) / 2
64
- curr_homc = genotypes - mid - curr_homr
65
-
66
- curr_hets = mid
67
- while curr_hets <= rare - 2:
68
- probs[curr_hets + 2] = \
69
- (probs[curr_hets] * 4.0 * curr_homr * curr_homc
70
- / ((curr_hets + 2.0) * (curr_hets + 1.0)))
71
- _sum += probs[curr_hets + 2]
72
-
73
- # add 2 heterozygotes -> subtract one rare, one common homozygote
74
- curr_homr -= 1
75
- curr_homc -= 1
76
- curr_hets += 2
77
-
78
- # p-value calculation for p_hwe
79
- target = probs[obs_hets]
80
- p_hwe = 0.0
81
-
82
- for p in probs:
83
- if p <= target:
84
- p_hwe += p / _sum
85
-
86
- return min(1.0, p_hwe)
87
-
88
-
89
- def hwe_test(
90
- seq_snp: pd.Series, freq: float, crit_chi2: float = 3.841
91
- ) -> bool:
92
- """ The Hardy-Weinberg equilibrium is a principle stating that the genetic
93
- variation in a population will remain constant from one generation to the
94
- next in the absence of disturbing factors.
95
- https://www.nature.com/scitable/definition/hardy-weinberg-equilibrium-122/
96
-
97
- :param seq_snp: SNP sequence
98
- :param freq: Allele frequency
99
- :param crit_chi2: The critical value for a test ("either / or":
100
- observed and expected values are either one way or the other),
101
- therefore with degrees of freedom = df = 1 is 3.84 at p = 0.05
102
- :return: A decision is returned to exclude or retain the inspected snp
103
- """
104
-
105
- _seq = seq_snp.replace(5, np.nan)
106
-
107
- if _seq.nunique() == 1:
108
- return True
109
-
110
- n_genotypes = _seq.count()
111
-
112
- observed = {
113
- 0: (_seq == 0).sum(),
114
- 1: (_seq == 1).sum(),
115
- 2: (_seq == 2).sum()
116
- }
117
-
118
- expected = {
119
- 0: ((1 - freq) ** 2) * n_genotypes,
120
- 1: (2 * ((1 - freq) * freq)) * n_genotypes,
121
- 2: (freq ** 2) * n_genotypes
122
- }
123
-
124
- chi = sum([
125
- ((obs - exp) ** 2) / exp
126
- for obs, exp in zip(observed.values(), expected.values())
127
- ])
128
-
129
- if chi > crit_chi2:
130
- return False
131
- else:
132
- return True
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ def hwe(
10
+ obs_hets: int | float, obs_hom1: int | float, obs_hom2: int | float
11
+ ) -> float:
12
+ """ Python interpretation hwe - https://github.com/jeremymcrae/snphwe
13
+
14
+ :param obs_hets: Number of observed heterozygotes (AB, BA)
15
+ :param obs_hom1: Number of observed homozygotes1 (AA)
16
+ :param obs_hom2: Number of observed homozygotes2 (BB)
17
+ :return: This is where the p-value is returned
18
+ """
19
+
20
+ obs_hets = round(obs_hets)
21
+ obs_hom1 = round(obs_hom1)
22
+ obs_hom2 = round(obs_hom2)
23
+
24
+ if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
25
+ raise ValueError("snphwe: negative allele count")
26
+
27
+ obs_homr = min(obs_hom1, obs_hom2)
28
+ obs_homc = max(obs_hom1, obs_hom2)
29
+
30
+ rare = 2 * obs_homr + obs_hets
31
+ genotypes = obs_hets + obs_homc + obs_homr
32
+
33
+ if genotypes == 0:
34
+ raise ValueError("snphwe: zero genotypes")
35
+
36
+ probs = np.zeros(round(rare) + 1)
37
+
38
+ # get distribution midpoint, but ensure midpoint and rare alleles have
39
+ # same parity
40
+ mid = int(rare * (2 * genotypes - rare) / (2 * genotypes))
41
+ if mid % 2 != rare % 2:
42
+ mid += 1
43
+
44
+ probs[mid] = 1.0
45
+ _sum = probs[mid]
46
+
47
+ curr_homr = (rare - mid) / 2
48
+ curr_homc = genotypes - mid - curr_homr
49
+ curr_hets = mid
50
+ while curr_hets > 1:
51
+ probs[curr_hets - 2] = (
52
+ probs[curr_hets] * curr_hets * (curr_hets - 1.0)
53
+ / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
54
+ )
55
+ _sum += probs[curr_hets - 2]
56
+
57
+ # fewer heterozygotes -> add one rare, one common homozygote
58
+ curr_homr += 1
59
+ curr_homc += 1
60
+ curr_hets -= 2
61
+
62
+ # calculate probabilities from midpoint up
63
+ curr_homr = (rare - mid) / 2
64
+ curr_homc = genotypes - mid - curr_homr
65
+
66
+ curr_hets = mid
67
+ while curr_hets <= rare - 2:
68
+ probs[curr_hets + 2] = \
69
+ (probs[curr_hets] * 4.0 * curr_homr * curr_homc
70
+ / ((curr_hets + 2.0) * (curr_hets + 1.0)))
71
+ _sum += probs[curr_hets + 2]
72
+
73
+ # add 2 heterozygotes -> subtract one rare, one common homozygote
74
+ curr_homr -= 1
75
+ curr_homc -= 1
76
+ curr_hets += 2
77
+
78
+ # p-value calculation for p_hwe
79
+ target = probs[obs_hets]
80
+ p_hwe = 0.0
81
+
82
+ for p in probs:
83
+ if p <= target:
84
+ p_hwe += p / _sum
85
+
86
+ return min(1.0, p_hwe)
87
+
88
+
89
def hwe_test(
        seq_snp: pd.Series, freq: float, crit_chi2: float = 3.841
) -> bool:
    """ Chi-square check of one SNP against Hardy-Weinberg equilibrium.

    The Hardy-Weinberg equilibrium is a principle stating that the genetic
    variation in a population will remain constant from one generation to
    the next in the absence of disturbing factors.
    https://www.nature.com/scitable/definition/hardy-weinberg-equilibrium-122/

    Genotype code 5 is treated as missing. The observed numbers of codes
    0, 1 and 2 are compared with the numbers expected from ``freq``:
    ``(1 - freq)^2 * n``, ``2 * freq * (1 - freq) * n`` and ``freq^2 * n``,
    where ``n`` is the number of non-missing genotypes.

    A genotype class whose expected count is zero is not divided by:
    it contributes nothing when it is also unobserved, and the SNP is
    rejected straight away when genotypes are observed in it (the deviation
    is unbounded). This keeps the statistic free of NaN / inf values.

    :param seq_snp: SNP sequence
    :param freq: Allele frequency
    :param crit_chi2: The critical value for a test ("either / or":
        observed and expected values are either one way or the other),
        therefore with degrees of freedom = df = 1 is 3.84 at p = 0.05
    :return: A decision is returned to exclude or retain the inspected snp:
        True - retain (statistic does not exceed ``crit_chi2``),
        False - exclude
    """

    _seq = seq_snp.replace(5, np.nan)

    # only one distinct called genotype - retained without testing
    if _seq.nunique() == 1:
        return True

    n_genotypes = _seq.count()

    observed = [(_seq == code).sum() for code in (0, 1, 2)]
    expected = [
        ((1 - freq) ** 2) * n_genotypes,
        (2 * ((1 - freq) * freq)) * n_genotypes,
        (freq ** 2) * n_genotypes,
    ]

    chi = 0.0
    for obs, exp in zip(observed, expected):
        if exp == 0:
            # dividing by an empty expected class would give NaN (0 / 0) or
            # inf (obs / 0) and make the final comparison meaningless
            if obs != 0:
                return False
            continue

        chi += ((obs - exp) ** 2) / exp

    return not chi > crit_chi2