snplib 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- finalreport/__init__.py +7 -0
- finalreport/_finalreport.py +251 -0
- finalreport/tests/__init__.py +7 -0
- finalreport/tests/test_finalreport.py +215 -0
- format/__init__.py +19 -0
- format/__settings.py +7 -0
- format/_plink.py +305 -0
- format/_snp.py +113 -0
- format/tests/__init__.py +7 -0
- format/tests/test_plink_fam.py +121 -0
- format/tests/test_plink_lgen.py +106 -0
- format/tests/test_plink_map.py +42 -0
- format/tests/test_plink_ped.py +136 -0
- format/tests/test_snp.py +128 -0
- parentage/__init__.py +15 -0
- parentage/_discov.py +102 -0
- parentage/_isagmark.py +15 -0
- parentage/_verif.py +91 -0
- parentage/tests/__init__.py +7 -0
- parentage/tests/test_discov.py +164 -0
- parentage/tests/test_verif.py +160 -0
- snplib-1.0.0.dist-info/LICENSE +674 -0
- snplib-1.0.0.dist-info/METADATA +89 -0
- snplib-1.0.0.dist-info/RECORD +36 -0
- snplib-1.0.0.dist-info/WHEEL +5 -0
- snplib-1.0.0.dist-info/top_level.txt +4 -0
- statistics/__init__.py +16 -0
- statistics/_callrate.py +59 -0
- statistics/_freq.py +67 -0
- statistics/_snphwe.py +132 -0
- statistics/tests/__init__.py +7 -0
- statistics/tests/test_callrate.py +171 -0
- statistics/tests/test_freq_allele.py +87 -0
- statistics/tests/test_freq_maf.py +17 -0
- statistics/tests/test_hwe_t.py +41 -0
- statistics/tests/test_snphwe.py +41 -0
@@ -0,0 +1,89 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: snplib
|
3
|
+
Version: 1.0.0
|
4
|
+
Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
|
5
|
+
Author-email: Igor <igor.loschinin@gmail.com>
|
6
|
+
License: GNU
|
7
|
+
Project-URL: Homepage, https://github.com/IgorekLoschinin/snptools
|
8
|
+
Requires-Python: >=3.10
|
9
|
+
Description-Content-Type: text/markdown
|
10
|
+
License-File: LICENSE
|
11
|
+
Requires-Dist: numpy==1.26.1
|
12
|
+
Requires-Dist: pandas==2.1.1
|
13
|
+
Requires-Dist: six==1.16.0
|
14
|
+
Requires-Dist: swifter==1.4.0
|
15
|
+
Requires-Dist: xlrd==2.0.1
|
16
|
+
Requires-Dist: XlsxWriter==3.1.9
|
17
|
+
Requires-Dist: openpyxl==3.1.2
|
18
|
+
Requires-Dist: pydantic==2.4.2
|
19
|
+
Requires-Dist: pytest==7.4.2
|
20
|
+
Requires-Dist: sphinx==7.2.6
|
21
|
+
Requires-Dist: sphinx_rtd_theme==1.3.0
|
22
|
+
|
23
|
+
# snptools
|
24
|
+
|
25
|
+
**Snptools** is a tool for SNP (Single Nucleotide Polymorphism) data processing,
|
26
|
+
parentage calculation and call rate estimation.
|
27
|
+
|
28
|
+
## Introduction
|
29
|
+
|
30
|
+
SNPs (Single Nucleotide Polymorphisms) represent genetic variations that can
|
31
|
+
be used to analyze genetic data. SNPTools provides a set of tools for working
|
32
|
+
with SNP data, including the following capabilities:
|
33
|
+
|
34
|
+
- SNP data processing - FinalReport.
|
35
|
+
- Parentage Verification and Parentage Discovery Based on SNP Genotypes (ICAR).
|
36
|
+
- Call rate estimation (percentage of missing data).
|
37
|
+
- Processing and preparation of data in plink formats.
|
38
|
+
|
39
|
+
## Installation
|
40
|
+
|
41
|
+
To install SNPTools, follow the steps below:
|
42
|
+
|
43
|
+
1. Clone the repository into your project directory:
|
44
|
+
```
|
45
|
+
git clone https://github.com/IgorekLoschinin/snptools.git
|
46
|
+
```
|
47
|
+
2. Install dependencies:
|
48
|
+
```
|
49
|
+
pip install -r requirements.txt
|
50
|
+
```
|
51
|
+
3. Use SNPTools:
|
52
|
+
```
|
53
|
+
import snptools
|
54
|
+
```
|
55
|
+
|
56
|
+
## Usage
|
57
|
+
Snptools provides commands for a variety of operations. Here are examples of
|
58
|
+
usage:
|
59
|
+
|
60
|
+
#### SNP data processing:
|
61
|
+
```
|
62
|
+
from snptools.finalreport import FinalReport
|
63
|
+
```
|
64
|
+
|
65
|
+
#### Computation of parentage:
|
66
|
+
```
|
67
|
+
from snptools.parentage import Discovery, Verification
|
68
|
+
```
|
69
|
+
|
70
|
+
#### Preparation format files:
|
71
|
+
```
|
72
|
+
from snptools.format import (
|
73
|
+
Snp, make_fam, make_ped, make_lgen, make_map
|
74
|
+
)
|
75
|
+
```
|
76
|
+
|
77
|
+
#### Stat:
|
78
|
+
```
|
79
|
+
from snptools.statistics import (
|
80
|
+
hwe, hwe_test, call_rate, allele_freq, minor_allele_freq
|
81
|
+
)
|
82
|
+
```
|
83
|
+
|
84
|
+
## Documentation
|
85
|
+
Detailed documentation on how to use SNPTools is available in the docs.
|
86
|
+
|
87
|
+
## License
|
88
|
+
This project is licensed under the GNU General Public License - see the
|
89
|
+
LICENSE file for details.
|
@@ -0,0 +1,36 @@
|
|
1
|
+
finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
|
2
|
+
finalreport/_finalreport.py,sha256=el_d8MVmpic3wKCRJ-J52VZYSmMuNSf4p_tmPkgh0Z0,5876
|
3
|
+
finalreport/tests/__init__.py,sha256=oNE_1bfqJEIatN_AZEuBAoM7uim_Q1X9DZuLAe1-dDc,174
|
4
|
+
finalreport/tests/test_finalreport.py,sha256=uOnV-p2_yhqKEZSoVOt7W4oxJwEyuBVTfpi3iQ2tTp4,6498
|
5
|
+
format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
|
6
|
+
format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
|
7
|
+
format/_plink.py,sha256=Z09IOPACOt3n8CKEVRkE4tLT16I8e_6ZoMaWRxSImrA,10529
|
8
|
+
format/_snp.py,sha256=oI-V4-_w28aX-VxoimywLDDnX6owhdjLbqt9a54_ouU,3172
|
9
|
+
format/tests/__init__.py,sha256=oNE_1bfqJEIatN_AZEuBAoM7uim_Q1X9DZuLAe1-dDc,174
|
10
|
+
format/tests/test_plink_fam.py,sha256=vUcDFIU17ez2S9hIXIxrqjs4OxgWuC8wat6NzG4nwYQ,2757
|
11
|
+
format/tests/test_plink_lgen.py,sha256=cw8jeAe74iWI45gZ8Ix8diLWFGeoAoq3LWmHW9Mz8Nw,2205
|
12
|
+
format/tests/test_plink_map.py,sha256=DtFgJpYjSdKk5AzDbZW4O9mRu1M43lgZMYMb-bmzEX8,947
|
13
|
+
format/tests/test_plink_ped.py,sha256=8SbNs9v9ss8ZDv52csMU1D5uONfhic7_mvuBrgN5Xss,2938
|
14
|
+
format/tests/test_snp.py,sha256=xtb2fniEARHoNFvFd9IrnX-QBBEKO6zyDmhaTdJ3Ric,3263
|
15
|
+
parentage/__init__.py,sha256=bN3mWTxmaFQ1qzRtyMLaAoxfomz6jnoWa-kmnJ9q_fE,280
|
16
|
+
parentage/_discov.py,sha256=qGlNzpl4xKOWRr6-fi1osylzizgiPO8vCus8VE56nec,3180
|
17
|
+
parentage/_isagmark.py,sha256=0xi9YhuIpU7zf16HnWw1XIkcQLk4rTNeAeCE-5p9hQE,356
|
18
|
+
parentage/_verif.py,sha256=VbX46dC4tl4Qeuw65aRg1s6hSn0FI25Hy9-3U_jxmrg,3019
|
19
|
+
parentage/tests/__init__.py,sha256=DvTuot4rW2KfvH-orj-CJdZc1seq3SXhFcrkO39geeQ,172
|
20
|
+
parentage/tests/test_discov.py,sha256=gJt1SjYTr7ZaphZayc1oBR9THTFjFxvViTi9a0X4fnI,4150
|
21
|
+
parentage/tests/test_verif.py,sha256=JROafzDlyMUBfbXq6iq3EVhEqHkrh9tRB-C0EdOf7aY,3951
|
22
|
+
statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw,302
|
23
|
+
statistics/_callrate.py,sha256=ghB1EXT5JLQeIEIzh8LjWpqAnhCtCOk6l5ecNMLtQa0,1865
|
24
|
+
statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
|
25
|
+
statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
|
26
|
+
statistics/tests/__init__.py,sha256=DvTuot4rW2KfvH-orj-CJdZc1seq3SXhFcrkO39geeQ,172
|
27
|
+
statistics/tests/test_callrate.py,sha256=a-j3muIn5unTH9dZW7xe_v2WU6uIMmj4PvzfWYkQbIs,5653
|
28
|
+
statistics/tests/test_freq_allele.py,sha256=7jbxM7MCkUnrAouiuphJ5lGdZq4K72IoIqz7JQdTFds,2586
|
29
|
+
statistics/tests/test_freq_maf.py,sha256=IndsZaCUBn7Hql8_ao35Cip78GA_ytDZjYugWXkdPt4,430
|
30
|
+
statistics/tests/test_hwe_t.py,sha256=MqoP3DJ-159WTJEbWsAzIVsuGG791PYPm4DjPFoehfw,847
|
31
|
+
statistics/tests/test_snphwe.py,sha256=chutPCO7lYYUMvsuDwvuehB6_4oBRkp1OnUJtL52HTE,995
|
32
|
+
snplib-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
33
|
+
snplib-1.0.0.dist-info/METADATA,sha256=E_EU6XriENiCXVP0rHOjnqwQGa19oqnALVKK3glNqg0,2298
|
34
|
+
snplib-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
35
|
+
snplib-1.0.0.dist-info/top_level.txt,sha256=JrsCvtXEC8OFF-STMY1jzbM5YEuA_57kuXXq5aI8dfc,40
|
36
|
+
snplib-1.0.0.dist-info/RECORD,,
|
statistics/__init__.py
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from ._snphwe import hwe, hwe_test
|
6
|
+
from ._callrate import call_rate
|
7
|
+
from ._freq import allele_freq, minor_allele_freq
|
8
|
+
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"call_rate",
|
12
|
+
"allele_freq",
|
13
|
+
"minor_allele_freq",
|
14
|
+
"hwe",
|
15
|
+
"hwe_test"
|
16
|
+
]
|
statistics/_callrate.py
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
|
7
|
+
|
8
|
+
def call_rate(
|
9
|
+
data: pd.DataFrame | str,
|
10
|
+
id_col: str = None,
|
11
|
+
snp_col: str = None
|
12
|
+
) -> pd.DataFrame | float | None:
|
13
|
+
""" The call rate for a given SNP is defined as the proportion of
|
14
|
+
individuals in the study for which the corresponding SNP information is
|
15
|
+
not missing. In the following example, we filter using a call rate of 95%,
|
16
|
+
meaning we retain SNPs for which there is less than 5% missing data.
|
17
|
+
|
18
|
+
Of the say, 54K markers in the chip, 50K have been genotyped for a
|
19
|
+
particular animal, the “call rate animal” is 50K/54K=93%
|
20
|
+
Of the say, 900 animals genotyped for marker CL635944_160.1, how many
|
21
|
+
have actually been successfully read? Assume that 600 have been read, then
|
22
|
+
the “call rate marker” is 600/900 = 67%
|
23
|
+
|
24
|
+
:param data: Pre-processed data on which the call rate is calculated.
|
25
|
+
:param id_col: The name of the column with the id of the animals or
|
26
|
+
markers.
|
27
|
+
:param snp_col: The name of the column with the snp sequence.
|
28
|
+
:return: Return dataframe with call rates for each animal if a dataframe
|
29
|
+
is transmitted. The number if the snp sequence is passed as a string.
|
30
|
+
None if there were errors.
|
31
|
+
"""
|
32
|
+
|
33
|
+
if isinstance(data, pd.DataFrame):
|
34
|
+
try:
|
35
|
+
if data[snp_col].dtype.hasobject:
|
36
|
+
if not data[snp_col].str.isdigit().all():
|
37
|
+
return None
|
38
|
+
|
39
|
+
return data[[id_col, snp_col]].\
|
40
|
+
groupby(by=id_col)[snp_col].\
|
41
|
+
apply(lambda x: 1 - ((x == "5").sum() / len(x))).\
|
42
|
+
reset_index()
|
43
|
+
|
44
|
+
return data[[id_col, snp_col]]. \
|
45
|
+
groupby(by=id_col)[snp_col]. \
|
46
|
+
apply(lambda x: 1 - ((x == 5).sum() / len(x))). \
|
47
|
+
reset_index()
|
48
|
+
|
49
|
+
except Exception as e:
|
50
|
+
raise e
|
51
|
+
|
52
|
+
elif isinstance(data, str):
|
53
|
+
if not data.isdigit():
|
54
|
+
return None
|
55
|
+
|
56
|
+
return round(1 - (data.count('5') / len(data)), 6)
|
57
|
+
|
58
|
+
else:
|
59
|
+
return None
|
statistics/_freq.py
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
|
7
|
+
|
8
|
+
def allele_freq(
|
9
|
+
data: pd.DataFrame | str, id_col: str = None, seq_col: str = None
|
10
|
+
) -> pd.DataFrame | float | None:
|
11
|
+
""" The allele frequency represents the incidence of a gene variant in a
|
12
|
+
population.
|
13
|
+
|
14
|
+
:param data: Data array.
|
15
|
+
:param id_col: Columns with snp names.
|
16
|
+
:param seq_col: Columns with value snp in format ucg - 0, 1, 2, 5.
|
17
|
+
:return: Return the alleles frequency.
|
18
|
+
"""
|
19
|
+
|
20
|
+
if isinstance(data, pd.DataFrame):
|
21
|
+
try:
|
22
|
+
if data[seq_col].dtype.hasobject:
|
23
|
+
if not data[seq_col].str.isdigit().all():
|
24
|
+
return None
|
25
|
+
|
26
|
+
return data.\
|
27
|
+
loc[data[seq_col] != "5", [id_col, seq_col]]. \
|
28
|
+
groupby(by=id_col)[seq_col]. \
|
29
|
+
apply(lambda x: x.astype("int8").sum() / (2 * x.count())).\
|
30
|
+
reset_index().\
|
31
|
+
round(3)
|
32
|
+
|
33
|
+
return data.\
|
34
|
+
loc[data[seq_col] != 5, [id_col, seq_col]].\
|
35
|
+
groupby(by=id_col)[seq_col].\
|
36
|
+
apply(lambda x: x.sum() / (2 * x.count())).\
|
37
|
+
reset_index().\
|
38
|
+
round(3)
|
39
|
+
|
40
|
+
except Exception as e:
|
41
|
+
raise e
|
42
|
+
|
43
|
+
elif isinstance(data, str):
|
44
|
+
if not data.isdigit():
|
45
|
+
return None
|
46
|
+
|
47
|
+
sam_seq = tuple(
|
48
|
+
map(int, filter(lambda x: x if x != "5" else None, data))
|
49
|
+
)
|
50
|
+
return round(sum(sam_seq) / (2 * len(sam_seq)), 3)
|
51
|
+
|
52
|
+
else:
|
53
|
+
return None
|
54
|
+
|
55
|
+
|
56
|
+
def minor_allele_freq(value: float) -> float:
    """ Fold an allele frequency onto the minor allele.

    A frequency above 0.5 is replaced by its complement ``1 - value``;
    any other frequency is kept. The result is rounded to 3 decimals.

    :param value: Allele frequency
    :return: Return the minor alleles frequency
    """

    minor = 1 - value if value > 0.5 else value

    return round(minor, 3)
|
statistics/_snphwe.py
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
|
9
|
+
def hwe(
|
10
|
+
obs_hets: int | float, obs_hom1: int | float, obs_hom2: int | float
|
11
|
+
) -> float:
|
12
|
+
""" Python interpretation hwe - https://github.com/jeremymcrae/snphwe
|
13
|
+
|
14
|
+
:param obs_hets: Number of observed heterozygotes (AB, BA)
|
15
|
+
:param obs_hom1: Number of observed homozygotes1 (AA)
|
16
|
+
:param obs_hom2: Number of observed homozygotes2 (BB)
|
17
|
+
:return: This is where the p-value is returned
|
18
|
+
"""
|
19
|
+
|
20
|
+
obs_hets = round(obs_hets)
|
21
|
+
obs_hom1 = round(obs_hom1)
|
22
|
+
obs_hom2 = round(obs_hom2)
|
23
|
+
|
24
|
+
if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
|
25
|
+
raise ValueError("snphwe: negative allele count")
|
26
|
+
|
27
|
+
obs_homr = min(obs_hom1, obs_hom2)
|
28
|
+
obs_homc = max(obs_hom1, obs_hom2)
|
29
|
+
|
30
|
+
rare = 2 * obs_homr + obs_hets
|
31
|
+
genotypes = obs_hets + obs_homc + obs_homr
|
32
|
+
|
33
|
+
if genotypes == 0:
|
34
|
+
raise ValueError("snphwe: zero genotypes")
|
35
|
+
|
36
|
+
probs = np.zeros(round(rare) + 1)
|
37
|
+
|
38
|
+
# get distribution midpoint, but ensure midpoint and rare alleles have
|
39
|
+
# same parity
|
40
|
+
mid = int(rare * (2 * genotypes - rare) / (2 * genotypes))
|
41
|
+
if mid % 2 != rare % 2:
|
42
|
+
mid += 1
|
43
|
+
|
44
|
+
probs[mid] = 1.0
|
45
|
+
_sum = probs[mid]
|
46
|
+
|
47
|
+
curr_homr = (rare - mid) / 2
|
48
|
+
curr_homc = genotypes - mid - curr_homr
|
49
|
+
curr_hets = mid
|
50
|
+
while curr_hets > 1:
|
51
|
+
probs[curr_hets - 2] = (
|
52
|
+
probs[curr_hets] * curr_hets * (curr_hets - 1.0)
|
53
|
+
/ (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
|
54
|
+
)
|
55
|
+
_sum += probs[curr_hets - 2]
|
56
|
+
|
57
|
+
# fewer heterozygotes -> add one rare, one common homozygote
|
58
|
+
curr_homr += 1
|
59
|
+
curr_homc += 1
|
60
|
+
curr_hets -= 2
|
61
|
+
|
62
|
+
# calculate probabilities from midpoint up
|
63
|
+
curr_homr = (rare - mid) / 2
|
64
|
+
curr_homc = genotypes - mid - curr_homr
|
65
|
+
|
66
|
+
curr_hets = mid
|
67
|
+
while curr_hets <= rare - 2:
|
68
|
+
probs[curr_hets + 2] = \
|
69
|
+
(probs[curr_hets] * 4.0 * curr_homr * curr_homc
|
70
|
+
/ ((curr_hets + 2.0) * (curr_hets + 1.0)))
|
71
|
+
_sum += probs[curr_hets + 2]
|
72
|
+
|
73
|
+
# add 2 heterozygotes -> subtract one rare, one common homozygote
|
74
|
+
curr_homr -= 1
|
75
|
+
curr_homc -= 1
|
76
|
+
curr_hets += 2
|
77
|
+
|
78
|
+
# p-value calculation for p_hwe
|
79
|
+
target = probs[obs_hets]
|
80
|
+
p_hwe = 0.0
|
81
|
+
|
82
|
+
for p in probs:
|
83
|
+
if p <= target:
|
84
|
+
p_hwe += p / _sum
|
85
|
+
|
86
|
+
return min(1.0, p_hwe)
|
87
|
+
|
88
|
+
|
89
|
+
def hwe_test(
        seq_snp: pd.Series, freq: float, crit_chi2: float = 3.841
) -> bool:
    """ The Hardy-Weinberg equilibrium is a principle stating that the genetic
    variation in a population will remain constant from one generation to the
    next in the absence of disturbing factors.
    https://www.nature.com/scitable/definition/hardy-weinberg-equilibrium-122/

    The observed counts of genotypes 0, 1 and 2 (code 5 is treated as
    missing) are compared with the counts expected for the given allele
    frequency using a chi-square statistic.

    :param seq_snp: SNP sequence
    :param freq: Allele frequency
    :param crit_chi2: The critical value for a test ("either / or":
        observed and expected values are either one way or the other),
        therefore with degrees of freedom = df = 1 is 3.84 at p = 0.05
    :return: A decision is returned to exclude or retain the inspected snp:
        False when the statistic exceeds ``crit_chi2``, True otherwise. A
        sequence with a single distinct called genotype always gives True.
    """

    _seq = seq_snp.replace(5, np.nan)

    if _seq.nunique() == 1:
        return True

    n_genotypes = _seq.count()

    observed = {
        0: (_seq == 0).sum(),
        1: (_seq == 1).sum(),
        2: (_seq == 2).sum()
    }

    expected = {
        0: ((1 - freq) ** 2) * n_genotypes,
        1: (2 * ((1 - freq) * freq)) * n_genotypes,
        2: (freq ** 2) * n_genotypes
    }

    chi = 0.0
    for genotype, exp in expected.items():
        obs = observed[genotype]

        if exp == 0:
            # Dividing by a zero expectation would give 0/0 = NaN and turn
            # the whole statistic into NaN. A class that is neither expected
            # nor observed contributes nothing; an observed genotype that is
            # impossible under ``freq`` makes the statistic infinite.
            if obs != 0:
                chi = np.inf
            continue

        chi += ((obs - exp) ** 2) / exp

    return not chi > crit_chi2
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_DATA
|
6
|
+
from .. import call_rate
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
def data_df(request) -> pd.DataFrame:
    """ Load the pickled dataframe selected through indirect parametrization.

    ``"cra"`` reads ``cr/file_cra.pl`` and ``"crm"`` reads
    ``cr/file_crm.pl``, both relative to ``DIR_DATA``. Any other parameter
    falls through and yields None.
    """
    selector = request.param

    if selector == "cra":
        return pd.read_pickle(DIR_DATA / "cr/file_cra.pl")

    if selector == "crm":
        return pd.read_pickle(DIR_DATA / "cr/file_crm.pl")
|
21
|
+
|
22
|
+
|
23
|
+
@pytest.fixture
def data_str() -> list[str]:
    """ Two genotype sequences of 17 codes, each holding two missing (5)
    calls. """
    sequences = ['02011015010000500', '01110152120222512']

    return sequences
|
26
|
+
|
27
|
+
|
28
|
+
class TestCallRateAnimal:
    """ Call rate tests that group genotypes by the SAMPLE_ID column, plus
    the string-input variants of ``call_rate``. """

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_obj(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as strings. """
        data_df.SNP = data_df.SNP.astype(str)
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_int(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as int8. """
        data_df.SNP = data_df.SNP.astype("int8")
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_float(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as float32. """
        data_df.SNP = data_df.SNP.astype("float32")
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_random_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        """ Nucleotide letters instead of digit codes give None. """
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert result is None

    def test_cra_datafame_empty1(self) -> None:
        """ A dataframe without the requested columns raises KeyError. """
        with pytest.raises(KeyError):
            call_rate(data=pd.DataFrame(), id_col="SAMPLE_ID", snp_col="SNP")

    def test_cra_datafame_empty2(self) -> None:
        """ A dataframe with the columns but no rows gives an empty result. """
        result = call_rate(
            data=pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            id_col="SAMPLE_ID",
            snp_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_fail(self, data_df: pd.DataFrame) -> None:
        """ Omitting either column name, or both, raises KeyError. """
        # One context per call: a ``pytest.raises`` block ends at the first
        # exception, so calls placed after it would never be executed.
        with pytest.raises(KeyError):
            call_rate(data=data_df, id_col="SAMPLE_ID")

        with pytest.raises(KeyError):
            call_rate(data=data_df, snp_col="SNP")

        with pytest.raises(KeyError):
            call_rate(data=data_df)

    def test_cra_str_int(self, data_str: list[str]) -> None:
        """ Digit strings: 2 missing calls out of 17. """
        for sequence in data_str:
            assert call_rate(data=sequence) == 0.882353

    def test_cra_str_simbols(self) -> None:
        """ Nucleotide-letter strings give None. """
        data_str = ['GCATGAGGTATACTCTA', 'CGCCATGCTGTATATCC']

        for sequence in data_str:
            assert call_rate(data=sequence) is None

    def test_cra_str_empty(self) -> None:
        """ The empty string gives None. """
        assert call_rate(data="") is None

    def test_cra_str_mixid(self) -> None:
        """ A string mixing letters and digits gives None. """
        assert call_rate(data="GCATGAG3G4T6A67TACTCTA") is None
|
101
|
+
|
102
|
+
|
103
|
+
class TestCallRateMarker:
    """ Call rate tests that group genotypes by the SNP_NAME column, plus
    the string-input variants of ``call_rate``. """

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_obj(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as strings. """
        data_df.SNP = data_df.SNP.astype(str)
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_int(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as int8. """
        data_df.SNP = data_df.SNP.astype("int8")
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_float(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as float32. """
        data_df.SNP = data_df.SNP.astype("float32")
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_random_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        """ Nucleotide letters instead of digit codes give None. """
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert result is None

    def test_crm_datafame_empty1(self) -> None:
        """ A dataframe without the requested columns raises KeyError. """
        with pytest.raises(KeyError):
            call_rate(data=pd.DataFrame(), id_col="SNP_NAME", snp_col="SNP")

    def test_crm_datafame_empty2(self) -> None:
        """ A dataframe with the columns but no rows gives an empty result. """
        result = call_rate(
            data=pd.DataFrame(columns=["SNP_NAME", "SNP"]),
            id_col="SNP_NAME",
            snp_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_fail(self, data_df: pd.DataFrame) -> None:
        """ Omitting either column name, or both, raises KeyError. """
        # One context per call: a ``pytest.raises`` block ends at the first
        # exception, so calls placed after it would never be executed.
        with pytest.raises(KeyError):
            call_rate(data=data_df, id_col="SNP_NAME")

        with pytest.raises(KeyError):
            call_rate(data=data_df, snp_col="SNP")

        with pytest.raises(KeyError):
            call_rate(data=data_df)

    def test_crm_str_simbols(self) -> None:
        """ Nucleotide-letter strings give None. """
        data_str = ['GCATGAGGTATACTCTA', 'CGCCATGCTGTATATCC']

        for sequence in data_str:
            assert call_rate(data=sequence) is None

    def test_crm_str_empty(self) -> None:
        """ The empty string gives None. """
        assert call_rate(data="") is None

    def test_crm_str_mixid(self) -> None:
        """ A string mixing letters and digits gives None. """
        assert call_rate(data="GCATGAG3G4T6A67TACTCTA") is None
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_DATA
|
6
|
+
from .. import allele_freq
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
def data_df() -> pd.DataFrame:
    """ Load the pickled genotype table ``freq/file.pl`` from ``DIR_DATA``.

    NOTE(review): the author left the values ``[0., 0.9, 0.889]`` next to
    this fixture - presumably the allele frequencies expected for this
    file, as the dataframe tests compare against the same numbers.
    """
    pickle_path = DIR_DATA / "freq/file.pl"

    return pd.read_pickle(pickle_path)
|
17
|
+
|
18
|
+
|
19
|
+
def data_str() -> list[tuple]:
    """ Pairs of (genotype sequence, expected allele frequency) used to
    parametrize the string-input test. """
    sequences = ('2212120', '02011015010000500', '01110152120222512')
    frequencies = (0.714, 0.2, 0.6)

    return list(zip(sequences, frequencies))
|
25
|
+
|
26
|
+
|
27
|
+
class TestAlleleFreq:
    """ Tests of ``allele_freq`` for dataframe and string input. """

    def test_allele_freq_df_dtype_obj(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as strings. """
        data_df.SNP = data_df.SNP.astype(str)
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_dtype_int(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as int8. """
        data_df.SNP = data_df.SNP.astype("int8")
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_dtype_float(self, data_df: pd.DataFrame) -> None:
        """ Genotypes given as float32. """
        data_df.SNP = data_df.SNP.astype("float32")
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_data_rand_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        """ Nucleotide letters instead of digit codes give None. """
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        assert allele_freq(
            data=data_df, id_col="SNP_NAME", seq_col="SNP"
        ) is None

    def test_allele_freq_df_empty(self) -> None:
        """ A dataframe without the requested columns raises KeyError. """
        with pytest.raises(KeyError):
            allele_freq(
                data=pd.DataFrame(), id_col="SNP_NAME", seq_col="SNP"
            )

    def test_allele_freq_df_empty_only_columns(self) -> None:
        """ A dataframe with the columns but no rows gives an empty result. """
        result = allele_freq(
            data=pd.DataFrame(columns=["SNP_NAME", "SNP"]),
            id_col="SNP_NAME",
            seq_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    def test_allele_freq_df_raises(self, data_df: pd.DataFrame) -> None:
        """ Omitting either column name, or both, raises KeyError. """
        # One context per call: a ``pytest.raises`` block ends at the first
        # exception, so calls placed after it would never be executed.
        with pytest.raises(KeyError):
            allele_freq(data=data_df, id_col="SNP_NAME")

        with pytest.raises(KeyError):
            allele_freq(data=data_df, seq_col="SNP")

        with pytest.raises(KeyError):
            allele_freq(data=data_df)

    @pytest.mark.parametrize("data, obs_value", data_str())
    def test_allele_freq_str(self, data: str, obs_value: float) -> None:
        """ Digit strings give the expected frequency. """
        assert allele_freq(data=data) == obs_value

    def test_allele_freq_non_type(self) -> None:
        """ Input that is neither a dataframe nor a string gives None. """
        assert allele_freq(data=1423) is None
|