snplib 1.0.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- finalreport/__init__.py +7 -0
- finalreport/_finalreport.py +251 -0
- finalreport/tests/__init__.py +7 -0
- finalreport/tests/test_finalreport.py +215 -0
- format/__init__.py +19 -0
- format/__settings.py +7 -0
- format/_plink.py +305 -0
- format/_snp.py +113 -0
- format/tests/__init__.py +7 -0
- format/tests/test_plink_fam.py +121 -0
- format/tests/test_plink_lgen.py +106 -0
- format/tests/test_plink_map.py +42 -0
- format/tests/test_plink_ped.py +136 -0
- format/tests/test_snp.py +128 -0
- parentage/__init__.py +15 -0
- parentage/_discov.py +102 -0
- parentage/_isagmark.py +15 -0
- parentage/_verif.py +91 -0
- parentage/tests/__init__.py +7 -0
- parentage/tests/test_discov.py +164 -0
- parentage/tests/test_verif.py +160 -0
- snplib-1.0.0.dist-info/LICENSE +674 -0
- snplib-1.0.0.dist-info/METADATA +89 -0
- snplib-1.0.0.dist-info/RECORD +36 -0
- snplib-1.0.0.dist-info/WHEEL +5 -0
- snplib-1.0.0.dist-info/top_level.txt +4 -0
- statistics/__init__.py +16 -0
- statistics/_callrate.py +59 -0
- statistics/_freq.py +67 -0
- statistics/_snphwe.py +132 -0
- statistics/tests/__init__.py +7 -0
- statistics/tests/test_callrate.py +171 -0
- statistics/tests/test_freq_allele.py +87 -0
- statistics/tests/test_freq_maf.py +17 -0
- statistics/tests/test_hwe_t.py +41 -0
- statistics/tests/test_snphwe.py +41 -0
@@ -0,0 +1,89 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: snplib
|
3
|
+
Version: 1.0.0
|
4
|
+
Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
|
5
|
+
Author-email: Igor <igor.loschinin@gmail.com>
|
6
|
+
License: GNU
|
7
|
+
Project-URL: Homepage, https://github.com/IgorekLoschinin/snptools
|
8
|
+
Requires-Python: >=3.10
|
9
|
+
Description-Content-Type: text/markdown
|
10
|
+
License-File: LICENSE
|
11
|
+
Requires-Dist: numpy==1.26.1
|
12
|
+
Requires-Dist: pandas==2.1.1
|
13
|
+
Requires-Dist: six==1.16.0
|
14
|
+
Requires-Dist: swifter==1.4.0
|
15
|
+
Requires-Dist: xlrd==2.0.1
|
16
|
+
Requires-Dist: XlsxWriter==3.1.9
|
17
|
+
Requires-Dist: openpyxl==3.1.2
|
18
|
+
Requires-Dist: pydantic==2.4.2
|
19
|
+
Requires-Dist: pytest==7.4.2
|
20
|
+
Requires-Dist: sphinx==7.2.6
|
21
|
+
Requires-Dist: sphinx_rtd_theme==1.3.0
|
22
|
+
|
23
|
+
# snptools
|
24
|
+
|
25
|
+
**Snptools** is a tool for SNP (Single Nucleotide Polymorphism) data processing,
|
26
|
+
parentage calculation and call rate estimation.
|
27
|
+
|
28
|
+
## Introduction
|
29
|
+
|
30
|
+
SNPs (Single Nucleotide Polymorphisms) represent genetic variations that can
|
31
|
+
be used to analyze genetic data. SNPTools provides a set of tools for working
|
32
|
+
with SNP data, including the following capabilities:
|
33
|
+
|
34
|
+
- SNP data processing - FinalReport.
|
35
|
+
- Parentage Verification and Parentage Discovery Based on SNP Genotypes (ICAR).
|
36
|
+
- Call rate estimation (percentage of missing data).
|
37
|
+
- Processing and preparation of data in plink formats.
|
38
|
+
|
39
|
+
## Installation
|
40
|
+
|
41
|
+
To install SNPTools, follow the steps below:
|
42
|
+
|
43
|
+
1. Clone the repository into your project directory:
|
44
|
+
```
|
45
|
+
git clone https://github.com/IgorekLoschinin/snptools.git
|
46
|
+
```
|
47
|
+
2. Install dependencies:
|
48
|
+
```
|
49
|
+
pip install -r requirements.txt
|
50
|
+
```
|
51
|
+
3. Use SNPTools:
|
52
|
+
```
|
53
|
+
import snptools
|
54
|
+
```
|
55
|
+
|
56
|
+
## Usage
|
57
|
+
Snptools provides commands for a variety of operations. Here are examples of
|
58
|
+
usage:
|
59
|
+
|
60
|
+
#### SNP data processing:
|
61
|
+
```
|
62
|
+
from snptools.finalreport import FinalReport
|
63
|
+
```
|
64
|
+
|
65
|
+
#### Computation of parentage:
|
66
|
+
```
|
67
|
+
from snptools.parentage import Discovery, Verification
|
68
|
+
```
|
69
|
+
|
70
|
+
#### Preparation format files:
|
71
|
+
```
|
72
|
+
from snptools.format import (
|
73
|
+
Snp, make_fam, make_ped, make_lgen, make_map
|
74
|
+
)
|
75
|
+
```
|
76
|
+
|
77
|
+
#### Statistics:
|
78
|
+
```
|
79
|
+
from snptools.statistics import (
|
80
|
+
hwe, hwe_test, call_rate, allele_freq, minor_allele_freq
|
81
|
+
)
|
82
|
+
```
|
83
|
+
|
84
|
+
## Documentation
|
85
|
+
Detailed documentation on how to use SNPTools is available in the docs.
|
86
|
+
|
87
|
+
## License
|
88
|
+
This project is licensed under the GNU General Public License - see the
|
89
|
+
LICENSE file for details.
|
@@ -0,0 +1,36 @@
|
|
1
|
+
finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
|
2
|
+
finalreport/_finalreport.py,sha256=el_d8MVmpic3wKCRJ-J52VZYSmMuNSf4p_tmPkgh0Z0,5876
|
3
|
+
finalreport/tests/__init__.py,sha256=oNE_1bfqJEIatN_AZEuBAoM7uim_Q1X9DZuLAe1-dDc,174
|
4
|
+
finalreport/tests/test_finalreport.py,sha256=uOnV-p2_yhqKEZSoVOt7W4oxJwEyuBVTfpi3iQ2tTp4,6498
|
5
|
+
format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
|
6
|
+
format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
|
7
|
+
format/_plink.py,sha256=Z09IOPACOt3n8CKEVRkE4tLT16I8e_6ZoMaWRxSImrA,10529
|
8
|
+
format/_snp.py,sha256=oI-V4-_w28aX-VxoimywLDDnX6owhdjLbqt9a54_ouU,3172
|
9
|
+
format/tests/__init__.py,sha256=oNE_1bfqJEIatN_AZEuBAoM7uim_Q1X9DZuLAe1-dDc,174
|
10
|
+
format/tests/test_plink_fam.py,sha256=vUcDFIU17ez2S9hIXIxrqjs4OxgWuC8wat6NzG4nwYQ,2757
|
11
|
+
format/tests/test_plink_lgen.py,sha256=cw8jeAe74iWI45gZ8Ix8diLWFGeoAoq3LWmHW9Mz8Nw,2205
|
12
|
+
format/tests/test_plink_map.py,sha256=DtFgJpYjSdKk5AzDbZW4O9mRu1M43lgZMYMb-bmzEX8,947
|
13
|
+
format/tests/test_plink_ped.py,sha256=8SbNs9v9ss8ZDv52csMU1D5uONfhic7_mvuBrgN5Xss,2938
|
14
|
+
format/tests/test_snp.py,sha256=xtb2fniEARHoNFvFd9IrnX-QBBEKO6zyDmhaTdJ3Ric,3263
|
15
|
+
parentage/__init__.py,sha256=bN3mWTxmaFQ1qzRtyMLaAoxfomz6jnoWa-kmnJ9q_fE,280
|
16
|
+
parentage/_discov.py,sha256=qGlNzpl4xKOWRr6-fi1osylzizgiPO8vCus8VE56nec,3180
|
17
|
+
parentage/_isagmark.py,sha256=0xi9YhuIpU7zf16HnWw1XIkcQLk4rTNeAeCE-5p9hQE,356
|
18
|
+
parentage/_verif.py,sha256=VbX46dC4tl4Qeuw65aRg1s6hSn0FI25Hy9-3U_jxmrg,3019
|
19
|
+
parentage/tests/__init__.py,sha256=DvTuot4rW2KfvH-orj-CJdZc1seq3SXhFcrkO39geeQ,172
|
20
|
+
parentage/tests/test_discov.py,sha256=gJt1SjYTr7ZaphZayc1oBR9THTFjFxvViTi9a0X4fnI,4150
|
21
|
+
parentage/tests/test_verif.py,sha256=JROafzDlyMUBfbXq6iq3EVhEqHkrh9tRB-C0EdOf7aY,3951
|
22
|
+
statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw,302
|
23
|
+
statistics/_callrate.py,sha256=ghB1EXT5JLQeIEIzh8LjWpqAnhCtCOk6l5ecNMLtQa0,1865
|
24
|
+
statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
|
25
|
+
statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
|
26
|
+
statistics/tests/__init__.py,sha256=DvTuot4rW2KfvH-orj-CJdZc1seq3SXhFcrkO39geeQ,172
|
27
|
+
statistics/tests/test_callrate.py,sha256=a-j3muIn5unTH9dZW7xe_v2WU6uIMmj4PvzfWYkQbIs,5653
|
28
|
+
statistics/tests/test_freq_allele.py,sha256=7jbxM7MCkUnrAouiuphJ5lGdZq4K72IoIqz7JQdTFds,2586
|
29
|
+
statistics/tests/test_freq_maf.py,sha256=IndsZaCUBn7Hql8_ao35Cip78GA_ytDZjYugWXkdPt4,430
|
30
|
+
statistics/tests/test_hwe_t.py,sha256=MqoP3DJ-159WTJEbWsAzIVsuGG791PYPm4DjPFoehfw,847
|
31
|
+
statistics/tests/test_snphwe.py,sha256=chutPCO7lYYUMvsuDwvuehB6_4oBRkp1OnUJtL52HTE,995
|
32
|
+
snplib-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
33
|
+
snplib-1.0.0.dist-info/METADATA,sha256=E_EU6XriENiCXVP0rHOjnqwQGa19oqnALVKK3glNqg0,2298
|
34
|
+
snplib-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
35
|
+
snplib-1.0.0.dist-info/top_level.txt,sha256=JrsCvtXEC8OFF-STMY1jzbM5YEuA_57kuXXq5aI8dfc,40
|
36
|
+
snplib-1.0.0.dist-info/RECORD,,
|
statistics/__init__.py
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from ._snphwe import hwe, hwe_test
|
6
|
+
from ._callrate import call_rate
|
7
|
+
from ._freq import allele_freq, minor_allele_freq
|
8
|
+
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"call_rate",
|
12
|
+
"allele_freq",
|
13
|
+
"minor_allele_freq",
|
14
|
+
"hwe",
|
15
|
+
"hwe_test"
|
16
|
+
]
|
statistics/_callrate.py
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
|
7
|
+
|
8
|
+
def call_rate(
|
9
|
+
data: pd.DataFrame | str,
|
10
|
+
id_col: str = None,
|
11
|
+
snp_col: str = None
|
12
|
+
) -> pd.DataFrame | float | None:
|
13
|
+
""" The call rate for a given SNP is defined as the proportion of
|
14
|
+
individuals in the study for which the corresponding SNP information is
|
15
|
+
not missing. In the following example, we filter using a call rate of 95%,
|
16
|
+
meaning we retain SNPs for which there is less than 5% missing data.
|
17
|
+
|
18
|
+
Of the say, 54K markers in the chip, 50K have been genotyped for a
|
19
|
+
particular animal, the “call rate animal” is 50K/54K=93%
|
20
|
+
Of the say, 900 animals genotyped for marker CL635944_160.1, how many
|
21
|
+
have actually been successfully read? Assume that 600 have been read, then
|
22
|
+
the “call rate marker” is 600/900 = 67%
|
23
|
+
|
24
|
+
:param data: Pre-processed data on which the call rate is calculated.
|
25
|
+
:param id_col: The name of the column with the id of the animals or
|
26
|
+
markers.
|
27
|
+
:param snp_col: The name of the column with the snp sequence.
|
28
|
+
:return: Return dataframe with call rates for each animal if a dataframe
|
29
|
+
is transmitted. The number if the snp sequence is passed as a string.
|
30
|
+
None if there were errors.
|
31
|
+
"""
|
32
|
+
|
33
|
+
if isinstance(data, pd.DataFrame):
|
34
|
+
try:
|
35
|
+
if data[snp_col].dtype.hasobject:
|
36
|
+
if not data[snp_col].str.isdigit().all():
|
37
|
+
return None
|
38
|
+
|
39
|
+
return data[[id_col, snp_col]].\
|
40
|
+
groupby(by=id_col)[snp_col].\
|
41
|
+
apply(lambda x: 1 - ((x == "5").sum() / len(x))).\
|
42
|
+
reset_index()
|
43
|
+
|
44
|
+
return data[[id_col, snp_col]]. \
|
45
|
+
groupby(by=id_col)[snp_col]. \
|
46
|
+
apply(lambda x: 1 - ((x == 5).sum() / len(x))). \
|
47
|
+
reset_index()
|
48
|
+
|
49
|
+
except Exception as e:
|
50
|
+
raise e
|
51
|
+
|
52
|
+
elif isinstance(data, str):
|
53
|
+
if not data.isdigit():
|
54
|
+
return None
|
55
|
+
|
56
|
+
return round(1 - (data.count('5') / len(data)), 6)
|
57
|
+
|
58
|
+
else:
|
59
|
+
return None
|
statistics/_freq.py
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
|
7
|
+
|
8
|
+
def allele_freq(
|
9
|
+
data: pd.DataFrame | str, id_col: str = None, seq_col: str = None
|
10
|
+
) -> pd.DataFrame | float | None:
|
11
|
+
""" The allele frequency represents the incidence of a gene variant in a
|
12
|
+
population.
|
13
|
+
|
14
|
+
:param data: Data array.
|
15
|
+
:param id_col: Columns with snp names.
|
16
|
+
:param seq_col: Columns with value snp in format ucg - 0, 1, 2, 5.
|
17
|
+
:return: Return the alleles frequency.
|
18
|
+
"""
|
19
|
+
|
20
|
+
if isinstance(data, pd.DataFrame):
|
21
|
+
try:
|
22
|
+
if data[seq_col].dtype.hasobject:
|
23
|
+
if not data[seq_col].str.isdigit().all():
|
24
|
+
return None
|
25
|
+
|
26
|
+
return data.\
|
27
|
+
loc[data[seq_col] != "5", [id_col, seq_col]]. \
|
28
|
+
groupby(by=id_col)[seq_col]. \
|
29
|
+
apply(lambda x: x.astype("int8").sum() / (2 * x.count())).\
|
30
|
+
reset_index().\
|
31
|
+
round(3)
|
32
|
+
|
33
|
+
return data.\
|
34
|
+
loc[data[seq_col] != 5, [id_col, seq_col]].\
|
35
|
+
groupby(by=id_col)[seq_col].\
|
36
|
+
apply(lambda x: x.sum() / (2 * x.count())).\
|
37
|
+
reset_index().\
|
38
|
+
round(3)
|
39
|
+
|
40
|
+
except Exception as e:
|
41
|
+
raise e
|
42
|
+
|
43
|
+
elif isinstance(data, str):
|
44
|
+
if not data.isdigit():
|
45
|
+
return None
|
46
|
+
|
47
|
+
sam_seq = tuple(
|
48
|
+
map(int, filter(lambda x: x if x != "5" else None, data))
|
49
|
+
)
|
50
|
+
return round(sum(sam_seq) / (2 * len(sam_seq)), 3)
|
51
|
+
|
52
|
+
else:
|
53
|
+
return None
|
54
|
+
|
55
|
+
|
56
|
+
def minor_allele_freq(value: float) -> float:
    """ Fold an allele frequency onto the minor allele.

    A frequency above 0.5 describes the more common allele, so its
    complement is returned instead; any other value is returned as is. The
    result is rounded to 3 digits.

    :param value: Allele frequency
    :return: Return the minor alleles frequency
    """

    folded = 1 - value if value > 0.5 else value
    return round(folded, 3)
|
statistics/_snphwe.py
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
|
9
|
+
def hwe(
|
10
|
+
obs_hets: int | float, obs_hom1: int | float, obs_hom2: int | float
|
11
|
+
) -> float:
|
12
|
+
""" Python interpretation hwe - https://github.com/jeremymcrae/snphwe
|
13
|
+
|
14
|
+
:param obs_hets: Number of observed heterozygotes (AB, BA)
|
15
|
+
:param obs_hom1: Number of observed homozygotes1 (AA)
|
16
|
+
:param obs_hom2: Number of observed homozygotes2 (BB)
|
17
|
+
:return: This is where the p-value is returned
|
18
|
+
"""
|
19
|
+
|
20
|
+
obs_hets = round(obs_hets)
|
21
|
+
obs_hom1 = round(obs_hom1)
|
22
|
+
obs_hom2 = round(obs_hom2)
|
23
|
+
|
24
|
+
if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
|
25
|
+
raise ValueError("snphwe: negative allele count")
|
26
|
+
|
27
|
+
obs_homr = min(obs_hom1, obs_hom2)
|
28
|
+
obs_homc = max(obs_hom1, obs_hom2)
|
29
|
+
|
30
|
+
rare = 2 * obs_homr + obs_hets
|
31
|
+
genotypes = obs_hets + obs_homc + obs_homr
|
32
|
+
|
33
|
+
if genotypes == 0:
|
34
|
+
raise ValueError("snphwe: zero genotypes")
|
35
|
+
|
36
|
+
probs = np.zeros(round(rare) + 1)
|
37
|
+
|
38
|
+
# get distribution midpoint, but ensure midpoint and rare alleles have
|
39
|
+
# same parity
|
40
|
+
mid = int(rare * (2 * genotypes - rare) / (2 * genotypes))
|
41
|
+
if mid % 2 != rare % 2:
|
42
|
+
mid += 1
|
43
|
+
|
44
|
+
probs[mid] = 1.0
|
45
|
+
_sum = probs[mid]
|
46
|
+
|
47
|
+
curr_homr = (rare - mid) / 2
|
48
|
+
curr_homc = genotypes - mid - curr_homr
|
49
|
+
curr_hets = mid
|
50
|
+
while curr_hets > 1:
|
51
|
+
probs[curr_hets - 2] = (
|
52
|
+
probs[curr_hets] * curr_hets * (curr_hets - 1.0)
|
53
|
+
/ (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
|
54
|
+
)
|
55
|
+
_sum += probs[curr_hets - 2]
|
56
|
+
|
57
|
+
# fewer heterozygotes -> add one rare, one common homozygote
|
58
|
+
curr_homr += 1
|
59
|
+
curr_homc += 1
|
60
|
+
curr_hets -= 2
|
61
|
+
|
62
|
+
# calculate probabilities from midpoint up
|
63
|
+
curr_homr = (rare - mid) / 2
|
64
|
+
curr_homc = genotypes - mid - curr_homr
|
65
|
+
|
66
|
+
curr_hets = mid
|
67
|
+
while curr_hets <= rare - 2:
|
68
|
+
probs[curr_hets + 2] = \
|
69
|
+
(probs[curr_hets] * 4.0 * curr_homr * curr_homc
|
70
|
+
/ ((curr_hets + 2.0) * (curr_hets + 1.0)))
|
71
|
+
_sum += probs[curr_hets + 2]
|
72
|
+
|
73
|
+
# add 2 heterozygotes -> subtract one rare, one common homozygote
|
74
|
+
curr_homr -= 1
|
75
|
+
curr_homc -= 1
|
76
|
+
curr_hets += 2
|
77
|
+
|
78
|
+
# p-value calculation for p_hwe
|
79
|
+
target = probs[obs_hets]
|
80
|
+
p_hwe = 0.0
|
81
|
+
|
82
|
+
for p in probs:
|
83
|
+
if p <= target:
|
84
|
+
p_hwe += p / _sum
|
85
|
+
|
86
|
+
return min(1.0, p_hwe)
|
87
|
+
|
88
|
+
|
89
|
+
def hwe_test(
        seq_snp: pd.Series, freq: float, crit_chi2: float = 3.841
) -> bool:
    """ Chi-square check of a snp against the Hardy-Weinberg equilibrium.

    The Hardy-Weinberg equilibrium is a principle stating that the genetic
    variation in a population will remain constant from one generation to the
    next in the absence of disturbing factors.
    https://www.nature.com/scitable/definition/hardy-weinberg-equilibrium-122/

    Genotypes coded 5 are treated as missing. The observed numbers of
    genotypes 0, 1 and 2 are compared with the numbers expected from the
    allele frequency.

    :param seq_snp: SNP sequence
    :param freq: Allele frequency
    :param crit_chi2: The critical value for a test ("either / or":
        observed and expected values are either one way or the other),
        therefore with degrees of freedom = df = 1 is 3.84 at p = 0.05
    :return: A decision is returned to exclude or retain the inspected snp:
        False when the statistic is above ``crit_chi2``, True otherwise
        (also True when only one genotype value is present).
    """

    called = seq_snp.replace(5, np.nan)

    # A single distinct genotype leaves nothing to test.
    if called.nunique() == 1:
        return True

    n_called = called.count()

    # Expected genotype shares for codes 0, 1 and 2, in that order.
    shares = (
        (1 - freq) ** 2,
        2 * ((1 - freq) * freq),
        freq ** 2,
    )

    chi = 0
    for code, share in enumerate(shares):
        n_expected = share * n_called
        n_observed = (called == code).sum()
        chi += ((n_observed - n_expected) ** 2) / n_expected

    return not chi > crit_chi2
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_DATA
|
6
|
+
from .. import call_rate
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
def data_df(request) -> pd.DataFrame:
    """ Load the pickled call rate data set selected by the fixture param.

    :param request: Pytest request; ``request.param`` is "cra" or "crm".
    :return: The dataframe read from the matching file under ``DIR_DATA``.
    :raises ValueError: If ``request.param`` names no known data set, so a
        misspelled parameter fails loudly instead of handing None to the
        test.
    """

    pickles = {
        "cra": "cr/file_cra.pl",
        "crm": "cr/file_crm.pl",
    }

    try:
        relative_path = pickles[request.param]
    except KeyError:
        raise ValueError(
            f"unknown call rate data set: {request.param!r}"
        ) from None

    return pd.read_pickle(DIR_DATA / relative_path)
|
21
|
+
|
22
|
+
|
23
|
+
@pytest.fixture
def data_str() -> list[str]:
    """ Two snp sequences of 17 genotypes, each holding two missing (5)
    codes.
    """

    first_sequence = '02011015010000500'
    second_sequence = '01110152120222512'

    return [first_sequence, second_sequence]
|
26
|
+
|
27
|
+
|
28
|
+
class TestCallRateAnimal:
    """ Call rate tests with genotypes grouped by the ``SAMPLE_ID`` column,
    plus the string-input variants of ``call_rate``.
    """

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_random_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        # Any letter makes the column non-numeric, so the random choice does
        # not affect the expected outcome.
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert result is None

    def test_cra_datafame_empty1(self) -> None:
        with pytest.raises(KeyError):
            call_rate(data=pd.DataFrame(), id_col="SAMPLE_ID", snp_col="SNP")

    def test_cra_datafame_empty2(self) -> None:
        result = call_rate(
            data=pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            id_col="SAMPLE_ID",
            snp_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_fail(self, data_df: pd.DataFrame) -> None:
        # One ``raises`` block per call: the block is left as soon as the
        # first exception is raised, so calls sharing a block with an earlier
        # failing call would never run.
        with pytest.raises(KeyError):
            call_rate(data=data_df, id_col="SAMPLE_ID")

        with pytest.raises(KeyError):
            call_rate(data=data_df, snp_col="SNP")

        with pytest.raises(KeyError):
            call_rate(data=data_df)

    def test_cra_str_int(self, data_str: list[str]) -> None:
        for sequence in data_str:
            assert call_rate(data=sequence) == 0.882353

    def test_cra_str_simbols(self) -> None:
        data_str = ['GCATGAGGTATACTCTA', 'CGCCATGCTGTATATCC']

        for sequence in data_str:
            assert call_rate(data=sequence) is None

    def test_cra_str_empty(self) -> None:
        assert call_rate(data="") is None

    def test_cra_str_mixid(self) -> None:
        assert call_rate(data="GCATGAG3G4T6A67TACTCTA") is None
|
101
|
+
|
102
|
+
|
103
|
+
class TestCallRateMarker:
    """ Call rate tests with genotypes grouped by the ``SNP_NAME`` column,
    plus the string-input variants of ``call_rate``.
    """

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_random_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        # Any letter makes the column non-numeric, so the random choice does
        # not affect the expected outcome.
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert result is None

    def test_crm_datafame_empty1(self) -> None:
        with pytest.raises(KeyError):
            call_rate(data=pd.DataFrame(), id_col="SNP_NAME", snp_col="SNP")

    def test_crm_datafame_empty2(self) -> None:
        result = call_rate(
            data=pd.DataFrame(columns=["SNP_NAME", "SNP"]),
            id_col="SNP_NAME",
            snp_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_fail(self, data_df: pd.DataFrame) -> None:
        # One ``raises`` block per call: the block is left as soon as the
        # first exception is raised, so calls sharing a block with an earlier
        # failing call would never run.
        with pytest.raises(KeyError):
            call_rate(data=data_df, id_col="SNP_NAME")

        with pytest.raises(KeyError):
            call_rate(data=data_df, snp_col="SNP")

        with pytest.raises(KeyError):
            call_rate(data=data_df)

    def test_crm_str_simbols(self) -> None:
        data_str = ['GCATGAGGTATACTCTA', 'CGCCATGCTGTATATCC']

        for sequence in data_str:
            assert call_rate(data=sequence) is None

    def test_crm_str_empty(self) -> None:
        assert call_rate(data="") is None

    def test_crm_str_mixid(self) -> None:
        assert call_rate(data="GCATGAG3G4T6A67TACTCTA") is None
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_DATA
|
6
|
+
from .. import allele_freq
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
def data_df() -> pd.DataFrame:
    """ Read the pickled allele frequency data set from ``DIR_DATA``. """

    # [0. , 0.9 , 0.889]
    pickle_path = DIR_DATA / "freq/file.pl"
    return pd.read_pickle(pickle_path)
|
17
|
+
|
18
|
+
|
19
|
+
def data_str() -> list[tuple]:
    """ Pairs of (snp sequence, expected allele frequency) used to
    parametrize the string-input test.
    """

    sequences = ('2212120', '02011015010000500', '01110152120222512')
    frequencies = (0.714, 0.2, 0.6)

    return list(zip(sequences, frequencies))
|
25
|
+
|
26
|
+
|
27
|
+
class TestAlleleFreq:
    """ Tests of ``allele_freq`` for dataframe input grouped by ``SNP_NAME``
    and for string input.
    """

    def test_allele_freq_df_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_data_rand_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        # Any letter makes the column non-numeric, so the random choice does
        # not affect the expected outcome.
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        assert allele_freq(
            data=data_df, id_col="SNP_NAME", seq_col="SNP"
        ) is None

    def test_allele_freq_df_empty(self) -> None:
        with pytest.raises(KeyError):
            allele_freq(
                data=pd.DataFrame(), id_col="SNP_NAME", seq_col="SNP"
            )

    def test_allele_freq_df_empty_only_columns(self) -> None:
        result = allele_freq(
            data=pd.DataFrame(columns=["SNP_NAME", "SNP"]),
            id_col="SNP_NAME",
            seq_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    def test_allele_freq_df_raises(self, data_df: pd.DataFrame) -> None:
        # One ``raises`` block per call: the block is left as soon as the
        # first exception is raised, so calls sharing a block with an earlier
        # failing call would never run.
        with pytest.raises(KeyError):
            allele_freq(data=data_df, id_col="SNP_NAME")

        with pytest.raises(KeyError):
            allele_freq(data=data_df, seq_col="SNP")

        with pytest.raises(KeyError):
            allele_freq(data=data_df)

    @pytest.mark.parametrize("data, obs_value", data_str())
    def test_allele_freq_str(self, data: str, obs_value: float) -> None:
        assert allele_freq(data=data) == obs_value

    def test_allele_freq_non_type(self) -> None:
        assert allele_freq(data=1423) is None
|