snplib 1.0.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.2
2
+ Name: snplib
3
+ Version: 1.0.0
4
+ Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
5
+ Author-email: Igor <igor.loschinin@gmail.com>
6
+ License: GNU
7
+ Project-URL: Homepage, https://github.com/IgorekLoschinin/snptools
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: numpy==1.26.1
12
+ Requires-Dist: pandas==2.1.1
13
+ Requires-Dist: six==1.16.0
14
+ Requires-Dist: swifter==1.4.0
15
+ Requires-Dist: xlrd==2.0.1
16
+ Requires-Dist: XlsxWriter==3.1.9
17
+ Requires-Dist: openpyxl==3.1.2
18
+ Requires-Dist: pydantic==2.4.2
19
+ Requires-Dist: pytest==7.4.2
20
+ Requires-Dist: sphinx==7.2.6
21
+ Requires-Dist: sphinx_rtd_theme==1.3.0
22
+
23
+ # snptools
24
+
25
+ **Snptools** is a tool for SNP (Single Nucleotide Polymorphism) data processing,
26
+ parentage calculation and call rate estimation.
27
+
28
+ ## Introduction
29
+
30
+ SNPs (Single Nucleotide Polymorphisms) represent genetic variations that can
31
+ be used to analyze genetic data. SNPTools provides a set of tools for working
32
+ with SNP data, including the following capabilities:
33
+
34
+ - SNP data processing - FinalReport.
35
+ - Parentage Verification and Parentage Discovery Based on SNP Genotypes (ICAR).
36
+ - Call rate estimation (percentage of missing data).
37
+ - Processing and preparation of data in plink formats.
38
+
39
+ ## Installation
40
+
41
+ To install SNPTools, follow the steps below:
42
+
43
+ 1. Clone the repository into your project directory:
44
+ ```
45
+ git clone https://github.com/IgorekLoschinin/snptools.git
46
+ ```
47
+ 2. Install dependencies:
48
+ ```
49
+ pip install -r requirements.txt
50
+ ```
51
+ 3. Use SNPTools:
52
+ ```
53
+ import snptools
54
+ ```
55
+
56
+ ## Usage
57
+ Snptools provides commands for a variety of operations. Here are examples of
58
+ usage:
59
+
60
+ #### SNP data processing:
61
+ ```
62
+ from snptools.finalreport import FinalReport
63
+ ```
64
+
65
+ #### Computation of parentage:
66
+ ```
67
+ from snptools.parentage import Discovery, Verification
68
+ ```
69
+
70
+ #### Preparation format files:
71
+ ```
72
+ from snptools.format import (
73
+ Snp, make_fam, make_ped, make_lgen, make_map
74
+ )
75
+ ```
76
+
77
+ #### Stat:
78
+ ```
79
+ from snptools.statistics import (
80
+ hwe, hwe_test, call_rate, allele_freq, minor_allele_freq
81
+ )
82
+ ```
83
+
84
+ ## Documentation
85
+ Detailed documentation on how to use SNPTools is available in the docs.
86
+
87
+ ## License
88
+ This project is licensed under the GNU General Public License - see the
89
+ LICENSE file for details.
@@ -0,0 +1,36 @@
1
+ finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
2
+ finalreport/_finalreport.py,sha256=el_d8MVmpic3wKCRJ-J52VZYSmMuNSf4p_tmPkgh0Z0,5876
3
+ finalreport/tests/__init__.py,sha256=oNE_1bfqJEIatN_AZEuBAoM7uim_Q1X9DZuLAe1-dDc,174
4
+ finalreport/tests/test_finalreport.py,sha256=uOnV-p2_yhqKEZSoVOt7W4oxJwEyuBVTfpi3iQ2tTp4,6498
5
+ format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
6
+ format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
7
+ format/_plink.py,sha256=Z09IOPACOt3n8CKEVRkE4tLT16I8e_6ZoMaWRxSImrA,10529
8
+ format/_snp.py,sha256=oI-V4-_w28aX-VxoimywLDDnX6owhdjLbqt9a54_ouU,3172
9
+ format/tests/__init__.py,sha256=oNE_1bfqJEIatN_AZEuBAoM7uim_Q1X9DZuLAe1-dDc,174
10
+ format/tests/test_plink_fam.py,sha256=vUcDFIU17ez2S9hIXIxrqjs4OxgWuC8wat6NzG4nwYQ,2757
11
+ format/tests/test_plink_lgen.py,sha256=cw8jeAe74iWI45gZ8Ix8diLWFGeoAoq3LWmHW9Mz8Nw,2205
12
+ format/tests/test_plink_map.py,sha256=DtFgJpYjSdKk5AzDbZW4O9mRu1M43lgZMYMb-bmzEX8,947
13
+ format/tests/test_plink_ped.py,sha256=8SbNs9v9ss8ZDv52csMU1D5uONfhic7_mvuBrgN5Xss,2938
14
+ format/tests/test_snp.py,sha256=xtb2fniEARHoNFvFd9IrnX-QBBEKO6zyDmhaTdJ3Ric,3263
15
+ parentage/__init__.py,sha256=bN3mWTxmaFQ1qzRtyMLaAoxfomz6jnoWa-kmnJ9q_fE,280
16
+ parentage/_discov.py,sha256=qGlNzpl4xKOWRr6-fi1osylzizgiPO8vCus8VE56nec,3180
17
+ parentage/_isagmark.py,sha256=0xi9YhuIpU7zf16HnWw1XIkcQLk4rTNeAeCE-5p9hQE,356
18
+ parentage/_verif.py,sha256=VbX46dC4tl4Qeuw65aRg1s6hSn0FI25Hy9-3U_jxmrg,3019
19
+ parentage/tests/__init__.py,sha256=DvTuot4rW2KfvH-orj-CJdZc1seq3SXhFcrkO39geeQ,172
20
+ parentage/tests/test_discov.py,sha256=gJt1SjYTr7ZaphZayc1oBR9THTFjFxvViTi9a0X4fnI,4150
21
+ parentage/tests/test_verif.py,sha256=JROafzDlyMUBfbXq6iq3EVhEqHkrh9tRB-C0EdOf7aY,3951
22
+ statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw,302
23
+ statistics/_callrate.py,sha256=ghB1EXT5JLQeIEIzh8LjWpqAnhCtCOk6l5ecNMLtQa0,1865
24
+ statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
25
+ statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
26
+ statistics/tests/__init__.py,sha256=DvTuot4rW2KfvH-orj-CJdZc1seq3SXhFcrkO39geeQ,172
27
+ statistics/tests/test_callrate.py,sha256=a-j3muIn5unTH9dZW7xe_v2WU6uIMmj4PvzfWYkQbIs,5653
28
+ statistics/tests/test_freq_allele.py,sha256=7jbxM7MCkUnrAouiuphJ5lGdZq4K72IoIqz7JQdTFds,2586
29
+ statistics/tests/test_freq_maf.py,sha256=IndsZaCUBn7Hql8_ao35Cip78GA_ytDZjYugWXkdPt4,430
30
+ statistics/tests/test_hwe_t.py,sha256=MqoP3DJ-159WTJEbWsAzIVsuGG791PYPm4DjPFoehfw,847
31
+ statistics/tests/test_snphwe.py,sha256=chutPCO7lYYUMvsuDwvuehB6_4oBRkp1OnUJtL52HTE,995
32
+ snplib-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
+ snplib-1.0.0.dist-info/METADATA,sha256=E_EU6XriENiCXVP0rHOjnqwQGa19oqnALVKK3glNqg0,2298
34
+ snplib-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
35
+ snplib-1.0.0.dist-info/top_level.txt,sha256=JrsCvtXEC8OFF-STMY1jzbM5YEuA_57kuXXq5aI8dfc,40
36
+ snplib-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,4 @@
1
+ finalreport
2
+ format
3
+ parentage
4
+ statistics
statistics/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._snphwe import hwe, hwe_test
6
+ from ._callrate import call_rate
7
+ from ._freq import allele_freq, minor_allele_freq
8
+
9
+
10
+ __all__ = [
11
+ "call_rate",
12
+ "allele_freq",
13
+ "minor_allele_freq",
14
+ "hwe",
15
+ "hwe_test"
16
+ ]
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import pandas as pd
6
+
7
+
8
+ def call_rate(
9
+ data: pd.DataFrame | str,
10
+ id_col: str = None,
11
+ snp_col: str = None
12
+ ) -> pd.DataFrame | float | None:
13
+ """ The call rate for a given SNP is defined as the proportion of
14
+ individuals in the study for which the corresponding SNP information is
15
+ not missing. In the following example, we filter using a call rate of 95%,
16
+ meaning we retain SNPs for which there is less than 5% missing data.
17
+
18
+ Of the say, 54K markers in the chip, 50K have been genotyped for a
19
+ particular animal, the “call rate animal” is 50K/54K=93%
20
+ Of the say, 900 animals genotyped for marker CL635944_160.1, how many
21
+ have actually been successfully read? Assume that 600 have been read, then
22
+ the “call rate marker” is 600/900 = 67%
23
+
24
+ :param data: Pre-processed data on which the call rate is calculated.
25
+ :param id_col: The name of the column with the id of the animals or
26
+ markers.
27
+ :param snp_col: The name of the column with the snp sequence.
28
+ :return: Return dataframe with call rates for each animal if a dataframe
29
+ is transmitted. The number if the snp sequence is passed as a string.
30
+ None if there were errors.
31
+ """
32
+
33
+ if isinstance(data, pd.DataFrame):
34
+ try:
35
+ if data[snp_col].dtype.hasobject:
36
+ if not data[snp_col].str.isdigit().all():
37
+ return None
38
+
39
+ return data[[id_col, snp_col]].\
40
+ groupby(by=id_col)[snp_col].\
41
+ apply(lambda x: 1 - ((x == "5").sum() / len(x))).\
42
+ reset_index()
43
+
44
+ return data[[id_col, snp_col]]. \
45
+ groupby(by=id_col)[snp_col]. \
46
+ apply(lambda x: 1 - ((x == 5).sum() / len(x))). \
47
+ reset_index()
48
+
49
+ except Exception as e:
50
+ raise e
51
+
52
+ elif isinstance(data, str):
53
+ if not data.isdigit():
54
+ return None
55
+
56
+ return round(1 - (data.count('5') / len(data)), 6)
57
+
58
+ else:
59
+ return None
statistics/_freq.py ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import pandas as pd
6
+
7
+
8
+ def allele_freq(
9
+ data: pd.DataFrame | str, id_col: str = None, seq_col: str = None
10
+ ) -> pd.DataFrame | float | None:
11
+ """ The allele frequency represents the incidence of a gene variant in a
12
+ population.
13
+
14
+ :param data: Data array.
15
+ :param id_col: Columns with snp names.
16
+ :param seq_col: Columns with value snp in format ucg - 0, 1, 2, 5.
17
+ :return: Return the alleles frequency.
18
+ """
19
+
20
+ if isinstance(data, pd.DataFrame):
21
+ try:
22
+ if data[seq_col].dtype.hasobject:
23
+ if not data[seq_col].str.isdigit().all():
24
+ return None
25
+
26
+ return data.\
27
+ loc[data[seq_col] != "5", [id_col, seq_col]]. \
28
+ groupby(by=id_col)[seq_col]. \
29
+ apply(lambda x: x.astype("int8").sum() / (2 * x.count())).\
30
+ reset_index().\
31
+ round(3)
32
+
33
+ return data.\
34
+ loc[data[seq_col] != 5, [id_col, seq_col]].\
35
+ groupby(by=id_col)[seq_col].\
36
+ apply(lambda x: x.sum() / (2 * x.count())).\
37
+ reset_index().\
38
+ round(3)
39
+
40
+ except Exception as e:
41
+ raise e
42
+
43
+ elif isinstance(data, str):
44
+ if not data.isdigit():
45
+ return None
46
+
47
+ sam_seq = tuple(
48
+ map(int, filter(lambda x: x if x != "5" else None, data))
49
+ )
50
+ return round(sum(sam_seq) / (2 * len(sam_seq)), 3)
51
+
52
+ else:
53
+ return None
54
+
55
+
56
def minor_allele_freq(value: float) -> float:
    """ Convert an allele frequency into the frequency of the minor allele.

    A frequency above 0.5 is mirrored to ``1 - value``, any other frequency
    is kept as it is.

    :param value: Allele frequency
    :return: Return the minor allele frequency rounded to 3 digits
    """

    minor = 1 - value if value > 0.5 else value

    return round(minor, 3)
statistics/_snphwe.py ADDED
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ def hwe(
10
+ obs_hets: int | float, obs_hom1: int | float, obs_hom2: int | float
11
+ ) -> float:
12
+ """ Python interpretation hwe - https://github.com/jeremymcrae/snphwe
13
+
14
+ :param obs_hets: Number of observed heterozygotes (AB, BA)
15
+ :param obs_hom1: Number of observed homozygotes1 (AA)
16
+ :param obs_hom2: Number of observed homozygotes2 (BB)
17
+ :return: This is where the p-value is returned
18
+ """
19
+
20
+ obs_hets = round(obs_hets)
21
+ obs_hom1 = round(obs_hom1)
22
+ obs_hom2 = round(obs_hom2)
23
+
24
+ if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
25
+ raise ValueError("snphwe: negative allele count")
26
+
27
+ obs_homr = min(obs_hom1, obs_hom2)
28
+ obs_homc = max(obs_hom1, obs_hom2)
29
+
30
+ rare = 2 * obs_homr + obs_hets
31
+ genotypes = obs_hets + obs_homc + obs_homr
32
+
33
+ if genotypes == 0:
34
+ raise ValueError("snphwe: zero genotypes")
35
+
36
+ probs = np.zeros(round(rare) + 1)
37
+
38
+ # get distribution midpoint, but ensure midpoint and rare alleles have
39
+ # same parity
40
+ mid = int(rare * (2 * genotypes - rare) / (2 * genotypes))
41
+ if mid % 2 != rare % 2:
42
+ mid += 1
43
+
44
+ probs[mid] = 1.0
45
+ _sum = probs[mid]
46
+
47
+ curr_homr = (rare - mid) / 2
48
+ curr_homc = genotypes - mid - curr_homr
49
+ curr_hets = mid
50
+ while curr_hets > 1:
51
+ probs[curr_hets - 2] = (
52
+ probs[curr_hets] * curr_hets * (curr_hets - 1.0)
53
+ / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
54
+ )
55
+ _sum += probs[curr_hets - 2]
56
+
57
+ # fewer heterozygotes -> add one rare, one common homozygote
58
+ curr_homr += 1
59
+ curr_homc += 1
60
+ curr_hets -= 2
61
+
62
+ # calculate probabilities from midpoint up
63
+ curr_homr = (rare - mid) / 2
64
+ curr_homc = genotypes - mid - curr_homr
65
+
66
+ curr_hets = mid
67
+ while curr_hets <= rare - 2:
68
+ probs[curr_hets + 2] = \
69
+ (probs[curr_hets] * 4.0 * curr_homr * curr_homc
70
+ / ((curr_hets + 2.0) * (curr_hets + 1.0)))
71
+ _sum += probs[curr_hets + 2]
72
+
73
+ # add 2 heterozygotes -> subtract one rare, one common homozygote
74
+ curr_homr -= 1
75
+ curr_homc -= 1
76
+ curr_hets += 2
77
+
78
+ # p-value calculation for p_hwe
79
+ target = probs[obs_hets]
80
+ p_hwe = 0.0
81
+
82
+ for p in probs:
83
+ if p <= target:
84
+ p_hwe += p / _sum
85
+
86
+ return min(1.0, p_hwe)
87
+
88
+
89
def hwe_test(
    seq_snp: pd.Series, freq: float, crit_chi2: float = 3.841
) -> bool:
    """ The Hardy-Weinberg equilibrium is a principle stating that the genetic
    variation in a population will remain constant from one generation to the
    next in the absence of disturbing factors.
    https://www.nature.com/scitable/definition/hardy-weinberg-equilibrium-122/

    The observed numbers of the genotypes 0, 1 and 2 are compared with the
    numbers expected from the allele frequency by means of a chi-square
    statistic. The value 5 is treated as a missing genotype.

    :param seq_snp: SNP sequence
    :param freq: Allele frequency
    :param crit_chi2: The critical value of the statistic; the default 3.841
        corresponds to one degree of freedom at p = 0.05
    :return: False if the statistic exceeds the critical value, otherwise
        True. True is also returned straight away when the sequence holds a
        single distinct genotype.
    """

    called = seq_snp.replace(5, np.nan)

    # A sequence with one distinct genotype is accepted without the test
    if called.nunique() == 1:
        return True

    n_called = called.count()

    observed = tuple((called == code).sum() for code in (0, 1, 2))
    expected = (
        ((1 - freq) ** 2) * n_called,
        (2 * ((1 - freq) * freq)) * n_called,
        (freq ** 2) * n_called,
    )

    chi = sum([
        ((n_obs - n_exp) ** 2) / n_exp
        for n_obs, n_exp in zip(observed, expected)
    ])

    # Written as a negation, so a statistic that cannot be compared (NaN)
    # leads to True
    return not (chi > crit_chi2)
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from pathlib import Path
6
+
7
# Directory "data" located next to this file
DIR_DATA = Path(__file__).parent / "data"
@@ -0,0 +1,171 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_DATA
6
+ from .. import call_rate
7
+
8
+ import pytest
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+
13
@pytest.fixture
def data_df(request) -> pd.DataFrame:
    """ Load the pickled table selected by the fixture parameter.

    "cra" reads ``cr/file_cra.pl`` and "crm" reads ``cr/file_crm.pl`` from
    the data directory. Any other parameter yields None.
    """

    kind = request.param

    if kind == "cra":
        return pd.read_pickle(DIR_DATA / "cr/file_cra.pl")

    if kind == "crm":
        return pd.read_pickle(DIR_DATA / "cr/file_crm.pl")

    return None
21
+
22
+
23
@pytest.fixture
def data_str() -> list[str]:
    """ Two snp sequences of 17 genotype codes with two codes "5" each. """

    first_sequence = '02011015010000500'
    second_sequence = '01110152120222512'

    return [first_sequence, second_sequence]
26
+
27
+
28
class TestCallRateAnimal:
    """ Checks of ``call_rate`` with the rows grouped by ``SAMPLE_ID``. """

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_random_simbols(
        self, data_df: pd.DataFrame
    ) -> None:
        # Letters are not genotype codes, so no call rate is expected
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert result is None

    def test_cra_datafame_empty1(self) -> None:
        with pytest.raises(KeyError):
            call_rate(data=pd.DataFrame(), id_col="SAMPLE_ID", snp_col="SNP")

    def test_cra_datafame_empty2(self) -> None:
        result = call_rate(
            data=pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            id_col="SAMPLE_ID",
            snp_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    # Every combination of missing column names is a separate case: inside
    # one "pytest.raises" block only the first call is ever executed.
    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    @pytest.mark.parametrize(
        "columns",
        [{"id_col": "SAMPLE_ID"}, {"snp_col": "SNP"}, {}],
        ids=["without_snp_col", "without_id_col", "without_columns"]
    )
    def test_cra_datafame_fail(
        self, data_df: pd.DataFrame, columns: dict
    ) -> None:
        with pytest.raises(KeyError):
            call_rate(data=data_df, **columns)

    def test_cra_str_int(self, data_str: list[str]) -> None:
        for sequence in data_str:
            assert call_rate(data=sequence) == 0.882353

    def test_cra_str_simbols(self) -> None:
        data_str = ['GCATGAGGTATACTCTA', 'CGCCATGCTGTATATCC']

        for sequence in data_str:
            assert call_rate(data=sequence) is None

    def test_cra_str_empty(self) -> None:
        assert call_rate(data="") is None

    def test_cra_str_mixid(self) -> None:
        assert call_rate(data="GCATGAG3G4T6A67TACTCTA") is None
101
+
102
+
103
class TestCallRateMarker:
    """ Checks of ``call_rate`` with the rows grouped by ``SNP_NAME``. """

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_random_simbols(
        self, data_df: pd.DataFrame
    ) -> None:
        # Letters are not genotype codes, so no call rate is expected
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert result is None

    def test_crm_datafame_empty1(self) -> None:
        with pytest.raises(KeyError):
            call_rate(data=pd.DataFrame(), id_col="SNP_NAME", snp_col="SNP")

    def test_crm_datafame_empty2(self) -> None:
        result = call_rate(
            data=pd.DataFrame(columns=["SNP_NAME", "SNP"]),
            id_col="SNP_NAME",
            snp_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    # Every combination of missing column names is a separate case: inside
    # one "pytest.raises" block only the first call is ever executed.
    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    @pytest.mark.parametrize(
        "columns",
        [{"id_col": "SNP_NAME"}, {"snp_col": "SNP"}, {}],
        ids=["without_snp_col", "without_id_col", "without_columns"]
    )
    def test_crm_datafame_fail(
        self, data_df: pd.DataFrame, columns: dict
    ) -> None:
        with pytest.raises(KeyError):
            call_rate(data=data_df, **columns)

    def test_crm_str_simbols(self) -> None:
        data_str = ['GCATGAGGTATACTCTA', 'CGCCATGCTGTATATCC']

        for sequence in data_str:
            assert call_rate(data=sequence) is None

    def test_crm_str_empty(self) -> None:
        assert call_rate(data="") is None

    def test_crm_str_mixid(self) -> None:
        assert call_rate(data="GCATGAG3G4T6A67TACTCTA") is None
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_DATA
6
+ from .. import allele_freq
7
+
8
+ import pytest
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+
13
@pytest.fixture
def data_df() -> pd.DataFrame:
    """ Load the pickled table ``freq/file.pl`` from the data directory.

    The frequencies looked for in this table by the tests of the module
    are 0.0, 0.9 and 0.889.
    """

    pickle_path = DIR_DATA / "freq/file.pl"

    return pd.read_pickle(pickle_path)
17
+
18
+
19
def data_str() -> list[tuple]:
    """ Pairs of a snp sequence and the allele frequency expected for it. """

    sequences = ('2212120', '02011015010000500', '01110152120222512')
    frequencies = (0.714, 0.2, 0.6)

    return list(zip(sequences, frequencies))
25
+
26
+
27
class TestAlleleFreq:
    """ Checks of ``allele_freq`` for dataframes, strings and other input. """

    def test_allele_freq_df_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_data_rand_simbols(
        self, data_df: pd.DataFrame
    ) -> None:
        # Letters are not genotype codes, so no frequency is expected
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        assert allele_freq(
            data=data_df, id_col="SNP_NAME", seq_col="SNP"
        ) is None

    def test_allele_freq_df_empty(self) -> None:
        with pytest.raises(KeyError):
            allele_freq(
                data=pd.DataFrame(), id_col="SNP_NAME", seq_col="SNP"
            )

    def test_allele_freq_df_empty_only_columns(self) -> None:
        result = allele_freq(
            data=pd.DataFrame(columns=["SNP_NAME", "SNP"]),
            id_col="SNP_NAME",
            seq_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    # Every combination of missing column names is a separate case: inside
    # one "pytest.raises" block only the first call is ever executed.
    @pytest.mark.parametrize(
        "columns",
        [{"id_col": "SNP_NAME"}, {"seq_col": "SNP"}, {}],
        ids=["without_seq_col", "without_id_col", "without_columns"]
    )
    def test_allele_freq_df_raises(
        self, data_df: pd.DataFrame, columns: dict
    ) -> None:
        with pytest.raises(KeyError):
            allele_freq(data=data_df, **columns)

    @pytest.mark.parametrize("data, obs_value", data_str())
    def test_allele_freq_str(self, data: str, obs_value: float) -> None:
        assert allele_freq(data=data) == obs_value

    def test_allele_freq_non_type(self) -> None:
        assert allele_freq(data=1423) is None