snplib 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.2
2
+ Name: snplib
3
+ Version: 1.0.0
4
+ Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
5
+ Author-email: Igor <igor.loschinin@gmail.com>
6
+ License: GNU
7
+ Project-URL: Homepage, https://github.com/IgorekLoschinin/snptools
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: numpy==1.26.1
12
+ Requires-Dist: pandas==2.1.1
13
+ Requires-Dist: six==1.16.0
14
+ Requires-Dist: swifter==1.4.0
15
+ Requires-Dist: xlrd==2.0.1
16
+ Requires-Dist: XlsxWriter==3.1.9
17
+ Requires-Dist: openpyxl==3.1.2
18
+ Requires-Dist: pydantic==2.4.2
19
+ Requires-Dist: pytest==7.4.2
20
+ Requires-Dist: sphinx==7.2.6
21
+ Requires-Dist: sphinx_rtd_theme==1.3.0
22
+
23
+ # snptools
24
+
25
+ **Snptools** is a tool for SNP (Single Nucleotide Polymorphism) data processing,
26
+ parentage calculation and call rate estimation.
27
+
28
+ ## Introduction
29
+
30
+ SNPs (Single Nucleotide Polymorphisms) represent genetic variations that can
31
+ be used to analyze genetic data. SNPTools provides a set of tools for working
32
+ with SNP data, including the following capabilities:
33
+
34
+ - SNP data processing - FinalReport.
35
+ - Parentage Verification and Parentage Discovery Based on SNP Genotypes (ICAR).
36
+ - Call rate estimation (percentage of missing data).
37
+ - Processing and preparation of data in plink formats.
38
+
39
+ ## Installation
40
+
41
+ To install SNPTools, follow the steps below:
42
+
43
+ 1. Clone the repository into your project directory:
44
+ ```
45
+ git clone https://github.com/IgorekLoschinin/snptools.git
46
+ ```
47
+ 2. Install dependencies:
48
+ ```
49
+ pip install -r requirements.txt
50
+ ```
51
+ 3. Use SNPTools:
52
+ ```
53
+ import snptools
54
+ ```
55
+
56
+ ## Usage
57
+ Snptools provides commands for a variety of operations. Here are examples of
58
+ usage:
59
+
60
+ #### SNP data processing:
61
+ ```
62
+ from snptools.finalreport import FinalReport
63
+ ```
64
+
65
+ #### Computation of parentage:
66
+ ```
67
+ from snptools.parentage import Discovery, Verification
68
+ ```
69
+
70
+ #### Preparation format files:
71
+ ```
72
+ from snptools.format import (
73
+ Snp, make_fam, make_ped, make_lgen, make_map
74
+ )
75
+ ```
76
+
77
+ #### Stat:
78
+ ```
79
+ from snptools.statistics import (
80
+ hwe, hwe_test, call_rate, allele_freq, minor_allele_freq
81
+ )
82
+ ```
83
+
84
+ ## Documentation
85
+ Detailed documentation on how to use SNPTools is available in the docs.
86
+
87
+ ## License
88
+ This project is licensed under the GNU General Public License - see the
89
+ LICENSE file for details.
@@ -0,0 +1,36 @@
1
+ finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
2
+ finalreport/_finalreport.py,sha256=el_d8MVmpic3wKCRJ-J52VZYSmMuNSf4p_tmPkgh0Z0,5876
3
+ finalreport/tests/__init__.py,sha256=oNE_1bfqJEIatN_AZEuBAoM7uim_Q1X9DZuLAe1-dDc,174
4
+ finalreport/tests/test_finalreport.py,sha256=uOnV-p2_yhqKEZSoVOt7W4oxJwEyuBVTfpi3iQ2tTp4,6498
5
+ format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
6
+ format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
7
+ format/_plink.py,sha256=Z09IOPACOt3n8CKEVRkE4tLT16I8e_6ZoMaWRxSImrA,10529
8
+ format/_snp.py,sha256=oI-V4-_w28aX-VxoimywLDDnX6owhdjLbqt9a54_ouU,3172
9
+ format/tests/__init__.py,sha256=oNE_1bfqJEIatN_AZEuBAoM7uim_Q1X9DZuLAe1-dDc,174
10
+ format/tests/test_plink_fam.py,sha256=vUcDFIU17ez2S9hIXIxrqjs4OxgWuC8wat6NzG4nwYQ,2757
11
+ format/tests/test_plink_lgen.py,sha256=cw8jeAe74iWI45gZ8Ix8diLWFGeoAoq3LWmHW9Mz8Nw,2205
12
+ format/tests/test_plink_map.py,sha256=DtFgJpYjSdKk5AzDbZW4O9mRu1M43lgZMYMb-bmzEX8,947
13
+ format/tests/test_plink_ped.py,sha256=8SbNs9v9ss8ZDv52csMU1D5uONfhic7_mvuBrgN5Xss,2938
14
+ format/tests/test_snp.py,sha256=xtb2fniEARHoNFvFd9IrnX-QBBEKO6zyDmhaTdJ3Ric,3263
15
+ parentage/__init__.py,sha256=bN3mWTxmaFQ1qzRtyMLaAoxfomz6jnoWa-kmnJ9q_fE,280
16
+ parentage/_discov.py,sha256=qGlNzpl4xKOWRr6-fi1osylzizgiPO8vCus8VE56nec,3180
17
+ parentage/_isagmark.py,sha256=0xi9YhuIpU7zf16HnWw1XIkcQLk4rTNeAeCE-5p9hQE,356
18
+ parentage/_verif.py,sha256=VbX46dC4tl4Qeuw65aRg1s6hSn0FI25Hy9-3U_jxmrg,3019
19
+ parentage/tests/__init__.py,sha256=DvTuot4rW2KfvH-orj-CJdZc1seq3SXhFcrkO39geeQ,172
20
+ parentage/tests/test_discov.py,sha256=gJt1SjYTr7ZaphZayc1oBR9THTFjFxvViTi9a0X4fnI,4150
21
+ parentage/tests/test_verif.py,sha256=JROafzDlyMUBfbXq6iq3EVhEqHkrh9tRB-C0EdOf7aY,3951
22
+ statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw,302
23
+ statistics/_callrate.py,sha256=ghB1EXT5JLQeIEIzh8LjWpqAnhCtCOk6l5ecNMLtQa0,1865
24
+ statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
25
+ statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
26
+ statistics/tests/__init__.py,sha256=DvTuot4rW2KfvH-orj-CJdZc1seq3SXhFcrkO39geeQ,172
27
+ statistics/tests/test_callrate.py,sha256=a-j3muIn5unTH9dZW7xe_v2WU6uIMmj4PvzfWYkQbIs,5653
28
+ statistics/tests/test_freq_allele.py,sha256=7jbxM7MCkUnrAouiuphJ5lGdZq4K72IoIqz7JQdTFds,2586
29
+ statistics/tests/test_freq_maf.py,sha256=IndsZaCUBn7Hql8_ao35Cip78GA_ytDZjYugWXkdPt4,430
30
+ statistics/tests/test_hwe_t.py,sha256=MqoP3DJ-159WTJEbWsAzIVsuGG791PYPm4DjPFoehfw,847
31
+ statistics/tests/test_snphwe.py,sha256=chutPCO7lYYUMvsuDwvuehB6_4oBRkp1OnUJtL52HTE,995
32
+ snplib-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
33
+ snplib-1.0.0.dist-info/METADATA,sha256=E_EU6XriENiCXVP0rHOjnqwQGa19oqnALVKK3glNqg0,2298
34
+ snplib-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
35
+ snplib-1.0.0.dist-info/top_level.txt,sha256=JrsCvtXEC8OFF-STMY1jzbM5YEuA_57kuXXq5aI8dfc,40
36
+ snplib-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,4 @@
1
+ finalreport
2
+ format
3
+ parentage
4
+ statistics
statistics/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._snphwe import hwe, hwe_test
6
+ from ._callrate import call_rate
7
+ from ._freq import allele_freq, minor_allele_freq
8
+
9
+
10
# Public API of the statistics package: the names below are re-exported
# from the private _callrate, _freq and _snphwe modules imported above.
__all__ = [
    "call_rate",
    "allele_freq",
    "minor_allele_freq",
    "hwe",
    "hwe_test"
]
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import pandas as pd
6
+
7
+
8
+ def call_rate(
9
+ data: pd.DataFrame | str,
10
+ id_col: str = None,
11
+ snp_col: str = None
12
+ ) -> pd.DataFrame | float | None:
13
+ """ The call rate for a given SNP is defined as the proportion of
14
+ individuals in the study for which the corresponding SNP information is
15
+ not missing. In the following example, we filter using a call rate of 95%,
16
+ meaning we retain SNPs for which there is less than 5% missing data.
17
+
18
+ Of the say, 54K markers in the chip, 50K have been genotyped for a
19
+ particular animal, the “call rate animal” is 50K/54K=93%
20
+ Of the say, 900 animals genotyped for marker CL635944_160.1, how many
21
+ have actually been successfully read? Assume that 600 have been read, then
22
+ the “call rate marker” is 600/900 = 67%
23
+
24
+ :param data: Pre-processed data on which the call rate is calculated.
25
+ :param id_col: The name of the column with the id of the animals or
26
+ markers.
27
+ :param snp_col: The name of the column with the snp sequence.
28
+ :return: Return dataframe with call rates for each animal if a dataframe
29
+ is transmitted. The number if the snp sequence is passed as a string.
30
+ None if there were errors.
31
+ """
32
+
33
+ if isinstance(data, pd.DataFrame):
34
+ try:
35
+ if data[snp_col].dtype.hasobject:
36
+ if not data[snp_col].str.isdigit().all():
37
+ return None
38
+
39
+ return data[[id_col, snp_col]].\
40
+ groupby(by=id_col)[snp_col].\
41
+ apply(lambda x: 1 - ((x == "5").sum() / len(x))).\
42
+ reset_index()
43
+
44
+ return data[[id_col, snp_col]]. \
45
+ groupby(by=id_col)[snp_col]. \
46
+ apply(lambda x: 1 - ((x == 5).sum() / len(x))). \
47
+ reset_index()
48
+
49
+ except Exception as e:
50
+ raise e
51
+
52
+ elif isinstance(data, str):
53
+ if not data.isdigit():
54
+ return None
55
+
56
+ return round(1 - (data.count('5') / len(data)), 6)
57
+
58
+ else:
59
+ return None
statistics/_freq.py ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import pandas as pd
6
+
7
+
8
+ def allele_freq(
9
+ data: pd.DataFrame | str, id_col: str = None, seq_col: str = None
10
+ ) -> pd.DataFrame | float | None:
11
+ """ The allele frequency represents the incidence of a gene variant in a
12
+ population.
13
+
14
+ :param data: Data array.
15
+ :param id_col: Columns with snp names.
16
+ :param seq_col: Columns with value snp in format ucg - 0, 1, 2, 5.
17
+ :return: Return the alleles frequency.
18
+ """
19
+
20
+ if isinstance(data, pd.DataFrame):
21
+ try:
22
+ if data[seq_col].dtype.hasobject:
23
+ if not data[seq_col].str.isdigit().all():
24
+ return None
25
+
26
+ return data.\
27
+ loc[data[seq_col] != "5", [id_col, seq_col]]. \
28
+ groupby(by=id_col)[seq_col]. \
29
+ apply(lambda x: x.astype("int8").sum() / (2 * x.count())).\
30
+ reset_index().\
31
+ round(3)
32
+
33
+ return data.\
34
+ loc[data[seq_col] != 5, [id_col, seq_col]].\
35
+ groupby(by=id_col)[seq_col].\
36
+ apply(lambda x: x.sum() / (2 * x.count())).\
37
+ reset_index().\
38
+ round(3)
39
+
40
+ except Exception as e:
41
+ raise e
42
+
43
+ elif isinstance(data, str):
44
+ if not data.isdigit():
45
+ return None
46
+
47
+ sam_seq = tuple(
48
+ map(int, filter(lambda x: x if x != "5" else None, data))
49
+ )
50
+ return round(sum(sam_seq) / (2 * len(sam_seq)), 3)
51
+
52
+ else:
53
+ return None
54
+
55
+
56
def minor_allele_freq(value: float) -> float:
    """ Fold an allele frequency onto the minor allele: frequencies above 0.5
    are mirrored to ``1 - value``, everything else is kept as is.

    :param value: Allele frequency
    :return: Return the minor alleles frequency rounded to 3 decimals
    """

    minor = 1 - value if value > 0.5 else value

    return round(minor, 3)
statistics/_snphwe.py ADDED
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ def hwe(
10
+ obs_hets: int | float, obs_hom1: int | float, obs_hom2: int | float
11
+ ) -> float:
12
+ """ Python interpretation hwe - https://github.com/jeremymcrae/snphwe
13
+
14
+ :param obs_hets: Number of observed heterozygotes (AB, BA)
15
+ :param obs_hom1: Number of observed homozygotes1 (AA)
16
+ :param obs_hom2: Number of observed homozygotes2 (BB)
17
+ :return: This is where the p-value is returned
18
+ """
19
+
20
+ obs_hets = round(obs_hets)
21
+ obs_hom1 = round(obs_hom1)
22
+ obs_hom2 = round(obs_hom2)
23
+
24
+ if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
25
+ raise ValueError("snphwe: negative allele count")
26
+
27
+ obs_homr = min(obs_hom1, obs_hom2)
28
+ obs_homc = max(obs_hom1, obs_hom2)
29
+
30
+ rare = 2 * obs_homr + obs_hets
31
+ genotypes = obs_hets + obs_homc + obs_homr
32
+
33
+ if genotypes == 0:
34
+ raise ValueError("snphwe: zero genotypes")
35
+
36
+ probs = np.zeros(round(rare) + 1)
37
+
38
+ # get distribution midpoint, but ensure midpoint and rare alleles have
39
+ # same parity
40
+ mid = int(rare * (2 * genotypes - rare) / (2 * genotypes))
41
+ if mid % 2 != rare % 2:
42
+ mid += 1
43
+
44
+ probs[mid] = 1.0
45
+ _sum = probs[mid]
46
+
47
+ curr_homr = (rare - mid) / 2
48
+ curr_homc = genotypes - mid - curr_homr
49
+ curr_hets = mid
50
+ while curr_hets > 1:
51
+ probs[curr_hets - 2] = (
52
+ probs[curr_hets] * curr_hets * (curr_hets - 1.0)
53
+ / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
54
+ )
55
+ _sum += probs[curr_hets - 2]
56
+
57
+ # fewer heterozygotes -> add one rare, one common homozygote
58
+ curr_homr += 1
59
+ curr_homc += 1
60
+ curr_hets -= 2
61
+
62
+ # calculate probabilities from midpoint up
63
+ curr_homr = (rare - mid) / 2
64
+ curr_homc = genotypes - mid - curr_homr
65
+
66
+ curr_hets = mid
67
+ while curr_hets <= rare - 2:
68
+ probs[curr_hets + 2] = \
69
+ (probs[curr_hets] * 4.0 * curr_homr * curr_homc
70
+ / ((curr_hets + 2.0) * (curr_hets + 1.0)))
71
+ _sum += probs[curr_hets + 2]
72
+
73
+ # add 2 heterozygotes -> subtract one rare, one common homozygote
74
+ curr_homr -= 1
75
+ curr_homc -= 1
76
+ curr_hets += 2
77
+
78
+ # p-value calculation for p_hwe
79
+ target = probs[obs_hets]
80
+ p_hwe = 0.0
81
+
82
+ for p in probs:
83
+ if p <= target:
84
+ p_hwe += p / _sum
85
+
86
+ return min(1.0, p_hwe)
87
+
88
+
89
def hwe_test(
        seq_snp: pd.Series, freq: float, crit_chi2: float = 3.841
) -> bool:
    """ Chi-square check of a SNP against the Hardy-Weinberg equilibrium -
    the principle stating that the genetic variation in a population will
    remain constant from one generation to the next in the absence of
    disturbing factors.
    https://www.nature.com/scitable/definition/hardy-weinberg-equilibrium-122/

    Observed counts of genotypes 0, 1 and 2 are compared with the counts
    expected from ``freq``; genotypes equal to 5 are treated as missing.

    NOTE(review): when an expected count is zero the statistic appears to
    become inf/NaN rather than raising; a NaN statistic compares as "not
    greater" and the SNP is retained - confirm this is intended.

    :param seq_snp: SNP sequence
    :param freq: Allele frequency
    :param crit_chi2: The critical value of the test; with one degree of
        freedom it is 3.84 at p = 0.05
    :return: True to retain the inspected snp, False to exclude it. A
        sequence with a single distinct genotype is always retained.
    """

    called = seq_snp.replace(5, np.nan)

    # Monomorphic SNP: nothing to test.
    if called.nunique() == 1:
        return True

    n_called = called.count()
    other_freq = 1 - freq

    # (genotype code, expected number of animals carrying it)
    expectations = (
        (0, (other_freq ** 2) * n_called),
        (1, (2 * (other_freq * freq)) * n_called),
        (2, (freq ** 2) * n_called),
    )

    chi = sum(
        (((called == code).sum() - exp) ** 2) / exp
        for code, exp in expectations
    )

    # Written as a negation so that a NaN statistic still yields True.
    return not chi > crit_chi2
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from pathlib import Path
6
+
7
# Directory with the data files used by the tests of this package; it sits
# next to this module.
DIR_DATA = Path(__file__).parent / "data"
@@ -0,0 +1,171 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_DATA
6
+ from .. import call_rate
7
+
8
+ import pytest
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+
13
@pytest.fixture
def data_df(request) -> pd.DataFrame:
    """Load the pickled frame selected through indirect parametrization.

    ``"cra"`` selects ``cr/file_cra.pl`` and ``"crm"`` selects
    ``cr/file_crm.pl``; any other parameter yields None.
    """
    dataset = request.param

    if dataset == "cra":
        return pd.read_pickle(DIR_DATA / "cr/file_cra.pl")

    if dataset == "crm":
        return pd.read_pickle(DIR_DATA / "cr/file_crm.pl")

    return None
21
+
22
+
23
@pytest.fixture
def data_str() -> list[str]:
    """Two digit-coded SNP strings, each 17 genotypes long with two '5's."""
    sequences = ['02011015010000500', '01110152120222512']
    return sequences
26
+
27
+
28
class TestCallRateAnimal(object):
    """Call rate grouped by animal (``SAMPLE_ID``) and for plain strings."""

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.882353]).all()

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_dtype_random_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        # Any nucleotide letter fails the digit check, so the random choice
        # does not affect the outcome.
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        result = call_rate(data=data_df, id_col="SAMPLE_ID", snp_col="SNP")

        assert result is None

    def test_cra_datafame_empty1(self) -> None:
        with pytest.raises(KeyError):
            call_rate(data=pd.DataFrame(), id_col="SAMPLE_ID", snp_col="SNP")

    def test_cra_datafame_empty2(self) -> None:
        result = call_rate(
            data=pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            id_col="SAMPLE_ID",
            snp_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    @pytest.mark.parametrize("data_df", ["cra"], indirect=True)
    def test_cra_datafame_fail(self, data_df: pd.DataFrame) -> None:
        # One context per call: a ``pytest.raises`` block ends at the first
        # exception, so calls sharing a block after the first never run.
        with pytest.raises(KeyError):
            call_rate(data=data_df, id_col="SAMPLE_ID")

        with pytest.raises(KeyError):
            call_rate(data=data_df, snp_col="SNP")

        with pytest.raises(KeyError):
            call_rate(data=data_df)

    def test_cra_str_int(self, data_str: list[str]) -> None:
        for sequence in data_str:
            assert call_rate(data=sequence) == 0.882353

    def test_cra_str_simbols(self) -> None:
        data_str = ['GCATGAGGTATACTCTA', 'CGCCATGCTGTATATCC']

        for sequence in data_str:
            assert call_rate(data=sequence) is None

    def test_cra_str_empty(self) -> None:
        assert call_rate(data="") is None

    def test_cra_str_mixid(self) -> None:
        assert call_rate(data="GCATGAG3G4T6A67TACTCTA") is None
101
+
102
+
103
class TestCallRateMarker(object):
    """Call rate grouped by marker (``SNP_NAME``) and for plain strings."""

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.727273, 0.909091, 0.818182]).all()

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_dtype_random_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        # Any nucleotide letter fails the digit check, so the random choice
        # does not affect the outcome.
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        result = call_rate(data=data_df, id_col="SNP_NAME", snp_col="SNP")

        assert result is None

    def test_crm_datafame_empty1(self) -> None:
        with pytest.raises(KeyError):
            call_rate(data=pd.DataFrame(), id_col="SNP_NAME", snp_col="SNP")

    def test_crm_datafame_empty2(self) -> None:
        result = call_rate(
            data=pd.DataFrame(columns=["SNP_NAME", "SNP"]),
            id_col="SNP_NAME",
            snp_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    @pytest.mark.parametrize("data_df", ["crm"], indirect=True)
    def test_crm_datafame_fail(self, data_df: pd.DataFrame) -> None:
        # One context per call: a ``pytest.raises`` block ends at the first
        # exception, so calls sharing a block after the first never run.
        with pytest.raises(KeyError):
            call_rate(data=data_df, id_col="SNP_NAME")

        with pytest.raises(KeyError):
            call_rate(data=data_df, snp_col="SNP")

        with pytest.raises(KeyError):
            call_rate(data=data_df)

    def test_crm_str_simbols(self) -> None:
        data_str = ['GCATGAGGTATACTCTA', 'CGCCATGCTGTATATCC']

        for sequence in data_str:
            assert call_rate(data=sequence) is None

    def test_crm_str_empty(self) -> None:
        assert call_rate(data="") is None

    def test_crm_str_mixid(self) -> None:
        assert call_rate(data="GCATGAG3G4T6A67TACTCTA") is None
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_DATA
6
+ from .. import allele_freq
7
+
8
+ import pytest
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+
13
@pytest.fixture
def data_df() -> pd.DataFrame:
    """Load the pickled frame used by the allele frequency tests.

    The tests in this module expect the frequencies 0.0, 0.9 and 0.889
    from it.
    """
    frame = pd.read_pickle(DIR_DATA / "freq/file.pl")
    return frame
17
+
18
+
19
def data_str() -> list[tuple]:
    """Return ``(sequence, expected allele frequency)`` pairs that feed the
    parametrized string test."""
    sequences = ('2212120', '02011015010000500', '01110152120222512')
    expected = (0.714, 0.2, 0.6)

    return list(zip(sequences, expected))
25
+
26
+
27
class TestAlleleFreq(object):
    """Allele frequency grouped by ``SNP_NAME`` and for plain strings."""

    def test_allele_freq_df_dtype_obj(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype(str)
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_dtype_int(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("int8")
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_dtype_float(self, data_df: pd.DataFrame) -> None:
        data_df.SNP = data_df.SNP.astype("float32")
        result = allele_freq(data=data_df, id_col="SNP_NAME", seq_col="SNP")

        assert isinstance(result, pd.DataFrame) and not result.empty
        assert result.SNP.round(6).isin([0.000, 0.900, 0.889]).all()

    def test_allele_freq_df_data_rand_simbols(
            self, data_df: pd.DataFrame
    ) -> None:
        # Any nucleotide letter fails the digit check, so the random choice
        # does not affect the outcome.
        data_df.SNP = [
            np.random.choice(["A", "C", "G", "T"])
            for _ in range(data_df.SNP.shape[0])
        ]
        assert allele_freq(
            data=data_df, id_col="SNP_NAME", seq_col="SNP"
        ) is None

    def test_allele_freq_df_empty(self) -> None:
        with pytest.raises(KeyError):
            allele_freq(
                data=pd.DataFrame(), id_col="SNP_NAME", seq_col="SNP"
            )

    def test_allele_freq_df_empty_only_columns(self) -> None:
        result = allele_freq(
            data=pd.DataFrame(columns=["SNP_NAME", "SNP"]),
            id_col="SNP_NAME",
            seq_col="SNP"
        )

        assert isinstance(result, pd.DataFrame) and result.empty

    def test_allele_freq_df_raises(self, data_df: pd.DataFrame) -> None:
        # One context per call: a ``pytest.raises`` block ends at the first
        # exception, so calls sharing a block after the first never run.
        with pytest.raises(KeyError):
            allele_freq(data=data_df, id_col="SNP_NAME")

        with pytest.raises(KeyError):
            allele_freq(data=data_df, seq_col="SNP")

        with pytest.raises(KeyError):
            allele_freq(data=data_df)

    @pytest.mark.parametrize("data, obs_value", data_str())
    def test_allele_freq_str(self, data: str, obs_value: float) -> None:
        assert allele_freq(data=data) == obs_value

    def test_allele_freq_non_type(self) -> None:
        assert allele_freq(data=1423) is None