snplib 1.0.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- finalreport/__init__.py +7 -0
- finalreport/_finalreport.py +251 -0
- finalreport/tests/__init__.py +7 -0
- finalreport/tests/test_finalreport.py +215 -0
- format/__init__.py +19 -0
- format/__settings.py +7 -0
- format/_plink.py +305 -0
- format/_snp.py +113 -0
- format/tests/__init__.py +7 -0
- format/tests/test_plink_fam.py +121 -0
- format/tests/test_plink_lgen.py +106 -0
- format/tests/test_plink_map.py +42 -0
- format/tests/test_plink_ped.py +136 -0
- format/tests/test_snp.py +128 -0
- parentage/__init__.py +15 -0
- parentage/_discov.py +102 -0
- parentage/_isagmark.py +15 -0
- parentage/_verif.py +91 -0
- parentage/tests/__init__.py +7 -0
- parentage/tests/test_discov.py +164 -0
- parentage/tests/test_verif.py +160 -0
- snplib-1.0.0.dist-info/LICENSE +674 -0
- snplib-1.0.0.dist-info/METADATA +89 -0
- snplib-1.0.0.dist-info/RECORD +36 -0
- snplib-1.0.0.dist-info/WHEEL +5 -0
- snplib-1.0.0.dist-info/top_level.txt +4 -0
- statistics/__init__.py +16 -0
- statistics/_callrate.py +59 -0
- statistics/_freq.py +67 -0
- statistics/_snphwe.py +132 -0
- statistics/tests/__init__.py +7 -0
- statistics/tests/test_callrate.py +171 -0
- statistics/tests/test_freq_allele.py +87 -0
- statistics/tests/test_freq_maf.py +17 -0
- statistics/tests/test_hwe_t.py +41 -0
- statistics/tests/test_snphwe.py +41 -0
@@ -0,0 +1,136 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_FILES
|
6
|
+
from .. import make_ped
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data_ped(request) -> pd.DataFrame | None:
    """Load the pickled PED test frame selected by the indirect parameter.

    ``request.param`` is the file name supplied through
    ``pytest.mark.parametrize(..., indirect=True)``; it is joined onto
    ``DIR_FILES`` under ``fplink/ped/`` and unpickled with pandas.
    """
    pickle_path = DIR_FILES / f"fplink/ped/{request.param}"
    return pd.read_pickle(pickle_path)
|
15
|
+
|
16
|
+
|
17
|
+
class TestPlinkFormatPed(object):
    """Checks ``make_ped`` on pickled fixture frames and on empty frames."""

    @pytest.mark.parametrize("data_ped", ["file.pl"], indirect=True)
    def test_ped_true(self, data_ped: pd.DataFrame) -> None:
        """Populated input gives a non-empty result, with or without fid_col."""
        assert not make_ped(
            data_ped,
            "SAMPLE_ID",
            "SNP",
            fid_col="SAMPLE_ID"
        ).empty

        assert not make_ped(
            data_ped,
            "SAMPLE_ID",
            "SNP"
        ).empty

    def test_ped_empty(self) -> None:
        """An empty frame with the expected columns gives an empty result."""
        assert make_ped(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
            "SNP"
        ).empty

        assert make_ped(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
            "SNP",
            fid_col="SAMPLE_ID"
        ).empty

    @pytest.mark.parametrize("data_ped", ["file.pl"], indirect=True)
    def test_ped_raise_columns(self, data_ped: pd.DataFrame) -> None:
        """Unknown sample / SNP / family column names raise ``KeyError``."""
        # SID_COL
        with pytest.raises(
            KeyError, match="Data has not in name columns!"
        ):
            make_ped(
                data=data_ped,
                sid_col="SAMPLE_ID1",
                fid_col="SAMPLE_ID",
                snp_col="SNP"
            )

        # SNP_COL
        with pytest.raises(
            KeyError, match="Data has not in name columns!"
        ):
            make_ped(
                data_ped,
                "SAMPLE_ID",
                "SNP1",
                fid_col="SAMPLE_ID"
            )

        # FID_COL
        with pytest.raises(
            KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_ped(
                data_ped,
                "SAMPLE_ID",
                "SNP",
                fid_col="SAMPLE_ID1"
            )

    @pytest.mark.parametrize("data_ped", ["file2.pl"], indirect=True)
    def test_ped_raises_underscope_sid(self, data_ped: pd.DataFrame) -> None:
        """``make_ped`` rejects the ``file2.pl`` frame with the 'Sample ID' error.

        NOTE(review): presumably ``file2.pl`` holds sample ids containing
        '_' -- confirm against the fixture file.
        """
        # SID_COL
        # The return value is irrelevant here: the call is expected to raise.
        with pytest.raises(
            Exception,
            match="Replace in 'Sample ID' columns '_' on another a simbols"
        ):
            make_ped(
                data_ped,
                "SAMPLE_ID",
                "SNP"
            )

    @pytest.mark.parametrize("data_ped", ["file3.pl"], indirect=True)
    def test_ped_raises_underscope_fid(self, data_ped: pd.DataFrame) -> None:
        """``make_ped`` rejects the ``file3.pl`` frame with the 'Family ID' error.

        NOTE(review): presumably ``file3.pl`` holds family ids containing
        '_' -- confirm against the fixture file.
        """
        # FID_COL
        # The return value is irrelevant here: the call is expected to raise.
        with pytest.raises(
            Exception,
            match="Replace in 'Family ID' columns '_' on another a simbols"
        ):
            make_ped(
                data_ped,
                "SAMPLE_ID",
                "SNP",
                fid_col="FAMILY_ID"
            )

    @pytest.mark.parametrize("data_ped", ["file4.pl"], indirect=True)
    def test_ped_check_data(self, data_ped: pd.DataFrame) -> None:
        """Optional pedigree columns are passed through, else filled with '0'."""
        # All optional pedigree columns supplied.
        res = make_ped(
            data_ped,
            "SAMPLE_ID",
            "SNP",
            fid_col="FAMILY_ID",
            father_col="father",
            mother_col="mother",
            sex_col="sex"
        )

        # Optional pedigree columns omitted.
        res2 = make_ped(
            data_ped,
            "SAMPLE_ID",
            "SNP",
            fid_col="FAMILY_ID",
        )

        assert all(res.father.values == list('1234'))
        assert all(res.mother.values == list('5678'))
        assert all(res.sex.values == list('1210'))
        assert all(res2.father.values == list('0000'))
        assert all(res2.mother.values == list('0000'))
        assert all(res2.sex.values == list('0000'))
|
format/tests/test_snp.py
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_FILES
|
6
|
+
from .. import Snp
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data_fr(request) -> pd.DataFrame:
    """Read the tab-separated test report selected by the indirect parameter.

    ``request.param`` is the file name; it is joined onto ``DIR_FILES``
    under ``fsnp/``.
    """
    report_path = DIR_FILES / f"fsnp/{request.param}"
    return pd.read_csv(report_path, sep="\t")
|
15
|
+
|
16
|
+
|
17
|
+
@pytest.fixture
def obj_snp(request) -> Snp:
    """Build a ``Snp`` whose ``fmt`` is the indirect parameter value."""
    requested_format = request.param
    return Snp(fmt=requested_format)
|
20
|
+
|
21
|
+
|
22
|
+
class TestSNP(object):
    """Tests for ``Snp.process`` and ``Snp.to_file``."""

    # Column layout used for the empty-frame tests.
    _REPORT_COLUMNS = [
        'SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB',
        'GC Score', 'GT Score'
    ]

    # Same layout with the SNP / sample columns renamed, used to provoke
    # the KeyError in ``test_snp_process_raises``.
    _BROKEN_COLUMNS = [
        'SNP Name1', 'Sample ID1', 'Allele1 - AB', 'Allele2 - AB',
        'GC Score', 'GT Score'
    ]

    @staticmethod
    def _output_file(tmp_path):
        """Create ``<tmp_path>/sub`` and return the output file path in it."""
        target_dir = tmp_path / "sub"
        target_dir.mkdir()
        return target_dir / "data_snp.csv"

    @pytest.mark.parametrize(
        "obj_snp, data_fr", [("uga", 'file1.txt')], indirect=True
    )
    def test_snp_process_uga_true(
        self, obj_snp: Snp, data_fr: pd.DataFrame
    ) -> None:
        """Processing ``file1.txt`` in 'uga' format yields the expected SNP rows."""
        obj_snp.process(data_fr)

        assert obj_snp.data is not None
        assert not obj_snp.data.empty

        expected_snp = ['02011015010000500', '01110152120222512']
        assert obj_snp.data.SNP.isin(expected_snp).all()

    @pytest.mark.parametrize("obj_snp", ["uga"], indirect=True)
    def test_snp_process_uga_empty(self, obj_snp: Snp) -> None:
        """An empty report in 'uga' format produces empty (not None) data."""
        empty_report = pd.DataFrame(columns=self._REPORT_COLUMNS)
        obj_snp.process(empty_report)

        assert obj_snp.data is not None
        assert obj_snp.data.empty

    @pytest.mark.parametrize(
        "obj_snp, data_fr",
        [("uga", 'file1.txt'), (None, 'file1.txt')],
        indirect=True
    )
    def test_snp_process_raises(
        self, obj_snp: Snp, data_fr: pd.DataFrame
    ) -> None:
        """Misnamed report columns raise ``KeyError`` and leave data unset."""
        broken_report = pd.DataFrame(columns=self._BROKEN_COLUMNS)

        with pytest.raises(KeyError):
            obj_snp.process(broken_report)

        assert obj_snp.data is None

    @pytest.mark.parametrize(
        "obj_snp, data_fr", [(None, 'file1.txt')], indirect=True
    )
    def test_snp_process_df(
        self, obj_snp: Snp, data_fr: pd.DataFrame
    ) -> None:
        """Processing ``file1.txt`` with ``fmt=None`` yields non-empty data."""
        obj_snp.process(data_fr)

        assert obj_snp.data is not None
        assert not obj_snp.data.empty

    @pytest.mark.parametrize("obj_snp", [None], indirect=True)
    def test_snp_process_df_empty(self, obj_snp: Snp) -> None:
        """An empty report with ``fmt=None`` produces empty (not None) data."""
        empty_report = pd.DataFrame(columns=self._REPORT_COLUMNS)
        obj_snp.process(empty_report)

        assert obj_snp.data is not None
        assert obj_snp.data.empty

    @pytest.mark.parametrize(
        "obj_snp, data_fr", [("uga", 'file1.txt')], indirect=True
    )
    def test_snp_to_file_uga1(
        self, obj_snp: Snp, data_fr: pd.DataFrame, tmp_path
    ) -> None:
        """
        The name sample_id is one length
        """
        saved = self._output_file(tmp_path)

        obj_snp.process(data_fr)
        assert obj_snp.data is not None
        assert not obj_snp.data.empty

        obj_snp.to_file(saved)
        assert saved.is_file()
        assert saved.exists()

        expected_text = "14814 02011015010000500\n14815 01110152120222512\n"
        assert saved.read_text() == expected_text

    @pytest.mark.parametrize(
        "obj_snp, data_fr", [("uga", 'file2.txt')], indirect=True
    )
    def test_snp_to_file_uga2(
        self, obj_snp: Snp, data_fr: pd.DataFrame, tmp_path
    ) -> None:
        """
        The name sample_id of different length
        """
        saved = self._output_file(tmp_path)

        obj_snp.process(data_fr)
        assert obj_snp.data is not None
        assert not obj_snp.data.empty

        obj_snp.to_file(saved)
        assert saved.is_file()
        assert saved.exists()

        expected_text = (
            "14814qwert 02011015010000500\n14815 01110152120222512\n"
        )
        assert saved.read_text() == expected_text
|
parentage/__init__.py
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from ._discov import Discovery
|
6
|
+
from ._verif import Verification
|
7
|
+
from ._isagmark import isag_verif, isag_disc
|
8
|
+
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"Discovery",
|
12
|
+
"Verification",
|
13
|
+
"isag_disc",
|
14
|
+
"isag_verif"
|
15
|
+
]
|
parentage/_discov.py
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
|
7
|
+
"""
|
8
|
+
Search for paternity according to ICAR recommendations
|
9
|
+
https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
|
10
|
+
"""
|
11
|
+
|
12
|
+
|
13
|
+
class Discovery(object):
|
14
|
+
""" Search for paternity according to ICAR recommendations
|
15
|
+
|
16
|
+
:argument isag_markers: Fixed sample of markers to confirm paternity.
|
17
|
+
"""
|
18
|
+
|
19
|
+
def __init__(
|
20
|
+
self, isag_markers: pd.Series | list | set | None = None
|
21
|
+
) -> None:
|
22
|
+
self.__isag_markers = isag_markers
|
23
|
+
|
24
|
+
self.__num_conflicts = None # Number of conflicts
|
25
|
+
self.__perc_conflicts = None
|
26
|
+
|
27
|
+
@property
|
28
|
+
def status(self) -> None | str:
|
29
|
+
""" The status of each parent discovered. """
|
30
|
+
|
31
|
+
if self.__perc_conflicts is not None:
|
32
|
+
if 0 <= self.__perc_conflicts < 1:
|
33
|
+
return 'Discovered'
|
34
|
+
elif 1 < self.__perc_conflicts < 3:
|
35
|
+
return 'Doubtful'
|
36
|
+
elif self.__perc_conflicts >= 3:
|
37
|
+
return 'Excluded'
|
38
|
+
else:
|
39
|
+
return None
|
40
|
+
|
41
|
+
@property
|
42
|
+
def num_conflicts(self) -> None | int:
|
43
|
+
return self.__num_conflicts
|
44
|
+
|
45
|
+
@property
|
46
|
+
def perc_conflicts(self) -> None | float:
|
47
|
+
return self.__perc_conflicts
|
48
|
+
|
49
|
+
def search_parent(
|
50
|
+
self,
|
51
|
+
data: pd.DataFrame,
|
52
|
+
descendant: str,
|
53
|
+
parents: str,
|
54
|
+
snp_name_col: str
|
55
|
+
) -> None:
|
56
|
+
""" Search for paternity.
|
57
|
+
|
58
|
+
:param data: SNP data for descendant and parent.
|
59
|
+
:param descendant: Columns name of the descendant in the data.
|
60
|
+
:param parents: Columns name or list name of the parents in the data.
|
61
|
+
:param snp_name_col: SNP columns name is data.
|
62
|
+
"""
|
63
|
+
|
64
|
+
if self.__isag_markers is None:
|
65
|
+
raise ValueError("Error. No array of snp names to verify")
|
66
|
+
|
67
|
+
sample_by_markers = data.loc[
|
68
|
+
data[snp_name_col].isin(self.__isag_markers),
|
69
|
+
[snp_name_col, descendant, parents]
|
70
|
+
]
|
71
|
+
|
72
|
+
# Filtering 5s from a descendent
|
73
|
+
desc_marks = sample_by_markers.loc[
|
74
|
+
sample_by_markers[descendant] != 5, [snp_name_col, descendant]
|
75
|
+
]
|
76
|
+
|
77
|
+
# According to ICAR, the number of available markers must be
|
78
|
+
# above 450
|
79
|
+
if len(desc_marks) < 450:
|
80
|
+
raise Exception("Calf call rate is low.")
|
81
|
+
|
82
|
+
# Common after filtering markers of potential ancestors
|
83
|
+
sample_parents = sample_by_markers.loc[
|
84
|
+
sample_by_markers[snp_name_col].isin(desc_marks[snp_name_col]),
|
85
|
+
parents
|
86
|
+
]
|
87
|
+
|
88
|
+
# Number of available markers in potential ancestors
|
89
|
+
prob_parents_same_n_markers = (sample_parents < 5).sum()
|
90
|
+
|
91
|
+
# number of conflicts
|
92
|
+
self.__num_conflicts = (
|
93
|
+
abs(sample_parents.sub(desc_marks[descendant], axis=0)) == 2
|
94
|
+
).sum()
|
95
|
+
|
96
|
+
# Percentage of conflicts
|
97
|
+
self.__perc_conflicts = (
|
98
|
+
(self.__num_conflicts / prob_parents_same_n_markers) * 100
|
99
|
+
).round(2)
|
100
|
+
|
101
|
+
def __status_define(self) -> None:
|
102
|
+
...
|
parentage/_isagmark.py
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
|
9
|
+
|
10
|
+
def isag_disc() -> pd.DataFrame:
    """Unpickle ``isag_disc.pl`` from the directory containing this module."""
    package_dir = Path(__file__).parent
    return pd.read_pickle(package_dir / "isag_disc.pl")
|
12
|
+
|
13
|
+
|
14
|
+
def isag_verif() -> pd.DataFrame:
    """Unpickle ``isag_verif.pl`` from the directory containing this module."""
    package_dir = Path(__file__).parent
    return pd.read_pickle(package_dir / "isag_verif.pl")
|
parentage/_verif.py
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
|
9
|
+
"""
|
10
|
+
https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
|
11
|
+
"""
|
12
|
+
|
13
|
+
|
14
|
+
class Verification(object):
|
15
|
+
"""
|
16
|
+
Verification of paternity according to ICAR recommendations.
|
17
|
+
|
18
|
+
:argument isag_marks: Fixed sample of markers to confirm paternity.
|
19
|
+
"""
|
20
|
+
|
21
|
+
def __init__(
|
22
|
+
self, isag_marks: pd.Series | list | set | None = None
|
23
|
+
) -> None:
|
24
|
+
self.__isag_marks = isag_marks
|
25
|
+
|
26
|
+
# The minimum number of SNP available in the profile
|
27
|
+
# of each animal and potential parent must be scaled (i.e.: 95%
|
28
|
+
# truncated down)
|
29
|
+
self.__min_num_snp = 0.95
|
30
|
+
self.__num_conflicts = None # Number of conflicts
|
31
|
+
|
32
|
+
@property
|
33
|
+
def status(self) -> None | str:
|
34
|
+
if self.__num_conflicts is not None:
|
35
|
+
if self.__num_conflicts <= 2:
|
36
|
+
return 'Accept'
|
37
|
+
elif 3 <= self.__num_conflicts <= 5:
|
38
|
+
return 'Doubtful'
|
39
|
+
elif self.__num_conflicts > 5:
|
40
|
+
return 'Excluded'
|
41
|
+
else:
|
42
|
+
return None
|
43
|
+
|
44
|
+
@property
|
45
|
+
def num_conflicts(self) -> None | int:
|
46
|
+
return self.__num_conflicts
|
47
|
+
|
48
|
+
def check_on(
|
49
|
+
self,
|
50
|
+
data: pd.DataFrame,
|
51
|
+
descendant: str,
|
52
|
+
parent: str,
|
53
|
+
snp_name_col: str
|
54
|
+
) -> None:
|
55
|
+
""" Verification of paternity according to ICAR recommendations.
|
56
|
+
|
57
|
+
:param data: SNP data for descendant and parent.
|
58
|
+
:param descendant: Columns name of the descendant in the data.
|
59
|
+
:param parent: Columns name of the parent in the data.
|
60
|
+
:param snp_name_col: SNP column name in data.
|
61
|
+
"""
|
62
|
+
|
63
|
+
if self.__isag_marks is None:
|
64
|
+
raise ValueError('Error. No array of snp names to verify')
|
65
|
+
|
66
|
+
num_isag_mark = len(self.__isag_marks)
|
67
|
+
min_num_comm_snp = int(num_isag_mark - (2 * (num_isag_mark * 0.05)))
|
68
|
+
|
69
|
+
sample_mark = data.loc[
|
70
|
+
data[snp_name_col].isin(self.__isag_marks), [descendant, parent]
|
71
|
+
]
|
72
|
+
|
73
|
+
# The number of markers is not 5ok
|
74
|
+
desc_n_markers = (sample_mark[descendant] < 5).sum()
|
75
|
+
parent_n_markers = (sample_mark[parent] < 5).sum()
|
76
|
+
|
77
|
+
# According to ICAR, the number of markers not 5ok should be more
|
78
|
+
# than 95%
|
79
|
+
if (desc_n_markers < num_isag_mark * self.__min_num_snp) and \
|
80
|
+
(parent_n_markers < num_isag_mark * self.__min_num_snp):
|
81
|
+
raise Exception('Calf and parent have low call rate')
|
82
|
+
|
83
|
+
comm_snp_no_missing = sample_mark.replace(5, np.nan).dropna()
|
84
|
+
num_comm_markers = len(comm_snp_no_missing)
|
85
|
+
|
86
|
+
if num_comm_markers < min_num_comm_snp:
|
87
|
+
raise Exception('Pair call rate is low')
|
88
|
+
|
89
|
+
self.__num_conflicts = (abs(
|
90
|
+
comm_snp_no_missing[descendant] - comm_snp_no_missing[parent]
|
91
|
+
) == 2).sum()
|
@@ -0,0 +1,164 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_DATA
|
6
|
+
from .. import Discovery, isag_disc
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data() -> pd.DataFrame:
    """Read the space-separated discovery test table from ``DIR_DATA``."""
    csv_path = DIR_DATA / "parentage_test_disc.csv"
    return pd.read_csv(csv_path, sep=" ")
|
15
|
+
|
16
|
+
|
17
|
+
@pytest.fixture
def obj_discovery() -> Discovery:
    """Build a ``Discovery`` from the ``markers`` of the ``isag_disc()`` table."""
    discovery_markers = isag_disc().markers
    return Discovery(isag_markers=discovery_markers)
|
20
|
+
|
21
|
+
|
22
|
+
class TestDiscovery(object):
    """Tests for ``Discovery.search_parent`` and its result properties."""

    @staticmethod
    def _assert_no_result(discovery: Discovery) -> None:
        """Assert that no search result has been stored on ``discovery``."""
        assert discovery.status is None
        assert discovery.num_conflicts is None
        assert discovery.perc_conflicts is None

    def test_search_parent_successfully(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """A completed search returns ``None`` and stores the expected result."""

        assert obj_discovery.search_parent(
            data=data,
            descendant="BY000041988163",
            parents="EE10512586",
            snp_name_col="SNP_Name"
        ) is None
        assert obj_discovery.num_conflicts == 77
        assert obj_discovery.status == "Excluded"
        assert obj_discovery.perc_conflicts == 14.86

    def test_search_parent_1(self, data: pd.DataFrame) -> None:
        """
        An exception is thrown for the absence of data with isag markers
        """
        obj_discovery = Discovery()

        with pytest.raises(
            ValueError, match="Error. No array of snp names to verify"
        ):
            obj_discovery.search_parent(
                data=data,
                descendant="BY000041988163",
                parents="EE10512586",
                snp_name_col="SNP_Name"
            )

        # Checked after the ``with`` block: statements that follow the
        # raising call inside it would never execute.
        self._assert_no_result(obj_discovery)

    def test_search_parent_2(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Exception when the number of markers required to confirm paternity is
        less than the established value.
        """

        with pytest.raises(
            Exception, match="Calf call rate is low."
        ):
            obj_discovery.search_parent(
                data=data[:-100],
                descendant="BY000041988163",
                parents="EE10512586",
                snp_name_col="SNP_Name"
            )

        self._assert_no_result(obj_discovery)

    def test_search_parent_3(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Test if the transmitted animal names are not in the dataframe.
        """

        # For descendant
        with pytest.raises(KeyError):
            obj_discovery.search_parent(
                data=data,
                descendant="BY00004198816",
                parents="EE10512586",
                snp_name_col="SNP_Name"
            )

        self._assert_no_result(obj_discovery)

        # For parents
        with pytest.raises(KeyError):
            obj_discovery.search_parent(
                data=data,
                descendant="BY000041988163",
                parents="EE105125864",
                snp_name_col="SNP_Name"
            )

        self._assert_no_result(obj_discovery)

    def test_search_parent_4(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Test when all snp data is not read - equal to 5.
        """
        data[["BY000041988163", "EE10512586"]] = 5

        with pytest.raises(
            Exception, match="Calf call rate is low."
        ):
            obj_discovery.search_parent(
                data=data,
                descendant="BY000041988163",
                parents="EE10512586",
                snp_name_col="SNP_Name"
            )

        self._assert_no_result(obj_discovery)

    def test_search_parent_5(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Test when there is a complete match.
        """
        data[["BY000041988163", "EE10512586"]] = 2

        obj_discovery.search_parent(
            data=data,
            descendant="BY000041988163",
            parents="EE10512586",
            snp_name_col="SNP_Name"
        )
        assert obj_discovery.status == "Discovered"
        assert obj_discovery.num_conflicts == 0
        assert obj_discovery.perc_conflicts == 0.0

    def test_search_parent_6(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Partial match test.
        """
        data.loc[202:, "EE10512586"] = 1

        obj_discovery.search_parent(
            data=data,
            descendant="BY000041988163",
            parents="EE10512586",
            snp_name_col="SNP_Name"
        )
        assert obj_discovery.status == "Doubtful"
        assert obj_discovery.num_conflicts == 14
        assert obj_discovery.perc_conflicts == 2.70
|