snplib 1.0.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_FILES
6
+ from .. import make_ped
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data_ped(request) -> pd.DataFrame | None:
    """Unpickle the fixture file selected by the indirect parameter.

    ``request.param`` is the file name looked up under
    ``DIR_FILES / "fplink/ped"``.
    """
    pickle_path = DIR_FILES / f"fplink/ped/{request.param}"
    return pd.read_pickle(pickle_path)
15
+
16
+
17
class TestPlinkFormatPed(object):
    """Tests of ``make_ped`` driven by the pickled ``data_ped`` fixtures."""

    @pytest.mark.parametrize("data_ped", ["file.pl"], indirect=True)
    def test_ped_true(self, data_ped: pd.DataFrame) -> None:
        """Valid input gives a non-empty result with and without fid_col."""
        with_family = make_ped(
            data_ped, "SAMPLE_ID", "SNP", fid_col="SAMPLE_ID"
        )
        assert not with_family.empty

        without_family = make_ped(data_ped, "SAMPLE_ID", "SNP")
        assert not without_family.empty

    def test_ped_empty(self) -> None:
        """A frame with columns but no rows gives an empty result."""
        columns = ["SAMPLE_ID", "SNP"]

        without_family = make_ped(
            pd.DataFrame(columns=columns), "SAMPLE_ID", "SNP"
        )
        assert without_family.empty

        with_family = make_ped(
            pd.DataFrame(columns=columns),
            "SAMPLE_ID",
            "SNP",
            fid_col="SAMPLE_ID",
        )
        assert with_family.empty

    @pytest.mark.parametrize("data_ped", ["file.pl"], indirect=True)
    def test_ped_raise_columns(self, data_ped: pd.DataFrame) -> None:
        """An unknown column name raises ``KeyError`` with a fixed message."""
        generic_msg = "Data has not in name columns!"

        # SID_COL
        with pytest.raises(KeyError, match=generic_msg):
            make_ped(
                data=data_ped,
                sid_col="SAMPLE_ID1",
                fid_col="SAMPLE_ID",
                snp_col="SNP",
            )

        # SNP_COL
        with pytest.raises(KeyError, match=generic_msg):
            make_ped(data_ped, "SAMPLE_ID", "SNP1", fid_col="SAMPLE_ID")

        # FID_COL
        with pytest.raises(
            KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_ped(data_ped, "SAMPLE_ID", "SNP", fid_col="SAMPLE_ID1")

    @pytest.mark.parametrize("data_ped", ["file2.pl"], indirect=True)
    def test_ped_raises_underscope_sid(self, data_ped: pd.DataFrame) -> None:
        """The ``file2.pl`` fixture must trigger the 'Sample ID' error."""
        # SID_COL
        expected_msg = (
            "Replace in 'Sample ID' columns '_' on another a simbols"
        )
        with pytest.raises(Exception, match=expected_msg):
            make_ped(data_ped, "SAMPLE_ID", "SNP")

    @pytest.mark.parametrize("data_ped", ["file3.pl"], indirect=True)
    def test_ped_raises_underscope_fid(self, data_ped: pd.DataFrame) -> None:
        """The ``file3.pl`` fixture must trigger the 'Family ID' error."""
        # FID_COL
        expected_msg = (
            "Replace in 'Family ID' columns '_' on another a simbols"
        )
        with pytest.raises(Exception, match=expected_msg):
            make_ped(data_ped, "SAMPLE_ID", "SNP", fid_col="FAMILY_ID")

    @pytest.mark.parametrize("data_ped", ["file4.pl"], indirect=True)
    def test_ped_check_data(self, data_ped: pd.DataFrame) -> None:
        """Optional columns are carried over, or filled with '0' if absent."""
        full = make_ped(
            data_ped,
            "SAMPLE_ID",
            "SNP",
            fid_col="FAMILY_ID",
            father_col="father",
            mother_col="mother",
            sex_col="sex",
        )
        minimal = make_ped(
            data_ped,
            "SAMPLE_ID",
            "SNP",
            fid_col="FAMILY_ID",
        )

        # (result frame, column, expected one-character values per row)
        expectations = (
            (full, "father", '1234'),
            (full, "mother", '5678'),
            (full, "sex", '1210'),
            (minimal, "father", '0000'),
            (minimal, "mother", '0000'),
            (minimal, "sex", '0000'),
        )
        for frame, column, digits in expectations:
            assert all(getattr(frame, column).values == list(digits))
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_FILES
6
+ from .. import Snp
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data_fr(request) -> pd.DataFrame:
    """Read the tab-separated file named by the indirect parameter.

    ``request.param`` is the file name looked up under
    ``DIR_FILES / "fsnp"``.
    """
    source = DIR_FILES / f"fsnp/{request.param}"
    return pd.read_csv(source, sep="\t")
15
+
16
+
17
@pytest.fixture
def obj_snp(request) -> Snp:
    """Build a ``Snp`` whose ``fmt`` is the indirect test parameter."""
    output_format = request.param
    return Snp(fmt=output_format)
20
+
21
+
22
class TestSNP(object):
    """Tests of ``Snp.process`` and ``Snp.to_file``."""

    @pytest.mark.parametrize(
        "obj_snp, data_fr", [("uga", 'file1.txt')], indirect=True
    )
    def test_snp_process_uga_true(
        self, obj_snp: Snp, data_fr: pd.DataFrame
    ) -> None:
        """Processing ``file1.txt`` with fmt 'uga' gives the known codes."""
        expected_codes = ['02011015010000500', '01110152120222512']

        obj_snp.process(data_fr)

        processed = obj_snp.data
        assert processed is not None and not processed.empty
        assert processed.SNP.isin(expected_codes).all()

    @pytest.mark.parametrize("obj_snp", ["uga"], indirect=True)
    def test_snp_process_uga_empty(self, obj_snp: Snp) -> None:
        """A header-only frame gives empty (but not ``None``) data."""
        header_only = pd.DataFrame(columns=[
            'SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB',
            'GC Score', 'GT Score'
        ])

        obj_snp.process(header_only)

        processed = obj_snp.data
        assert processed is not None and processed.empty

    @pytest.mark.parametrize(
        "obj_snp, data_fr",
        [("uga", 'file1.txt'), (None, 'file1.txt')],
        indirect=True
    )
    def test_snp_process_raises(
        self, obj_snp: Snp, data_fr: pd.DataFrame
    ) -> None:
        """Misnamed columns raise ``KeyError`` and leave ``data`` unset."""
        wrong_header = pd.DataFrame(columns=[
            'SNP Name1', 'Sample ID1', 'Allele1 - AB', 'Allele2 - AB',
            'GC Score', 'GT Score'
        ])

        with pytest.raises(KeyError):
            obj_snp.process(wrong_header)

        assert obj_snp.data is None

    @pytest.mark.parametrize(
        "obj_snp, data_fr", [(None, 'file1.txt')], indirect=True
    )
    def test_snp_process_df(
        self, obj_snp: Snp, data_fr: pd.DataFrame
    ) -> None:
        """With ``fmt=None`` a populated input still yields data."""
        obj_snp.process(data_fr)

        processed = obj_snp.data
        assert processed is not None and not processed.empty

    @pytest.mark.parametrize("obj_snp", [None], indirect=True)
    def test_snp_process_df_empty(self, obj_snp: Snp) -> None:
        """With ``fmt=None`` a header-only frame gives empty data."""
        header_only = pd.DataFrame(columns=[
            'SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB',
            'GC Score', 'GT Score'
        ])

        obj_snp.process(header_only)

        processed = obj_snp.data
        assert processed is not None and processed.empty

    @pytest.mark.parametrize(
        "obj_snp, data_fr", [("uga", 'file1.txt')], indirect=True
    )
    def test_snp_to_file_uga1(
        self, obj_snp: Snp, data_fr: pd.DataFrame, tmp_path
    ) -> None:
        """
        The name sample_id is one length
        """
        out_dir = tmp_path / "sub"
        out_dir.mkdir()
        out_file = out_dir / "data_snp.csv"

        obj_snp.process(data_fr)
        processed = obj_snp.data
        assert processed is not None and not processed.empty

        obj_snp.to_file(out_file)
        assert out_file.is_file() and out_file.exists()

        written = out_file.read_text()
        assert (
            written ==
            "14814 02011015010000500\n14815 01110152120222512\n"
        )

    @pytest.mark.parametrize(
        "obj_snp, data_fr", [("uga", 'file2.txt')], indirect=True
    )
    def test_snp_to_file_uga2(
        self, obj_snp: Snp, data_fr: pd.DataFrame, tmp_path
    ) -> None:
        """
        The name sample_id of different length
        """
        out_dir = tmp_path / "sub"
        out_dir.mkdir()
        out_file = out_dir / "data_snp.csv"

        obj_snp.process(data_fr)
        processed = obj_snp.data
        assert processed is not None and not processed.empty

        obj_snp.to_file(out_file)
        assert out_file.is_file() and out_file.exists()

        written = out_file.read_text()
        assert (
            written ==
            "14814qwert 02011015010000500\n14815 01110152120222512\n"
        )
parentage/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._discov import Discovery
6
+ from ._verif import Verification
7
+ from ._isagmark import isag_verif, isag_disc
8
+
9
+
10
# Names re-exported by ``from parentage import *``.
__all__ = [
    "Discovery",
    "Verification",
    "isag_disc",
    "isag_verif",
]
parentage/_discov.py ADDED
@@ -0,0 +1,102 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import pandas as pd
6
+
7
+ """
8
+ Search for paternity according to ICAR recommendations
9
+ https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
10
+ """
11
+
12
+
13
+ class Discovery(object):
14
+ """ Search for paternity according to ICAR recommendations
15
+
16
+ :argument isag_markers: Fixed sample of markers to confirm paternity.
17
+ """
18
+
19
+ def __init__(
20
+ self, isag_markers: pd.Series | list | set | None = None
21
+ ) -> None:
22
+ self.__isag_markers = isag_markers
23
+
24
+ self.__num_conflicts = None # Number of conflicts
25
+ self.__perc_conflicts = None
26
+
27
+ @property
28
+ def status(self) -> None | str:
29
+ """ The status of each parent discovered. """
30
+
31
+ if self.__perc_conflicts is not None:
32
+ if 0 <= self.__perc_conflicts < 1:
33
+ return 'Discovered'
34
+ elif 1 < self.__perc_conflicts < 3:
35
+ return 'Doubtful'
36
+ elif self.__perc_conflicts >= 3:
37
+ return 'Excluded'
38
+ else:
39
+ return None
40
+
41
+ @property
42
+ def num_conflicts(self) -> None | int:
43
+ return self.__num_conflicts
44
+
45
+ @property
46
+ def perc_conflicts(self) -> None | float:
47
+ return self.__perc_conflicts
48
+
49
+ def search_parent(
50
+ self,
51
+ data: pd.DataFrame,
52
+ descendant: str,
53
+ parents: str,
54
+ snp_name_col: str
55
+ ) -> None:
56
+ """ Search for paternity.
57
+
58
+ :param data: SNP data for descendant and parent.
59
+ :param descendant: Columns name of the descendant in the data.
60
+ :param parents: Columns name or list name of the parents in the data.
61
+ :param snp_name_col: SNP columns name is data.
62
+ """
63
+
64
+ if self.__isag_markers is None:
65
+ raise ValueError("Error. No array of snp names to verify")
66
+
67
+ sample_by_markers = data.loc[
68
+ data[snp_name_col].isin(self.__isag_markers),
69
+ [snp_name_col, descendant, parents]
70
+ ]
71
+
72
+ # Filtering 5s from a descendent
73
+ desc_marks = sample_by_markers.loc[
74
+ sample_by_markers[descendant] != 5, [snp_name_col, descendant]
75
+ ]
76
+
77
+ # According to ICAR, the number of available markers must be
78
+ # above 450
79
+ if len(desc_marks) < 450:
80
+ raise Exception("Calf call rate is low.")
81
+
82
+ # Common after filtering markers of potential ancestors
83
+ sample_parents = sample_by_markers.loc[
84
+ sample_by_markers[snp_name_col].isin(desc_marks[snp_name_col]),
85
+ parents
86
+ ]
87
+
88
+ # Number of available markers in potential ancestors
89
+ prob_parents_same_n_markers = (sample_parents < 5).sum()
90
+
91
+ # number of conflicts
92
+ self.__num_conflicts = (
93
+ abs(sample_parents.sub(desc_marks[descendant], axis=0)) == 2
94
+ ).sum()
95
+
96
+ # Percentage of conflicts
97
+ self.__perc_conflicts = (
98
+ (self.__num_conflicts / prob_parents_same_n_markers) * 100
99
+ ).round(2)
100
+
101
+ def __status_define(self) -> None:
102
+ ...
parentage/_isagmark.py ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+
10
def isag_disc() -> pd.DataFrame:
    """Unpickle ``isag_disc.pl`` located in this module's directory."""
    module_dir = Path(__file__).parent
    return pd.read_pickle(module_dir / "isag_disc.pl")
12
+
13
+
14
def isag_verif() -> pd.DataFrame:
    """Unpickle ``isag_verif.pl`` located in this module's directory."""
    module_dir = Path(__file__).parent
    return pd.read_pickle(module_dir / "isag_verif.pl")
parentage/_verif.py ADDED
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
+ """
10
+ https://www.icar.org/Documents/GenoEx/ICAR%20Guidelines%20for%20Parentage%20Verification%20and%20Parentage%20Discovery%20based%20on%20SNP.pdf
11
+ """
12
+
13
+
14
+ class Verification(object):
15
+ """
16
+ Verification of paternity according to ICAR recommendations.
17
+
18
+ :argument isag_marks: Fixed sample of markers to confirm paternity.
19
+ """
20
+
21
+ def __init__(
22
+ self, isag_marks: pd.Series | list | set | None = None
23
+ ) -> None:
24
+ self.__isag_marks = isag_marks
25
+
26
+ # The minimum number of SNP available in the profile
27
+ # of each animal and potential parent must be scaled (i.e.: 95%
28
+ # truncated down)
29
+ self.__min_num_snp = 0.95
30
+ self.__num_conflicts = None # Number of conflicts
31
+
32
+ @property
33
+ def status(self) -> None | str:
34
+ if self.__num_conflicts is not None:
35
+ if self.__num_conflicts <= 2:
36
+ return 'Accept'
37
+ elif 3 <= self.__num_conflicts <= 5:
38
+ return 'Doubtful'
39
+ elif self.__num_conflicts > 5:
40
+ return 'Excluded'
41
+ else:
42
+ return None
43
+
44
+ @property
45
+ def num_conflicts(self) -> None | int:
46
+ return self.__num_conflicts
47
+
48
+ def check_on(
49
+ self,
50
+ data: pd.DataFrame,
51
+ descendant: str,
52
+ parent: str,
53
+ snp_name_col: str
54
+ ) -> None:
55
+ """ Verification of paternity according to ICAR recommendations.
56
+
57
+ :param data: SNP data for descendant and parent.
58
+ :param descendant: Columns name of the descendant in the data.
59
+ :param parent: Columns name of the parent in the data.
60
+ :param snp_name_col: SNP column name in data.
61
+ """
62
+
63
+ if self.__isag_marks is None:
64
+ raise ValueError('Error. No array of snp names to verify')
65
+
66
+ num_isag_mark = len(self.__isag_marks)
67
+ min_num_comm_snp = int(num_isag_mark - (2 * (num_isag_mark * 0.05)))
68
+
69
+ sample_mark = data.loc[
70
+ data[snp_name_col].isin(self.__isag_marks), [descendant, parent]
71
+ ]
72
+
73
+ # The number of markers is not 5ok
74
+ desc_n_markers = (sample_mark[descendant] < 5).sum()
75
+ parent_n_markers = (sample_mark[parent] < 5).sum()
76
+
77
+ # According to ICAR, the number of markers not 5ok should be more
78
+ # than 95%
79
+ if (desc_n_markers < num_isag_mark * self.__min_num_snp) and \
80
+ (parent_n_markers < num_isag_mark * self.__min_num_snp):
81
+ raise Exception('Calf and parent have low call rate')
82
+
83
+ comm_snp_no_missing = sample_mark.replace(5, np.nan).dropna()
84
+ num_comm_markers = len(comm_snp_no_missing)
85
+
86
+ if num_comm_markers < min_num_comm_snp:
87
+ raise Exception('Pair call rate is low')
88
+
89
+ self.__num_conflicts = (abs(
90
+ comm_snp_no_missing[descendant] - comm_snp_no_missing[parent]
91
+ ) == 2).sum()
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from pathlib import Path
6
+
7
# Directory named "data" located next to this module.
DIR_DATA = Path(__file__).parent / "data"
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_DATA
6
+ from .. import Discovery, isag_disc
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data() -> pd.DataFrame:
    """Read the space-separated ``parentage_test_disc.csv`` test file."""
    csv_path = DIR_DATA / "parentage_test_disc.csv"
    return pd.read_csv(csv_path, sep=" ")
15
+
16
+
17
@pytest.fixture
def obj_discovery() -> Discovery:
    """Build a ``Discovery`` from the ``markers`` of ``isag_disc()``."""
    marker_names = isag_disc().markers
    return Discovery(isag_markers=marker_names)
20
+
21
+
22
class TestDiscovery(object):
    """Tests of ``Discovery.search_parent`` and its result properties."""

    @staticmethod
    def _assert_no_result(discovery: Discovery) -> None:
        """Check that no result is stored on ``discovery``."""
        assert discovery.status is None
        assert discovery.num_conflicts is None
        assert discovery.perc_conflicts is None

    def test_search_parent_successfully(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """The reference pair gives the known conflict figures."""
        outcome = obj_discovery.search_parent(
            data=data,
            descendant="BY000041988163",
            parents="EE10512586",
            snp_name_col="SNP_Name"
        )

        assert outcome is None
        assert obj_discovery.num_conflicts == 77
        assert obj_discovery.status == "Excluded"
        assert obj_discovery.perc_conflicts == 14.86

    def test_search_parent_1(self, data: pd.DataFrame) -> None:
        """
        An exception is thrown for the absence of data with isag markers
        """
        without_markers = Discovery()

        with pytest.raises(
            ValueError, match="Error. No array of snp names to verify"
        ):
            without_markers.search_parent(
                data=data,
                descendant="BY000041988163",
                parents="EE10512586",
                snp_name_col="SNP_Name"
            )

        self._assert_no_result(without_markers)

    def test_search_parent_2(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Exception when the number of markers required to confirm paternity is
        less than the established value.
        """
        shortened = data[:-100]

        with pytest.raises(Exception, match="Calf call rate is low."):
            obj_discovery.search_parent(
                data=shortened,
                descendant="BY000041988163",
                parents="EE10512586",
                snp_name_col="SNP_Name"
            )

        self._assert_no_result(obj_discovery)

    def test_search_parent_3(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Test if the transmitted animal names are not in the dataframe.
        """
        # For descendant
        with pytest.raises(KeyError):
            obj_discovery.search_parent(
                data=data,
                descendant="BY00004198816",
                parents="EE10512586",
                snp_name_col="SNP_Name"
            )
        self._assert_no_result(obj_discovery)

        # For parents
        with pytest.raises(KeyError):
            obj_discovery.search_parent(
                data=data,
                descendant="BY000041988163",
                parents="EE105125864",
                snp_name_col="SNP_Name"
            )
        self._assert_no_result(obj_discovery)

    def test_search_parent_4(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Test when all snp data is not read - equal to 5.
        """
        data[["BY000041988163", "EE10512586"]] = 5

        with pytest.raises(Exception, match="Calf call rate is low."):
            obj_discovery.search_parent(
                data=data,
                descendant="BY000041988163",
                parents="EE10512586",
                snp_name_col="SNP_Name"
            )

        self._assert_no_result(obj_discovery)

    def test_search_parent_5(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Test when there is a complete match.
        """
        data[["BY000041988163", "EE10512586"]] = 2

        obj_discovery.search_parent(
            data=data,
            descendant="BY000041988163",
            parents="EE10512586",
            snp_name_col="SNP_Name"
        )

        assert obj_discovery.status == "Discovered"
        assert obj_discovery.num_conflicts == 0
        assert obj_discovery.perc_conflicts == 0.0

    def test_search_parent_6(
        self, data: pd.DataFrame, obj_discovery: Discovery
    ) -> None:
        """
        Partial match test.
        """
        data.loc[202:, "EE10512586"] = 1

        obj_discovery.search_parent(
            data=data,
            descendant="BY000041988163",
            parents="EE10512586",
            snp_name_col="SNP_Name"
        )

        assert obj_discovery.status == "Doubtful"
        assert obj_discovery.num_conflicts == 14
        assert obj_discovery.perc_conflicts == 2.70