snplib 1.0.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._finalreport import FinalReport
6
+
7
+ __all__ = ["FinalReport"]
@@ -0,0 +1,251 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+ __all__ = ("FinalReport",)
5
+
6
+ from pathlib import Path
7
+ from functools import reduce
8
+
9
+ import re
10
+ import pandas as pd
11
+
12
+
13
+ class FinalReport(object):
14
+ """ File that contains SNP information.
15
+
16
+ :argument allele: A variant form of a single nucleotide polymorphism
17
+ (SNP), a specific polymorphic site or a whole gene detectable at
18
+ a locus. Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'
19
+ :argument sep: Delimiter to use. Default value: "\\t"
20
+
21
+ Example:
22
+ [Header]
23
+ GSGT Version 2.0.4
24
+ Processing Date 10/14/2021 4:02 PM
25
+ Content BovineSNP50_v3_A1.bpm
26
+ Num SNPs 53218
27
+ Total SNPs 53218
28
+ Num Samples 3
29
+ Total Samples 3
30
+ [Data]
31
+ SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
32
+ ABCA12 1 A A 0.4048 0.8164
33
+ APAF1 1 B B 0.9067 0.9155
34
+ ...
35
+ """
36
+
37
+ __PATTERN_HEADER = re.compile(r'(^\[Header\])')
38
+ __PATTERN_DATA = re.compile(r'(^\[Data\])')
39
+
40
+ def __init__(
41
+ self,
42
+ allele: str | list | None = None,
43
+ sep: str = "\t"
44
+ ) -> None:
45
+ self._delimiter = sep
46
+ self._full_data = None
47
+
48
+ self.__header = {}
49
+ self.__snp_data = None
50
+ self.__allele = allele
51
+ self._map_rn = None
52
+
53
+ @property
54
+ def header(self) -> dict:
55
+ return self.__header
56
+
57
+ @property
58
+ def snp_data(self) -> pd.DataFrame | None:
59
+ return self.__snp_data
60
+
61
+ def handle(
62
+ self, file_rep: Path | str, conv_file: Path | str = None
63
+ ) -> bool:
64
+ """ Processes the FinalReport.txt file. Highlights meta information
65
+ and data.
66
+
67
+ :param file_rep: The file FinalReport.txt or another name.
68
+ :param conv_file: The file that contains IDs of registration numbers
69
+ of animals.
70
+ :return: Returns true if file processing was successful, false if
71
+ there were errors.
72
+ """
73
+
74
+ try:
75
+
76
+ if isinstance(file_rep, str):
77
+ file_rep = Path(file_rep)
78
+
79
+ if not file_rep.is_file() and not file_rep.exists():
80
+ return False
81
+
82
+ # Processing conversion file
83
+ if conv_file is not None:
84
+ if isinstance(conv_file, str):
85
+ conv_file = Path(conv_file)
86
+
87
+ if not conv_file.is_file() and not conv_file.exists():
88
+ return False
89
+
90
+ self.__convert_s_id(conv_file)
91
+
92
+ # Processing report file
93
+ if not self.read(file_rep):
94
+ return False
95
+
96
+ if self._full_data is None:
97
+ raise Exception("Not data in file FinalReport.txt")
98
+
99
+ self.__handler_header()
100
+ self.__handler_data()
101
+
102
+ if self._map_rn is not None:
103
+ self.__snp_data['Sample ID'] = \
104
+ self.__snp_data['Sample ID'].map(
105
+ dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
106
+ )
107
+
108
+ except Exception as e:
109
+ raise e
110
+
111
+ return True
112
+
113
+ def read(self, file_rep: Path) -> bool:
114
+ """ Reading data from the final_report file
115
+
116
+ :param file_rep: path, pointer to the file to be read.
117
+ :return: Returns true if the read was successful, false if it failed.
118
+ """
119
+ try:
120
+ if len(data := file_rep.read_text()) != 0:
121
+ self._full_data = data.strip().split("\n")
122
+ return True
123
+
124
+ self._full_data = None
125
+
126
+ except Exception as e:
127
+ return False
128
+
129
+ return True
130
+
131
+ def __handler_header(self) -> None:
132
+ """ Processes data from a file, selects meta-information. """
133
+
134
+ for line in self._full_data:
135
+ if self.__class__.__PATTERN_DATA.findall(line):
136
+ return
137
+
138
+ if self.__class__.__PATTERN_HEADER.findall(line):
139
+ continue
140
+
141
+ key = line.strip().split("\t")[0]
142
+ value = line.strip().split("\t")[1]
143
+
144
+ self.__header[key] = value
145
+
146
+ def __handler_data(self) -> None:
147
+ """ Processes data and forms an array for further processing. """
148
+
149
+ temp = 1
150
+ for line in self._full_data:
151
+ if self.__class__.__PATTERN_DATA.findall(line):
152
+ break
153
+ temp += 1
154
+
155
+ names_col = self.__sample_by_allele(
156
+ self._full_data[temp].split(f"{self._delimiter}")
157
+ )
158
+
159
+ if names_col is None:
160
+ raise Exception(f"Error. Allele {self.__allele} not in data.")
161
+
162
+ self.__snp_data = pd.DataFrame(
163
+ [
164
+ item_data.split(f"{self._delimiter}")
165
+ for item_data in self._full_data[temp + 1:]
166
+ ],
167
+ columns=self._full_data[temp].split(f"{self._delimiter}")
168
+ )[names_col]
169
+
170
+ def __sample_by_allele(self, names: list[str]) -> list[str] | None:
171
+ """ Method that generates a list of field names choosing which alleles
172
+ to keep
173
+
174
+ :param names: List of field names in the report file.
175
+ :return: Returns a filtered list of fields by alleles.
176
+ """
177
+
178
+ allele_templ = r'(^Allele\d\s[:-]\s{}\b)'
179
+
180
+ match self.__allele:
181
+ case None:
182
+ return names
183
+
184
+ case str():
185
+ allele_pattern = re.compile(
186
+ allele_templ.format(self.__allele)
187
+ )
188
+
189
+ case list() | tuple() | set():
190
+ allele_pattern = re.compile(
191
+ allele_templ.format("|".join(self.__allele))
192
+ )
193
+ case _:
194
+ return None
195
+
196
+ lst_allele = reduce(
197
+ lambda i, j: i + j,
198
+ [allele_pattern.findall(item) for item in names]
199
+ )
200
+
201
+ if len(lst_allele) == 0:
202
+ return None
203
+
204
+ exclude_alleles = [
205
+ item for item in names
206
+ if item.startswith("Allele") and item not in lst_allele
207
+ ]
208
+
209
+ return list(filter(
210
+ lambda x: True if x not in exclude_alleles else False, names
211
+ ))
212
+
213
+ def __convert_s_id(self, path_file: Path) -> None:
214
+ """Converts sample id which is in FinalReport to animal registration
215
+ number.
216
+
217
+ :param path_file: xlsx file with animal numbers label
218
+ """
219
+
220
+ self._map_rn = pd.read_excel(
221
+ path_file,
222
+ header=None,
223
+ names=['SID', 'UNIQ_KEY', 'SEX'],
224
+ dtype={'SID': str},
225
+ index_col=False
226
+ )
227
+
228
+ if self._map_rn.empty:
229
+ self._map_rn = None
230
+ return
231
+
232
+ self._map_rn.SID = self._map_rn.SID.str.strip()
233
+ self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.str.strip()
234
+
235
+ if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
236
+ raise Exception("Error. Unique keys contain Cyrillic alphabet.")
237
+
238
+ if self._map_rn.UNIQ_KEY.isna().any():
239
+ self._map_rn.fillna('unknown', inplace=True)
240
+
241
+ @staticmethod
242
+ def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
243
+ """
244
+
245
+ :param seq:
246
+ :return:
247
+ """
248
+
249
+ return any(seq.apply(lambda x: bool(re.search('[а-яА-Я]', x))))
250
+
251
+
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from pathlib import Path
6
+
7
# Directory named "files" located next to this module.
DIR_FILES = Path(__file__).parent / "files"
@@ -0,0 +1,215 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+
6
+ from . import DIR_FILES
7
+ from .._finalreport import FinalReport
8
+
9
+ import pytest
10
+
11
+
12
@pytest.fixture
def report(request) -> FinalReport:
    """ Builds a FinalReport whose allele setting is the value supplied
    through indirect parametrization (``request.param``). """
    allele = request.param
    return FinalReport(allele=allele)
15
+
16
+
17
class TestFinalReport(object):
    """ Tests of FinalReport.handle against the fixture files under
    DIR_FILES / "fr", run with different allele settings.

    NOTE(review): the allele tests assert
    ``columns.difference(_fields).empty``, which only verifies that no
    column lies outside ``_fields``; it does not fail when an expected
    column is missing. Consider an exact comparison once the fixture
    contents are confirmed.
    """

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_handle_1(self, report: FinalReport) -> None:
        """ If both files do not exist """

        assert not report.handle(
            DIR_FILES / "fr/f.txt", DIR_FILES / "fr/f.xlsx",
        )

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_handle_2(self, report: FinalReport) -> None:
        """ If the file to convert does not exist """

        assert not report.handle(
            DIR_FILES / "fr/file1.txt", DIR_FILES / "fr/f.xlsx",
        )

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_handle_3(self, report: FinalReport) -> None:
        """ If the data does not contain header data """

        report.handle(
            DIR_FILES / "fr/file2.txt", DIR_FILES / "fr/file2.xlsx",
        )

        assert len(report.header) == 0 and not report.snp_data.empty

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_handle_4(self, report: FinalReport) -> None:
        """ If the file contains only header and field names """

        report.handle(
            DIR_FILES / "fr/file3.txt", DIR_FILES / "fr/file3.xlsx",
        )

        assert report.snp_data is not None and report.snp_data.empty

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_handle_5(self, report: FinalReport) -> None:
        """ If the data file is empty """

        with pytest.raises(
                Exception, match="Not data in file FinalReport.txt"
        ):
            report.handle(
                DIR_FILES / "fr/file5.txt", DIR_FILES / "fr/file5.xlsx",
            )

        # NOTE(review): the original nesting of this assert could not be
        # recovered from the diff view; it is placed after the ``with``
        # block so that it is actually executed - confirm.
        assert report.snp_data is None

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_handle_6(self, report: FinalReport) -> None:
        """ If the conversion file is empty """

        assert report.handle(
            DIR_FILES / "fr/file6.txt", DIR_FILES / "fr/file6.xlsx",
        )

        assert not report.snp_data.empty
        assert len(report.header) != 0

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_handle_7(self, report: FinalReport) -> None:
        """ If the data file is not needed to convert ID name """

        report.handle(DIR_FILES / "fr/file4.txt", None)

        assert not report.snp_data.empty
        assert len(report.header) != 0

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_handle_8(self, report: FinalReport) -> None:
        """ If files exist """

        assert report.handle(
            DIR_FILES / "fr/file1.txt", DIR_FILES / "fr/file1.xlsx",
        )

    @pytest.mark.parametrize("report", [None], indirect=True)
    def test_allele_none(self, report: FinalReport) -> None:
        """ Without an allele setting no column lies outside the full
        field list """
        report.handle(DIR_FILES / "fr/file4.txt", None)

        _fields = [
            'SNP Name', 'Sample ID', 'Allele1 - Forward', 'Allele2 - Forward',
            'Allele1 - Top', 'Allele2 - Top', 'Allele1 - AB', 'Allele2 - AB',
            'GC Score', 'X', 'Y'
        ]

        assert report.snp_data.columns.difference(_fields).empty

    @pytest.mark.parametrize("report", ["AB"], indirect=True)
    def test_sample_allele_ab(self, report: FinalReport) -> None:
        """ Allele 'AB': no column outside the AB field list """
        report.handle(DIR_FILES / "fr/file4.txt", None)

        _fields = [
            'SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB',
            'GC Score', 'X', 'Y'
        ]

        assert report.snp_data.columns.difference(_fields).empty

    @pytest.mark.parametrize("report", ["Forward"], indirect=True)
    def test_sample_allele_forward(self, report: FinalReport) -> None:
        """ Allele 'Forward': no column outside the Forward field list """
        report.handle(DIR_FILES / "fr/file4.txt", None)

        _fields = [
            'SNP Name', 'Sample ID', 'Allele1 - Forward', 'Allele2 - Forward',
            'GC Score', 'X', 'Y'
        ]

        assert report.snp_data.columns.difference(_fields).empty

    @pytest.mark.parametrize("report", ["Top"], indirect=True)
    def test_sample_allele_top(self, report: FinalReport) -> None:
        """ Allele 'Top': no column outside the Top field list """
        report.handle(DIR_FILES / "fr/file4.txt", None)

        _fields = [
            'SNP Name', 'Sample ID', 'Allele1 - Top', 'Allele2 - Top',
            'GC Score', 'X', 'Y'
        ]

        assert report.snp_data.columns.difference(_fields).empty

    @pytest.mark.parametrize("report", [["AB", "Top"]], indirect=True)
    def test_sample_allele_list1(self, report: FinalReport) -> None:
        """ Alleles given as a list of two names """
        report.handle(DIR_FILES / "fr/file4.txt", None)

        _fields = [
            'SNP Name', 'Sample ID', 'Allele1 - Top', 'Allele2 - Top',
            'Allele1 - AB', 'Allele2 - AB', 'GC Score', 'X', 'Y'
        ]

        assert report.snp_data.columns.difference(_fields).empty

    @pytest.mark.parametrize("report", [["AB"]], indirect=True)
    def test_sample_allele_list2(self, report: FinalReport) -> None:
        """ Alleles given as a list with a single name """
        report.handle(DIR_FILES / "fr/file4.txt", None)

        _fields = [
            'SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB',
            'GC Score', 'X', 'Y'
        ]

        assert report.snp_data.columns.difference(_fields).empty

    @pytest.mark.parametrize("report", [("AB", "Top")], indirect=True)
    def test_sample_allele_tuple(self, report: FinalReport) -> None:
        """ Alleles given as a tuple """
        report.handle(DIR_FILES / "fr/file4.txt", None)

        _fields = [
            'SNP Name', 'Sample ID', 'Allele1 - Top', 'Allele2 - Top',
            'Allele1 - AB', 'Allele2 - AB', 'GC Score', 'X', 'Y'
        ]

        assert report.snp_data.columns.difference(_fields).empty

    @pytest.mark.parametrize("report", [{"AB", "Top"}], indirect=True)
    def test_sample_allele_set(self, report: FinalReport) -> None:
        """ Alleles given as a set """
        report.handle(DIR_FILES / "fr/file4.txt", None)

        _fields = [
            'SNP Name', 'Sample ID', 'Allele1 - Top', 'Allele2 - Top',
            'Allele1 - AB', 'Allele2 - AB', 'GC Score', 'X', 'Y'
        ]

        assert report.snp_data.columns.difference(_fields).empty

    @pytest.mark.parametrize("report", ["GG"], indirect=True)
    def test_sample_allele_not_exist(self, report: FinalReport) -> None:
        """ An allele name that is absent from the data raises an error """

        with pytest.raises(
                Exception, match="Error. Allele GG not in data."
        ):
            report.handle(DIR_FILES / "fr/file4.txt", None)

    @pytest.mark.parametrize("report", ["AB"], indirect=True)
    def test_7(self, report: FinalReport) -> None:
        """ Cyrillic unique keys in the conversion file raise an error """

        with pytest.raises(
                Exception, match="Error. Unique keys contain Cyrillic alphabet."
        ):
            report.handle(
                DIR_FILES / "fr/file7.txt", DIR_FILES / "fr/file7.xlsx"
            )
202
+
203
+ # assert not report.snp_data.empty
204
+ #
205
+ # @pytest.mark.parametrize("report", ["AB"], indirect=True)
206
+ # def test_8(self, report: FinalReport) -> None:
207
+ # ...
208
+ #
209
+ # @pytest.mark.parametrize("report", ["AB"], indirect=True)
210
+ # def test_9(self, report: FinalReport) -> None:
211
+ # ...
212
+ #
213
+ # @pytest.mark.parametrize("report", ["AB"], indirect=True)
214
+ # def test_10(self, report: FinalReport) -> None:
215
+ # ...
format/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._snp import Snp
6
+ from ._plink import (
7
+ make_map,
8
+ make_ped,
9
+ make_lgen,
10
+ make_fam
11
+ )
12
+
13
+ __all__ = [
14
+ "Snp",
15
+ "make_map",
16
+ "make_ped",
17
+ "make_fam",
18
+ "make_lgen"
19
+ ]
format/__settings.py ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
# Illumina report column name -> renamed field. The two lists below are
# derived from this mapping and keep its insertion order.
MAP_FIELDS = {
    'SNP Name': 'SNP_NAME',
    'Sample ID': 'SAMPLE_ID',
    'Allele1 - AB': 'ALLELE1',
    'Allele2 - AB': 'ALLELE2',
}
FIELDS_ILLUMIN = list(MAP_FIELDS)
RENAME_FIELDS = list(MAP_FIELDS.values())