snplib 1.1.10__py3-none-any.whl → 1.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
snplib/__init__.py CHANGED
@@ -1,8 +1,8 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from .finalreport import *
6
- from .format import *
7
- from .parentage import *
8
- from .statistics import *
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from .finalreport import *
6
+ from .format import *
7
+ from .parentage import *
8
+ from .statistics import *
@@ -1,7 +1,7 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from ._finalreport import FinalReport
6
-
7
- __all__ = ["FinalReport"]
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._finalreport import FinalReport
6
+
7
+ __all__ = ["FinalReport"]
@@ -1,253 +1,305 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
- __all__ = ("FinalReport",)
5
-
6
- from pathlib import Path
7
- from functools import reduce
8
-
9
- import re
10
-
11
- from numpy import nan
12
- import pandas as pd
13
-
14
-
15
- class FinalReport(object):
16
- """ File that contains SNP information. File processing is triggered by the
17
- handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx
18
- conversion file, the processed data will contain NAN values.
19
-
20
- :argument allele: A variant form of a single nucleotide polymorphism
21
- (SNP), a specific polymorphic site or a whole gene detectable at
22
- a locus. Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'
23
- :argument sep: Delimiter to use. Default value: "\\t"
24
-
25
- Example:
26
- [Header]
27
- GSGT Version 2.0.4
28
- Processing Date 10/14/2021 4:02 PM
29
- Content BovineSNP50_v3_A1.bpm
30
- Num SNPs 53218
31
- Total SNPs 53218
32
- Num Samples 3
33
- Total Samples 3
34
- [Data]
35
- SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
36
- ABCA12 1 A A 0.4048 0.8164
37
- APAF1 1 B B 0.9067 0.9155
38
- ...
39
- """
40
-
41
- __PATTERN_HEADER = re.compile(r'(^\[Header\])')
42
- __PATTERN_DATA = re.compile(r'(^\[Data\])')
43
-
44
- def __init__(
45
- self,
46
- allele: str | list | None = None,
47
- sep: str = "\t"
48
- ) -> None:
49
- self._delimiter = sep
50
- self._full_data = None
51
-
52
- self.__header = {}
53
- self.__snp_data = None
54
- self.__allele = allele
55
- self._map_rn = None
56
-
57
- @property
58
- def header(self) -> dict:
59
- return self.__header
60
-
61
- @property
62
- def snp_data(self) -> pd.DataFrame | None:
63
- return self.__snp_data
64
-
65
- def handle(
66
- self, file_rep: Path | str, conv_file: Path | str = None
67
- ) -> bool:
68
- """ Processes the FinalReport.txt file. Highlights meta information
69
- and data.
70
-
71
- :param file_rep: The file FinalReport.txt or another name.
72
- :param conv_file: The file that contains IDs of registration numbers
73
- of animals.
74
- :return: Returns true if file processing was successful, false if
75
- there were errors.
76
- """
77
-
78
- try:
79
-
80
- if isinstance(file_rep, str):
81
- file_rep = Path(file_rep)
82
-
83
- if not file_rep.is_file() and not file_rep.exists():
84
- return False
85
-
86
- # Processing conversion file
87
- if conv_file is not None:
88
- if isinstance(conv_file, str):
89
- conv_file = Path(conv_file)
90
-
91
- if not conv_file.is_file() and not conv_file.exists():
92
- return False
93
-
94
- self.__convert_s_id(conv_file)
95
-
96
- # Processing report file
97
- if not self.read(file_rep):
98
- return False
99
-
100
- if self._full_data is None:
101
- raise Exception("Not data in file FinalReport.txt")
102
-
103
- self.__handler_header()
104
- self.__handler_data()
105
-
106
- if self._map_rn is not None:
107
- self.__snp_data['Sample ID'] = \
108
- self.__snp_data['Sample ID'].map(
109
- dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
110
- )
111
-
112
- except Exception as e:
113
- raise e
114
-
115
- return True
116
-
117
- def read(self, file_rep: Path) -> bool:
118
- """ Reading data from the final_report file
119
-
120
- :param file_rep: path, pointer to the file to be read.
121
- :return: Returns true if the read was successful, false if it failed.
122
- """
123
- try:
124
- if len(data := file_rep.read_text()) != 0:
125
- self._full_data = data.strip().split("\n")
126
- return True
127
-
128
- self._full_data = None
129
-
130
- except Exception as e:
131
- return False
132
-
133
- return True
134
-
135
- def __handler_header(self) -> None:
136
- """ Processes data from a file, selects meta-information. """
137
-
138
- for line in self._full_data:
139
- if self.__class__.__PATTERN_DATA.findall(line):
140
- return
141
-
142
- if self.__class__.__PATTERN_HEADER.findall(line):
143
- continue
144
-
145
- key = line.strip().split("\t")[0]
146
- value = line.strip().split("\t")[1]
147
-
148
- self.__header[key] = value
149
-
150
- def __handler_data(self) -> None:
151
- """ Processes data and forms an array for further processing. """
152
-
153
- temp = 1
154
- for line in self._full_data:
155
- if self.__class__.__PATTERN_DATA.findall(line):
156
- break
157
- temp += 1
158
-
159
- names_col = self.__sample_by_allele(
160
- self._full_data[temp].split(f"{self._delimiter}")
161
- )
162
-
163
- if names_col is None:
164
- raise Exception(f"Error. Allele {self.__allele} not in data.")
165
-
166
- self.__snp_data = pd.DataFrame(
167
- [
168
- item_data.split(f"{self._delimiter}")
169
- for item_data in self._full_data[temp + 1:]
170
- ],
171
- columns=self._full_data[temp].split(f"{self._delimiter}")
172
- )[names_col]
173
-
174
- def __sample_by_allele(self, names: list[str]) -> list[str] | None:
175
- """ Method that generates a list of field names choosing which alleles
176
- to keep
177
-
178
- :param names: List of field names in the report file.
179
- :return: Returns a filtered list of fields by alleles.
180
- """
181
-
182
- allele_templ = r'(^Allele\d\s[:-]\s{}\b)'
183
-
184
- match self.__allele:
185
- case None:
186
- return names
187
-
188
- case str():
189
- allele_pattern = re.compile(
190
- allele_templ.format(self.__allele)
191
- )
192
-
193
- case list() | tuple() | set():
194
- allele_pattern = re.compile(
195
- allele_templ.format("|".join(self.__allele))
196
- )
197
- case _:
198
- return None
199
-
200
- lst_allele = reduce(
201
- lambda i, j: i + j,
202
- [allele_pattern.findall(item) for item in names]
203
- )
204
-
205
- if len(lst_allele) == 0:
206
- return None
207
-
208
- exclude_alleles = [
209
- item for item in names
210
- if item.startswith("Allele") and item not in lst_allele
211
- ]
212
-
213
- return list(filter(
214
- lambda x: True if x not in exclude_alleles else False, names
215
- ))
216
-
217
- def __convert_s_id(self, path_file: Path) -> None:
218
- """Converts sample id which is in FinalReport to animal registration
219
- number.
220
-
221
- :param path_file: xlsx file with animal numbers label
222
- """
223
-
224
- self._map_rn = pd.read_excel(
225
- path_file,
226
- header=None,
227
- names=['SID', 'UNIQ_KEY', 'SEX'],
228
- dtype={'SID': str},
229
- index_col=False
230
- )
231
-
232
- if self._map_rn.empty:
233
- self._map_rn = None
234
- return
235
-
236
- self._map_rn.SID = self._map_rn.SID.str.strip()
237
- self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.str.strip()
238
-
239
- if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
240
- raise Exception("Error. Unique keys contain Cyrillic alphabet.")
241
-
242
- @staticmethod
243
- def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
244
- """ Checial verification of the Cyrillic
245
-
246
- :param seq: Squeezed for verification.
247
- :return: Truth if there are no symbols of Cyril and there is a lie if
248
- there is.
249
- """
250
-
251
- return seq.apply(
252
- lambda x: bool(re.search('[а-яА-Я]', x)) if x is not nan else x
253
- ).any()
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+ __all__ = ("FinalReport",)
5
+
6
+ import re
7
+ from functools import reduce
8
+ from pathlib import Path
9
+
10
+ import pandas as pd
11
+ from numpy import nan
12
+
13
+
14
+ class FinalReport(object):
15
+ """ File that contains SNP information. File processing is triggered by the
16
+ handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx
17
+ conversion file, the processed data will contain NAN values.
18
+
19
+ :param allele: A variant form of a single nucleotide polymorphism (SNP), a
20
+ specific polymorphic site or a whole gene detectable at a locus. Type:
21
+ 'AB', 'Forward', 'Top', 'Plus', 'Design'.
22
+ :param sep: Delimiter to use. Default value: "\\t".
23
+ :param usecols: Selection of fields for reading. Accelerates processing
24
+ and reduces memory.
25
+ :param dtype: Data type(s) to apply to either the whole dataset or
26
+ individual columns. E.g., {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
27
+
28
+ Example:
29
+ [Header]
30
+ GSGT Version 2.0.4
31
+ Processing Date 10/14/2021 4:02 PM
32
+ Content BovineSNP50_v3_A1.bpm
33
+ Num SNPs 53218
34
+ Total SNPs 53218
35
+ Num Samples 3
36
+ Total Samples 3
37
+ [Data]
38
+ SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
39
+ ABCA12 1 A A 0.4048 0.8164
40
+ APAF1 1 B B 0.9067 0.9155
41
+ ...
42
+ """
43
+
44
+ __PATTERN_HEADER = re.compile(r'(^\[Header])')
45
+ __PATTERN_DATA = re.compile(r'(^\[Data])')
46
+
47
+ __slots__ = (
48
+ "_delimiter",
49
+ "__allele",
50
+ "__usecols",
51
+ "__dtype",
52
+ "__snp_data",
53
+ "__header",
54
+ "_map_rn",
55
+ )
56
+
57
+ def __init__(
58
+ self,
59
+ allele: str | list | None = None,
60
+ usecols: list[str] | None = None,
61
+ dtype: dict | None = None,
62
+ sep: str = "\t"
63
+ ) -> None:
64
+ self._delimiter = sep
65
+ self.__allele = allele
66
+ self.__usecols = usecols
67
+ self.__dtype = dtype
68
+
69
+ # self._full_data = None
70
+ self.__snp_data: pd.DataFrame | None = None
71
+ self.__header = {}
72
+ self._map_rn = None
73
+
74
+ @property
75
+ def header(self) -> dict:
76
+ return self.__header
77
+
78
+ @property
79
+ def snp_data(self) -> pd.DataFrame | None:
80
+ return self.__snp_data
81
+
82
+ def handle(
83
+ self, file_rep: Path | str, conv_file: Path | str = None
84
+ ) -> bool:
85
+ """ Processes the FinalReport.txt file. Highlights meta information
86
+ and data.
87
+
88
+ :param file_rep: The file FinalReport.txt or another name.
89
+ :param conv_file: The file that contains IDs of registration numbers
90
+ of animals.
91
+ :return: Returns true if file processing was successful, false if
92
+ there were errors.
93
+ """
94
+
95
+ try:
96
+
97
+ if self.__allele is not None and self.__usecols is not None:
98
+ raise Exception("Error. Usecols is used for allele is none.")
99
+
100
+ if isinstance(file_rep, str):
101
+ file_rep = Path(file_rep)
102
+
103
+ if not file_rep.is_file() and not file_rep.exists():
104
+ return False
105
+
106
+ # Processing conversion file
107
+ if conv_file is not None:
108
+ if isinstance(conv_file, str):
109
+ conv_file = Path(conv_file)
110
+
111
+ if not conv_file.is_file() and not conv_file.exists():
112
+ return False
113
+
114
+ self.__convert_s_id(conv_file)
115
+
116
+ # # Processing report file
117
+ self.__handler_header(file_rep)
118
+ self.__handler_data(file_rep)
119
+
120
+ if not self.__snp_data.empty and self._map_rn is not None:
121
+ self.__snp_data['Sample ID'] = \
122
+ self.__snp_data['Sample ID'].map(
123
+ dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
124
+ )
125
+
126
+ except Exception as e:
127
+ raise e
128
+
129
+ return True
130
+
131
+ def __handler_header(self, file_rep: Path) -> None:
132
+ """ Processes data from a file, selects meta-information.
133
+
134
+ :param file_rep: path, pointer to the file to be read.
135
+ """
136
+
137
+ with open(file_rep, 'r') as file:
138
+
139
+ for line in file:
140
+ if self.__class__.__PATTERN_DATA.findall(line.strip()):
141
+ return
142
+
143
+ if self.__class__.__PATTERN_HEADER.findall(line.strip()) or\
144
+ len(line.strip()) == 0:
145
+ continue
146
+
147
+ key = line.strip().split("\t")[0]
148
+ value = line.strip().split("\t")[1]
149
+
150
+ self.__header[key] = value
151
+
152
+ def __handler_data(self, file_rep: Path) -> None:
153
+ """ Processes data and forms an array for further processing.
154
+
155
+ :param file_rep: path, pointer to the file to be read.
156
+ """
157
+
158
+ with open(file_rep, 'r') as file:
159
+
160
+ # Search for the data start index and skip
161
+ for line in file:
162
+ if self.__class__.__PATTERN_DATA.findall(line.strip()):
163
+ break
164
+
165
+ # line column
166
+ orig_name_col = file.readline().strip().split(self._delimiter)
167
+
168
+ if self.__allele is None and self.__usecols is None:
169
+ self.__snp_data = pd.read_csv(
170
+ file,
171
+ sep=self._delimiter,
172
+ header=None,
173
+ names=orig_name_col,
174
+ dtype=self.__dtype,
175
+ low_memory=True,
176
+ na_filter=True
177
+ )
178
+
179
+ return
180
+
181
+ sub_n_col = self.__processing_columns(orig_name_col)
182
+ self.__snp_data = pd.read_csv(
183
+ file,
184
+ sep=self._delimiter,
185
+ header=None,
186
+ names=orig_name_col,
187
+ usecols=sub_n_col,
188
+ dtype=self.__dtype,
189
+ low_memory=True,
190
+ na_filter=True
191
+ )
192
+
193
+ return
194
+
195
+ def __processing_columns(self, lst_col: list[str]) -> list[str] | None:
196
+ """ Processing the line with all the names of the fields and the
197
+ sample of them.
198
+
199
+ :param lst_col: List of all fields.
200
+ :return: Returns a tuple with a list of names of selected fields.
201
+ """
202
+
203
+ if self.__usecols is not None:
204
+ check_n_col = [
205
+ item for item in self.__usecols if item in lst_col
206
+ ]
207
+
208
+ # Check on empty list
209
+ if check_n_col:
210
+ return self.__usecols
211
+
212
+ raise Exception(
213
+ f"Error. The USECOLS list contains not true fields."
214
+ )
215
+
216
+ # processing alleles
217
+ sample_n_col = self.__sample_by_allele(lst_col)
218
+ if sample_n_col is None:
219
+ raise Exception(
220
+ f"Error. Allele {self.__allele} not in data."
221
+ )
222
+
223
+ return sample_n_col
224
+
225
+ def __sample_by_allele(self, names: list[str]) -> list[str] | None:
226
+ """ Method that generates a list of field names choosing which alleles
227
+ to keep
228
+
229
+ :param names: List of field names in the report file.
230
+ :return: Returns a filtered list of fields by alleles.
231
+ """
232
+
233
+ allele_templ = r'(^Allele\d\s[:-]\s{}\b)'
234
+
235
+ match self.__allele:
236
+ case None:
237
+ return names
238
+
239
+ case str():
240
+ allele_pattern = re.compile(
241
+ allele_templ.format(self.__allele)
242
+ )
243
+
244
+ case list() | tuple() | set():
245
+ allele_pattern = re.compile(
246
+ allele_templ.format("|".join(self.__allele))
247
+ )
248
+ case _:
249
+ return None
250
+
251
+ lst_allele = reduce(
252
+ lambda i, j: i + j,
253
+ [allele_pattern.findall(item) for item in names]
254
+ )
255
+
256
+ if len(lst_allele) == 0:
257
+ return None
258
+
259
+ exclude_alleles = [
260
+ item for item in names
261
+ if item.startswith("Allele") and item not in lst_allele
262
+ ]
263
+
264
+ return list(filter(
265
+ lambda x: True if x not in exclude_alleles else False, names
266
+ ))
267
+
268
+ def __convert_s_id(self, path_file: Path) -> None:
269
+ """Converts sample id which is in FinalReport to animal registration
270
+ number.
271
+
272
+ :param path_file: xlsx file with animal numbers label
273
+ """
274
+
275
+ self._map_rn = pd.read_excel(
276
+ path_file,
277
+ header=None,
278
+ names=['SID', 'UNIQ_KEY', 'SEX'],
279
+ index_col=False
280
+ )
281
+
282
+ if self._map_rn.empty:
283
+ self._map_rn = None
284
+ return
285
+
286
+ if self._map_rn.SID.dtypes == "O":
287
+ self._map_rn.SID = self._map_rn.SID.str.strip()
288
+
289
+ self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.str.strip()
290
+
291
+ if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
292
+ raise Exception("Error. Unique keys contain Cyrillic alphabet.")
293
+
294
+ @staticmethod
295
+ def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
296
+ """ Checial verification of the Cyrillic
297
+
298
+ :param seq: Squeezed for verification.
299
+ :return: Truth if there are no symbols of Cyril and there is a lie if
300
+ there is.
301
+ """
302
+
303
+ return seq.apply(
304
+ lambda x: bool(re.search('[а-яА-Я]', x)) if x is not nan else x
305
+ ).any()
snplib/format/__init__.py CHANGED
@@ -1,19 +1,19 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from ._snp import Snp
6
- from ._plink import (
7
- make_map,
8
- make_ped,
9
- make_lgen,
10
- make_fam
11
- )
12
-
13
- __all__ = [
14
- "Snp",
15
- "make_map",
16
- "make_ped",
17
- "make_fam",
18
- "make_lgen"
19
- ]
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._snp import Snp
6
+ from ._plink import (
7
+ make_map,
8
+ make_ped,
9
+ make_lgen,
10
+ make_fam
11
+ )
12
+
13
+ __all__ = [
14
+ "Snp",
15
+ "make_map",
16
+ "make_ped",
17
+ "make_fam",
18
+ "make_lgen"
19
+ ]
@@ -1,7 +1,7 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
# Report column name -> renamed field name, in column order.
MAP_FIELDS = {
    'SNP Name': 'SNP_NAME',
    'Sample ID': 'SAMPLE_ID',
    'Allele1 - AB': 'ALLELE1',
    'Allele2 - AB': 'ALLELE2',
}

# Both lists are derived from the mapping so the three names cannot drift.
FIELDS_ILLUMIN = list(MAP_FIELDS.keys())
RENAME_FIELDS = list(MAP_FIELDS.values())
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
# Pairs of (report column name, renamed field name), in column order.
_FIELD_PAIRS = (
    ('SNP Name', 'SNP_NAME'),
    ('Sample ID', 'SAMPLE_ID'),
    ('Allele1 - AB', 'ALLELE1'),
    ('Allele2 - AB', 'ALLELE2'),
)

FIELDS_ILLUMIN = [source for source, _ in _FIELD_PAIRS]
RENAME_FIELDS = [target for _, target in _FIELD_PAIRS]
# Lookup used to rename the report columns.
MAP_FIELDS = dict(_FIELD_PAIRS)