snplib 1.0.10__py3-none-any.whl → 1.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snplib/finalreport/_finalreport.py +124 -71
- {snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/METADATA +7 -7
- {snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/RECORD +6 -6
- {snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/WHEEL +1 -1
- {snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/LICENSE +0 -0
- {snplib-1.0.10.dist-info → snplib-1.2.10.dist-info}/top_level.txt +0 -0
@@ -3,20 +3,27 @@
|
|
3
3
|
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
4
|
__all__ = ("FinalReport",)
|
5
5
|
|
6
|
-
|
6
|
+
import re
|
7
7
|
from functools import reduce
|
8
|
+
from pathlib import Path
|
8
9
|
|
9
|
-
import re
|
10
10
|
import pandas as pd
|
11
|
+
from numpy import nan
|
11
12
|
|
12
13
|
|
13
14
|
class FinalReport(object):
|
14
|
-
""" File that contains SNP information.
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
15
|
+
""" File that contains SNP information. File processing is triggered by the
|
16
|
+
handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx
|
17
|
+
conversion file, the processed data will contain NAN values.
|
18
|
+
|
19
|
+
:param allele: A variant form of a single nucleotide polymorphism (SNP), a
|
20
|
+
specific polymorphic site or a whole gene detectable at a locus. Type:
|
21
|
+
'AB', 'Forward', 'Top', 'Plus', 'Design'.
|
22
|
+
:param sep: Delimiter to use. Default value: "\\t".
|
23
|
+
:param usecols: Selection of fields for reading. Accelerates processing
|
24
|
+
and reduces memory.
|
25
|
+
:param dtype: Data type(s) to apply to either the whole dataset or
|
26
|
+
individual columns. E.g., {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
|
20
27
|
|
21
28
|
Example:
|
22
29
|
[Header]
|
@@ -34,20 +41,34 @@ class FinalReport(object):
|
|
34
41
|
...
|
35
42
|
"""
|
36
43
|
|
37
|
-
__PATTERN_HEADER = re.compile(r'(^\[Header
|
38
|
-
__PATTERN_DATA = re.compile(r'(^\[Data
|
44
|
+
__PATTERN_HEADER = re.compile(r'(^\[Header])')
|
45
|
+
__PATTERN_DATA = re.compile(r'(^\[Data])')
|
46
|
+
|
47
|
+
__slots__ = (
|
48
|
+
"_delimiter",
|
49
|
+
"__allele",
|
50
|
+
"__usecols",
|
51
|
+
"__dtype",
|
52
|
+
"__snp_data",
|
53
|
+
"__header",
|
54
|
+
"_map_rn",
|
55
|
+
)
|
39
56
|
|
40
57
|
def __init__(
|
41
58
|
self,
|
42
59
|
allele: str | list | None = None,
|
60
|
+
usecols: list[str] | None = None,
|
61
|
+
dtype: dict | None = None,
|
43
62
|
sep: str = "\t"
|
44
63
|
) -> None:
|
45
64
|
self._delimiter = sep
|
46
|
-
self.
|
65
|
+
self.__allele = allele
|
66
|
+
self.__usecols = usecols
|
67
|
+
self.__dtype = dtype
|
47
68
|
|
69
|
+
# self._full_data = None
|
70
|
+
self.__snp_data: pd.DataFrame | None = None
|
48
71
|
self.__header = {}
|
49
|
-
self.__snp_data = None
|
50
|
-
self.__allele = allele
|
51
72
|
self._map_rn = None
|
52
73
|
|
53
74
|
@property
|
@@ -73,6 +94,9 @@ class FinalReport(object):
|
|
73
94
|
|
74
95
|
try:
|
75
96
|
|
97
|
+
if self.__allele is not None and self.__usecols is not None:
|
98
|
+
raise Exception("Error. Usecols is used for allele is none.")
|
99
|
+
|
76
100
|
if isinstance(file_rep, str):
|
77
101
|
file_rep = Path(file_rep)
|
78
102
|
|
@@ -89,17 +113,11 @@ class FinalReport(object):
|
|
89
113
|
|
90
114
|
self.__convert_s_id(conv_file)
|
91
115
|
|
92
|
-
# Processing report file
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
if self._full_data is None:
|
97
|
-
raise Exception("Not data in file FinalReport.txt")
|
98
|
-
|
99
|
-
self.__handler_header()
|
100
|
-
self.__handler_data()
|
116
|
+
# # Processing report file
|
117
|
+
self.__handler_header(file_rep)
|
118
|
+
self.__handler_data(file_rep)
|
101
119
|
|
102
|
-
if self._map_rn is not None:
|
120
|
+
if not self.__snp_data.empty and self._map_rn is not None:
|
103
121
|
self.__snp_data['Sample ID'] = \
|
104
122
|
self.__snp_data['Sample ID'].map(
|
105
123
|
dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
|
@@ -110,62 +128,99 @@ class FinalReport(object):
|
|
110
128
|
|
111
129
|
return True
|
112
130
|
|
113
|
-
def
|
114
|
-
"""
|
131
|
+
def __handler_header(self, file_rep: Path) -> None:
|
132
|
+
""" Processes data from a file, selects meta-information.
|
115
133
|
|
116
134
|
:param file_rep: path, pointer to the file to be read.
|
117
|
-
:return: Returns true if the read was successful, false if it failed.
|
118
135
|
"""
|
119
|
-
try:
|
120
|
-
if len(data := file_rep.read_text()) != 0:
|
121
|
-
self._full_data = data.strip().split("\n")
|
122
|
-
return True
|
123
136
|
|
124
|
-
|
137
|
+
with open(file_rep, 'r') as file:
|
125
138
|
|
126
|
-
|
127
|
-
|
139
|
+
for line in file:
|
140
|
+
if self.__class__.__PATTERN_DATA.findall(line.strip()):
|
141
|
+
return
|
128
142
|
|
129
|
-
|
143
|
+
if self.__class__.__PATTERN_HEADER.findall(line.strip()) or\
|
144
|
+
len(line.strip()) == 0:
|
145
|
+
continue
|
130
146
|
|
131
|
-
|
132
|
-
|
147
|
+
key = line.strip().split("\t")[0]
|
148
|
+
value = line.strip().split("\t")[1]
|
149
|
+
|
150
|
+
self.__header[key] = value
|
151
|
+
|
152
|
+
def __handler_data(self, file_rep: Path) -> None:
|
153
|
+
""" Processes data and forms an array for further processing.
|
154
|
+
|
155
|
+
:param file_rep: path, pointer to the file to be read.
|
156
|
+
"""
|
157
|
+
|
158
|
+
with open(file_rep, 'r') as file:
|
159
|
+
|
160
|
+
# Search for the data start index and skip
|
161
|
+
for line in file:
|
162
|
+
if self.__class__.__PATTERN_DATA.findall(line.strip()):
|
163
|
+
break
|
164
|
+
|
165
|
+
# Line with the column names
|
166
|
+
orig_name_col = file.readline().strip().split(self._delimiter)
|
167
|
+
|
168
|
+
if self.__allele is None and self.__usecols is None:
|
169
|
+
self.__snp_data = pd.read_csv(
|
170
|
+
file,
|
171
|
+
sep=self._delimiter,
|
172
|
+
header=None,
|
173
|
+
names=orig_name_col,
|
174
|
+
dtype=self.__dtype,
|
175
|
+
low_memory=True,
|
176
|
+
na_filter=True
|
177
|
+
)
|
133
178
|
|
134
|
-
for line in self._full_data:
|
135
|
-
if self.__class__.__PATTERN_DATA.findall(line):
|
136
179
|
return
|
137
180
|
|
138
|
-
|
139
|
-
|
181
|
+
sub_n_col = self.__processing_columns(orig_name_col)
|
182
|
+
self.__snp_data = pd.read_csv(
|
183
|
+
file,
|
184
|
+
sep=self._delimiter,
|
185
|
+
header=None,
|
186
|
+
names=orig_name_col,
|
187
|
+
usecols=sub_n_col,
|
188
|
+
dtype=self.__dtype,
|
189
|
+
low_memory=True,
|
190
|
+
na_filter=True
|
191
|
+
)
|
140
192
|
|
141
|
-
|
142
|
-
value = line.strip().split("\t")[1]
|
193
|
+
return
|
143
194
|
|
144
|
-
|
195
|
+
def __processing_columns(self, lst_col: list[str]) -> list[str] | None:
|
196
|
+
""" Processes the list of all field names and selects the subset of
|
197
|
+
fields to read.
|
145
198
|
|
146
|
-
|
147
|
-
|
199
|
+
:param lst_col: List of all fields.
|
200
|
+
:return: Returns a list with the names of the selected fields.
|
201
|
+
"""
|
148
202
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
temp += 1
|
203
|
+
if self.__usecols is not None:
|
204
|
+
check_n_col = [
|
205
|
+
item for item in self.__usecols if item in lst_col
|
206
|
+
]
|
154
207
|
|
155
|
-
|
156
|
-
|
157
|
-
|
208
|
+
# Check on empty list
|
209
|
+
if check_n_col:
|
210
|
+
return self.__usecols
|
158
211
|
|
159
|
-
|
160
|
-
|
212
|
+
raise Exception(
|
213
|
+
f"Error. The USECOLS list contains not true fields."
|
214
|
+
)
|
161
215
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
216
|
+
# processing alleles
|
217
|
+
sample_n_col = self.__sample_by_allele(lst_col)
|
218
|
+
if sample_n_col is None:
|
219
|
+
raise Exception(
|
220
|
+
f"Error. Allele {self.__allele} not in data."
|
221
|
+
)
|
222
|
+
|
223
|
+
return sample_n_col
|
169
224
|
|
170
225
|
def __sample_by_allele(self, names: list[str]) -> list[str] | None:
|
171
226
|
""" Method that generates a list of field names choosing which alleles
|
@@ -235,17 +290,15 @@ class FinalReport(object):
|
|
235
290
|
if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
|
236
291
|
raise Exception("Error. Unique keys contain Cyrillic alphabet.")
|
237
292
|
|
238
|
-
if self._map_rn.UNIQ_KEY.isna().any():
|
239
|
-
self._map_rn.fillna('unknown', inplace=True)
|
240
|
-
|
241
293
|
@staticmethod
|
242
294
|
def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
|
243
|
-
"""
|
295
|
+
""" Checks the values for Cyrillic characters.
|
244
296
|
|
245
|
-
:param seq:
|
246
|
-
:return:
|
297
|
+
:param seq: Series of values to check.
|
298
|
+
:return: True if at least one value contains Cyrillic letters, False
|
299
|
+
otherwise.
|
247
300
|
"""
|
248
301
|
|
249
|
-
return
|
250
|
-
|
251
|
-
|
302
|
+
return seq.apply(
|
303
|
+
lambda x: bool(re.search('[а-яА-Я]', x)) if x is not nan else x
|
304
|
+
).any()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: snplib
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.2.10
|
4
4
|
Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
|
5
5
|
Author-email: Igor <igor.loschinin@gmail.com>
|
6
6
|
License: GNU
|
@@ -10,14 +10,14 @@ Classifier: Operating System :: OS Independent
|
|
10
10
|
Requires-Python: >=3.10
|
11
11
|
Description-Content-Type: text/markdown
|
12
12
|
License-File: LICENSE
|
13
|
-
Requires-Dist: numpy>=
|
14
|
-
Requires-Dist: pandas>=2.
|
15
|
-
Requires-Dist: six>=1.
|
13
|
+
Requires-Dist: numpy>=2.2.3
|
14
|
+
Requires-Dist: pandas>=2.2.3
|
15
|
+
Requires-Dist: six>=1.17.0
|
16
16
|
Requires-Dist: swifter>=1.4.0
|
17
17
|
Requires-Dist: xlrd>=2.0.1
|
18
|
-
Requires-Dist: XlsxWriter>=3.
|
19
|
-
Requires-Dist: openpyxl>=3.1.
|
20
|
-
Requires-Dist: pydantic>=2.
|
18
|
+
Requires-Dist: XlsxWriter>=3.2.2
|
19
|
+
Requires-Dist: openpyxl>=3.1.5
|
20
|
+
Requires-Dist: pydantic>=2.10.6
|
21
21
|
|
22
22
|
# snptools
|
23
23
|
<p align="center">
|
@@ -1,6 +1,6 @@
|
|
1
1
|
snplib/__init__.py,sha256=xhjj4ZywdwCq91GBh1zfBP_TwFW26-KpHcCUUVvMdgI,196
|
2
2
|
snplib/finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
|
3
|
-
snplib/finalreport/_finalreport.py,sha256=
|
3
|
+
snplib/finalreport/_finalreport.py,sha256=_VXv8ayTIJBGGkpXYtrvBEp2HNuQ8Dh3zqS8HadlnHo,7501
|
4
4
|
snplib/format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
|
5
5
|
snplib/format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
|
6
6
|
snplib/format/_plink.py,sha256=cjT6PkvDJr8KwvQo76i7_Hm1Og4bASYCDN9G7CHsQ00,10372
|
@@ -15,8 +15,8 @@ snplib/statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw
|
|
15
15
|
snplib/statistics/_callrate.py,sha256=yfHxnNVpcDfV3qxZVwrk2RWPgy9dTf7NHWczDUORwtY,1866
|
16
16
|
snplib/statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
|
17
17
|
snplib/statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
|
18
|
-
snplib-1.
|
19
|
-
snplib-1.
|
20
|
-
snplib-1.
|
21
|
-
snplib-1.
|
22
|
-
snplib-1.
|
18
|
+
snplib-1.2.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
19
|
+
snplib-1.2.10.dist-info/METADATA,sha256=ZjG7lspM2kiKEQZCTFHQDNY3yrU6zyfTnLBaYGAzRU4,2184
|
20
|
+
snplib-1.2.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
21
|
+
snplib-1.2.10.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
|
22
|
+
snplib-1.2.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|