snplib 1.1.10__py3-none-any.whl → 1.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snplib/finalreport/_finalreport.py +113 -62
- {snplib-1.1.10.dist-info → snplib-1.2.10.dist-info}/METADATA +1 -1
- {snplib-1.1.10.dist-info → snplib-1.2.10.dist-info}/RECORD +6 -6
- {snplib-1.1.10.dist-info → snplib-1.2.10.dist-info}/WHEEL +1 -1
- {snplib-1.1.10.dist-info → snplib-1.2.10.dist-info}/LICENSE +0 -0
- {snplib-1.1.10.dist-info → snplib-1.2.10.dist-info}/top_level.txt +0 -0
@@ -3,13 +3,12 @@
|
|
3
3
|
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
4
|
__all__ = ("FinalReport",)
|
5
5
|
|
6
|
-
from pathlib import Path
|
7
|
-
from functools import reduce
|
8
|
-
|
9
6
|
import re
|
7
|
+
from functools import reduce
|
8
|
+
from pathlib import Path
|
10
9
|
|
11
|
-
from numpy import nan
|
12
10
|
import pandas as pd
|
11
|
+
from numpy import nan
|
13
12
|
|
14
13
|
|
15
14
|
class FinalReport(object):
|
@@ -17,10 +16,14 @@ class FinalReport(object):
|
|
17
16
|
handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx
|
18
17
|
conversion file, the processed data will contain NAN values.
|
19
18
|
|
20
|
-
:
|
21
|
-
|
22
|
-
|
23
|
-
:
|
19
|
+
:param allele: A variant form of a single nucleotide polymorphism (SNP), a
|
20
|
+
specific polymorphic site or a whole gene detectable at a locus. Type:
|
21
|
+
'AB', 'Forward', 'Top', 'Plus', 'Design'.
|
22
|
+
:param sep: Delimiter to use. Default value: "\\t".
|
23
|
+
:param usecols: Selection of fields for reading. Accelerates processing
|
24
|
+
and reduces memory.
|
25
|
+
:param dtype: Data type(s) to apply to either the whole dataset or
|
26
|
+
individual columns. E.g., {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
|
24
27
|
|
25
28
|
Example:
|
26
29
|
[Header]
|
@@ -38,20 +41,34 @@ class FinalReport(object):
|
|
38
41
|
...
|
39
42
|
"""
|
40
43
|
|
41
|
-
__PATTERN_HEADER = re.compile(r'(^\[Header
|
42
|
-
__PATTERN_DATA = re.compile(r'(^\[Data
|
44
|
+
__PATTERN_HEADER = re.compile(r'(^\[Header])')
|
45
|
+
__PATTERN_DATA = re.compile(r'(^\[Data])')
|
46
|
+
|
47
|
+
__slots__ = (
|
48
|
+
"_delimiter",
|
49
|
+
"__allele",
|
50
|
+
"__usecols",
|
51
|
+
"__dtype",
|
52
|
+
"__snp_data",
|
53
|
+
"__header",
|
54
|
+
"_map_rn",
|
55
|
+
)
|
43
56
|
|
44
57
|
def __init__(
|
45
58
|
self,
|
46
59
|
allele: str | list | None = None,
|
60
|
+
usecols: list[str] | None = None,
|
61
|
+
dtype: dict | None = None,
|
47
62
|
sep: str = "\t"
|
48
63
|
) -> None:
|
49
64
|
self._delimiter = sep
|
50
|
-
self.
|
65
|
+
self.__allele = allele
|
66
|
+
self.__usecols = usecols
|
67
|
+
self.__dtype = dtype
|
51
68
|
|
69
|
+
# self._full_data = None
|
70
|
+
self.__snp_data: pd.DataFrame | None = None
|
52
71
|
self.__header = {}
|
53
|
-
self.__snp_data = None
|
54
|
-
self.__allele = allele
|
55
72
|
self._map_rn = None
|
56
73
|
|
57
74
|
@property
|
@@ -77,6 +94,9 @@ class FinalReport(object):
|
|
77
94
|
|
78
95
|
try:
|
79
96
|
|
97
|
+
if self.__allele is not None and self.__usecols is not None:
|
98
|
+
raise Exception("Error. Usecols is used for allele is none.")
|
99
|
+
|
80
100
|
if isinstance(file_rep, str):
|
81
101
|
file_rep = Path(file_rep)
|
82
102
|
|
@@ -93,17 +113,11 @@ class FinalReport(object):
|
|
93
113
|
|
94
114
|
self.__convert_s_id(conv_file)
|
95
115
|
|
96
|
-
# Processing report file
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
if self._full_data is None:
|
101
|
-
raise Exception("Not data in file FinalReport.txt")
|
102
|
-
|
103
|
-
self.__handler_header()
|
104
|
-
self.__handler_data()
|
116
|
+
# # Processing report file
|
117
|
+
self.__handler_header(file_rep)
|
118
|
+
self.__handler_data(file_rep)
|
105
119
|
|
106
|
-
if self._map_rn is not None:
|
120
|
+
if not self.__snp_data.empty and self._map_rn is not None:
|
107
121
|
self.__snp_data['Sample ID'] = \
|
108
122
|
self.__snp_data['Sample ID'].map(
|
109
123
|
dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
|
@@ -114,62 +128,99 @@ class FinalReport(object):
|
|
114
128
|
|
115
129
|
return True
|
116
130
|
|
117
|
-
def
|
118
|
-
"""
|
131
|
+
def __handler_header(self, file_rep: Path) -> None:
|
132
|
+
""" Processes data from a file, selects meta-information.
|
119
133
|
|
120
134
|
:param file_rep: path, pointer to the file to be read.
|
121
|
-
:return: Returns true if the read was successful, false if it failed.
|
122
135
|
"""
|
123
|
-
try:
|
124
|
-
if len(data := file_rep.read_text()) != 0:
|
125
|
-
self._full_data = data.strip().split("\n")
|
126
|
-
return True
|
127
136
|
|
128
|
-
|
137
|
+
with open(file_rep, 'r') as file:
|
129
138
|
|
130
|
-
|
131
|
-
|
139
|
+
for line in file:
|
140
|
+
if self.__class__.__PATTERN_DATA.findall(line.strip()):
|
141
|
+
return
|
132
142
|
|
133
|
-
|
143
|
+
if self.__class__.__PATTERN_HEADER.findall(line.strip()) or\
|
144
|
+
len(line.strip()) == 0:
|
145
|
+
continue
|
146
|
+
|
147
|
+
key = line.strip().split("\t")[0]
|
148
|
+
value = line.strip().split("\t")[1]
|
149
|
+
|
150
|
+
self.__header[key] = value
|
151
|
+
|
152
|
+
def __handler_data(self, file_rep: Path) -> None:
|
153
|
+
""" Processes data and forms an array for further processing.
|
154
|
+
|
155
|
+
:param file_rep: path, pointer to the file to be read.
|
156
|
+
"""
|
134
157
|
|
135
|
-
|
136
|
-
|
158
|
+
with open(file_rep, 'r') as file:
|
159
|
+
|
160
|
+
# Search for the data start index and skip
|
161
|
+
for line in file:
|
162
|
+
if self.__class__.__PATTERN_DATA.findall(line.strip()):
|
163
|
+
break
|
164
|
+
|
165
|
+
# line column
|
166
|
+
orig_name_col = file.readline().strip().split(self._delimiter)
|
167
|
+
|
168
|
+
if self.__allele is None and self.__usecols is None:
|
169
|
+
self.__snp_data = pd.read_csv(
|
170
|
+
file,
|
171
|
+
sep=self._delimiter,
|
172
|
+
header=None,
|
173
|
+
names=orig_name_col,
|
174
|
+
dtype=self.__dtype,
|
175
|
+
low_memory=True,
|
176
|
+
na_filter=True
|
177
|
+
)
|
137
178
|
|
138
|
-
for line in self._full_data:
|
139
|
-
if self.__class__.__PATTERN_DATA.findall(line):
|
140
179
|
return
|
141
180
|
|
142
|
-
|
143
|
-
|
181
|
+
sub_n_col = self.__processing_columns(orig_name_col)
|
182
|
+
self.__snp_data = pd.read_csv(
|
183
|
+
file,
|
184
|
+
sep=self._delimiter,
|
185
|
+
header=None,
|
186
|
+
names=orig_name_col,
|
187
|
+
usecols=sub_n_col,
|
188
|
+
dtype=self.__dtype,
|
189
|
+
low_memory=True,
|
190
|
+
na_filter=True
|
191
|
+
)
|
144
192
|
|
145
|
-
|
146
|
-
value = line.strip().split("\t")[1]
|
193
|
+
return
|
147
194
|
|
148
|
-
|
195
|
+
def __processing_columns(self, lst_col: list[str]) -> list[str] | None:
|
196
|
+
""" Processing the line with all the names of the fields and the
|
197
|
+
sample of them.
|
149
198
|
|
150
|
-
|
151
|
-
|
199
|
+
:param lst_col: List of all fields.
|
200
|
+
:return: Returns a tuple with a list of names of selected fields.
|
201
|
+
"""
|
152
202
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
temp += 1
|
203
|
+
if self.__usecols is not None:
|
204
|
+
check_n_col = [
|
205
|
+
item for item in self.__usecols if item in lst_col
|
206
|
+
]
|
158
207
|
|
159
|
-
|
160
|
-
|
161
|
-
|
208
|
+
# Check on empty list
|
209
|
+
if check_n_col:
|
210
|
+
return self.__usecols
|
211
|
+
|
212
|
+
raise Exception(
|
213
|
+
f"Error. The USECOLS list contains not true fields."
|
214
|
+
)
|
162
215
|
|
163
|
-
|
164
|
-
|
216
|
+
# processing alleles
|
217
|
+
sample_n_col = self.__sample_by_allele(lst_col)
|
218
|
+
if sample_n_col is None:
|
219
|
+
raise Exception(
|
220
|
+
f"Error. Allele {self.__allele} not in data."
|
221
|
+
)
|
165
222
|
|
166
|
-
|
167
|
-
[
|
168
|
-
item_data.split(f"{self._delimiter}")
|
169
|
-
for item_data in self._full_data[temp + 1:]
|
170
|
-
],
|
171
|
-
columns=self._full_data[temp].split(f"{self._delimiter}")
|
172
|
-
)[names_col]
|
223
|
+
return sample_n_col
|
173
224
|
|
174
225
|
def __sample_by_allele(self, names: list[str]) -> list[str] | None:
|
175
226
|
""" Method that generates a list of field names choosing which alleles
|
@@ -1,6 +1,6 @@
|
|
1
1
|
snplib/__init__.py,sha256=xhjj4ZywdwCq91GBh1zfBP_TwFW26-KpHcCUUVvMdgI,196
|
2
2
|
snplib/finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
|
3
|
-
snplib/finalreport/_finalreport.py,sha256=
|
3
|
+
snplib/finalreport/_finalreport.py,sha256=_VXv8ayTIJBGGkpXYtrvBEp2HNuQ8Dh3zqS8HadlnHo,7501
|
4
4
|
snplib/format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
|
5
5
|
snplib/format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
|
6
6
|
snplib/format/_plink.py,sha256=cjT6PkvDJr8KwvQo76i7_Hm1Og4bASYCDN9G7CHsQ00,10372
|
@@ -15,8 +15,8 @@ snplib/statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw
|
|
15
15
|
snplib/statistics/_callrate.py,sha256=yfHxnNVpcDfV3qxZVwrk2RWPgy9dTf7NHWczDUORwtY,1866
|
16
16
|
snplib/statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
|
17
17
|
snplib/statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
|
18
|
-
snplib-1.
|
19
|
-
snplib-1.
|
20
|
-
snplib-1.
|
21
|
-
snplib-1.
|
22
|
-
snplib-1.
|
18
|
+
snplib-1.2.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
19
|
+
snplib-1.2.10.dist-info/METADATA,sha256=ZjG7lspM2kiKEQZCTFHQDNY3yrU6zyfTnLBaYGAzRU4,2184
|
20
|
+
snplib-1.2.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
21
|
+
snplib-1.2.10.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
|
22
|
+
snplib-1.2.10.dist-info/RECORD,,
|
File without changes
|
File without changes
|