snplib 1.0.10__py3-none-any.whl → 1.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,20 +3,27 @@
3
3
  __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
4
  __all__ = ("FinalReport",)
5
5
 
6
- from pathlib import Path
6
+ import re
7
7
  from functools import reduce
8
+ from pathlib import Path
8
9
 
9
- import re
10
10
  import pandas as pd
11
+ from numpy import nan
11
12
 
12
13
 
13
14
  class FinalReport(object):
14
- """ File that contains SNP information.
15
-
16
- :argument allele: A variant form of a single nucleotide polymorphism
17
- (SNP), a specific polymorphic site or a whole gene detectable at
18
- a locus. Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'
19
- :argument sep: Delimiter to use. Default value: "\\t"
15
+ """ File that contains SNP information. File processing is triggered by the
16
+ handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx
17
+ conversion file, the processed data will contain NAN values.
18
+
19
+ :param allele: A variant form of a single nucleotide polymorphism (SNP), a
20
+ specific polymorphic site or a whole gene detectable at a locus. Type:
21
+ 'AB', 'Forward', 'Top', 'Plus', 'Design'.
22
+ :param sep: Delimiter to use. Default value: "\\t".
23
+ :param usecols: Selection of fields for reading. Accelerates processing
24
+ and reduces memory.
25
+ :param dtype: Data type(s) to apply to either the whole dataset or
26
+ individual columns. E.g., {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
20
27
 
21
28
  Example:
22
29
  [Header]
@@ -34,20 +41,34 @@ class FinalReport(object):
34
41
  ...
35
42
  """
36
43
 
37
- __PATTERN_HEADER = re.compile(r'(^\[Header\])')
38
- __PATTERN_DATA = re.compile(r'(^\[Data\])')
44
+ __PATTERN_HEADER = re.compile(r'(^\[Header])')
45
+ __PATTERN_DATA = re.compile(r'(^\[Data])')
46
+
47
+ __slots__ = (
48
+ "_delimiter",
49
+ "__allele",
50
+ "__usecols",
51
+ "__dtype",
52
+ "__snp_data",
53
+ "__header",
54
+ "_map_rn",
55
+ )
39
56
 
40
57
  def __init__(
41
58
  self,
42
59
  allele: str | list | None = None,
60
+ usecols: list[str] | None = None,
61
+ dtype: dict | None = None,
43
62
  sep: str = "\t"
44
63
  ) -> None:
45
64
  self._delimiter = sep
46
- self._full_data = None
65
+ self.__allele = allele
66
+ self.__usecols = usecols
67
+ self.__dtype = dtype
47
68
 
69
+ # self._full_data = None
70
+ self.__snp_data: pd.DataFrame | None = None
48
71
  self.__header = {}
49
- self.__snp_data = None
50
- self.__allele = allele
51
72
  self._map_rn = None
52
73
 
53
74
  @property
@@ -73,6 +94,9 @@ class FinalReport(object):
73
94
 
74
95
  try:
75
96
 
97
+ if self.__allele is not None and self.__usecols is not None:
98
+ raise Exception("Error. Usecols is used for allele is none.")
99
+
76
100
  if isinstance(file_rep, str):
77
101
  file_rep = Path(file_rep)
78
102
 
@@ -89,17 +113,11 @@ class FinalReport(object):
89
113
 
90
114
  self.__convert_s_id(conv_file)
91
115
 
92
- # Processing report file
93
- if not self.read(file_rep):
94
- return False
95
-
96
- if self._full_data is None:
97
- raise Exception("Not data in file FinalReport.txt")
98
-
99
- self.__handler_header()
100
- self.__handler_data()
116
+ # # Processing report file
117
+ self.__handler_header(file_rep)
118
+ self.__handler_data(file_rep)
101
119
 
102
- if self._map_rn is not None:
120
+ if not self.__snp_data.empty and self._map_rn is not None:
103
121
  self.__snp_data['Sample ID'] = \
104
122
  self.__snp_data['Sample ID'].map(
105
123
  dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
@@ -110,62 +128,99 @@ class FinalReport(object):
110
128
 
111
129
  return True
112
130
 
113
- def read(self, file_rep: Path) -> bool:
114
- """ Reading data from the final_report file
131
+ def __handler_header(self, file_rep: Path) -> None:
132
+ """ Processes data from a file, selects meta-information.
115
133
 
116
134
  :param file_rep: path, pointer to the file to be read.
117
- :return: Returns true if the read was successful, false if it failed.
118
135
  """
119
- try:
120
- if len(data := file_rep.read_text()) != 0:
121
- self._full_data = data.strip().split("\n")
122
- return True
123
136
 
124
- self._full_data = None
137
+ with open(file_rep, 'r') as file:
125
138
 
126
- except Exception as e:
127
- return False
139
+ for line in file:
140
+ if self.__class__.__PATTERN_DATA.findall(line.strip()):
141
+ return
128
142
 
129
- return True
143
+ if self.__class__.__PATTERN_HEADER.findall(line.strip()) or\
144
+ len(line.strip()) == 0:
145
+ continue
130
146
 
131
- def __handler_header(self) -> None:
132
- """ Processes data from a file, selects meta-information. """
147
+ key = line.strip().split("\t")[0]
148
+ value = line.strip().split("\t")[1]
149
+
150
+ self.__header[key] = value
151
+
152
+ def __handler_data(self, file_rep: Path) -> None:
153
+ """ Processes data and forms an array for further processing.
154
+
155
+ :param file_rep: path, pointer to the file to be read.
156
+ """
157
+
158
+ with open(file_rep, 'r') as file:
159
+
160
+ # Search for the data start index and skip
161
+ for line in file:
162
+ if self.__class__.__PATTERN_DATA.findall(line.strip()):
163
+ break
164
+
165
+ # line column
166
+ orig_name_col = file.readline().strip().split(self._delimiter)
167
+
168
+ if self.__allele is None and self.__usecols is None:
169
+ self.__snp_data = pd.read_csv(
170
+ file,
171
+ sep=self._delimiter,
172
+ header=None,
173
+ names=orig_name_col,
174
+ dtype=self.__dtype,
175
+ low_memory=True,
176
+ na_filter=True
177
+ )
133
178
 
134
- for line in self._full_data:
135
- if self.__class__.__PATTERN_DATA.findall(line):
136
179
  return
137
180
 
138
- if self.__class__.__PATTERN_HEADER.findall(line):
139
- continue
181
+ sub_n_col = self.__processing_columns(orig_name_col)
182
+ self.__snp_data = pd.read_csv(
183
+ file,
184
+ sep=self._delimiter,
185
+ header=None,
186
+ names=orig_name_col,
187
+ usecols=sub_n_col,
188
+ dtype=self.__dtype,
189
+ low_memory=True,
190
+ na_filter=True
191
+ )
140
192
 
141
- key = line.strip().split("\t")[0]
142
- value = line.strip().split("\t")[1]
193
+ return
143
194
 
144
- self.__header[key] = value
195
+ def __processing_columns(self, lst_col: list[str]) -> list[str] | None:
196
+ """ Processes the list of all field names and selects the subset of
197
+ fields to be read.
145
198
 
146
- def __handler_data(self) -> None:
147
- """ Processes data and forms an array for further processing. """
199
+ :param lst_col: List of all fields.
200
+ :return: Returns a list with the names of the selected fields.
201
+ """
148
202
 
149
- temp = 1
150
- for line in self._full_data:
151
- if self.__class__.__PATTERN_DATA.findall(line):
152
- break
153
- temp += 1
203
+ if self.__usecols is not None:
204
+ check_n_col = [
205
+ item for item in self.__usecols if item in lst_col
206
+ ]
154
207
 
155
- names_col = self.__sample_by_allele(
156
- self._full_data[temp].split(f"{self._delimiter}")
157
- )
208
+ # Check on empty list
209
+ if check_n_col:
210
+ return self.__usecols
158
211
 
159
- if names_col is None:
160
- raise Exception(f"Error. Allele {self.__allele} not in data.")
212
+ raise Exception(
213
+ f"Error. The USECOLS list contains not true fields."
214
+ )
161
215
 
162
- self.__snp_data = pd.DataFrame(
163
- [
164
- item_data.split(f"{self._delimiter}")
165
- for item_data in self._full_data[temp + 1:]
166
- ],
167
- columns=self._full_data[temp].split(f"{self._delimiter}")
168
- )[names_col]
216
+ # processing alleles
217
+ sample_n_col = self.__sample_by_allele(lst_col)
218
+ if sample_n_col is None:
219
+ raise Exception(
220
+ f"Error. Allele {self.__allele} not in data."
221
+ )
222
+
223
+ return sample_n_col
169
224
 
170
225
  def __sample_by_allele(self, names: list[str]) -> list[str] | None:
171
226
  """ Method that generates a list of field names choosing which alleles
@@ -235,17 +290,15 @@ class FinalReport(object):
235
290
  if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
236
291
  raise Exception("Error. Unique keys contain Cyrillic alphabet.")
237
292
 
238
- if self._map_rn.UNIQ_KEY.isna().any():
239
- self._map_rn.fillna('unknown', inplace=True)
240
-
241
293
  @staticmethod
242
294
  def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
243
- """
295
+ """ Checks a sequence for Cyrillic characters.
244
296
 
245
- :param seq:
246
- :return:
297
+ :param seq: Sequence of values to be checked.
298
+ :return: True if at least one value contains Cyrillic characters,
299
+ otherwise False.
247
300
  """
248
301
 
249
- return any(seq.apply(lambda x: bool(re.search('[а-яА-Я]', x))))
250
-
251
-
302
+ return seq.apply(
303
+ lambda x: bool(re.search('[а-яА-Я]', x)) if x is not nan else x
304
+ ).any()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: snplib
3
- Version: 1.0.10
3
+ Version: 1.2.10
4
4
  Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
5
5
  Author-email: Igor <igor.loschinin@gmail.com>
6
6
  License: GNU
@@ -10,14 +10,14 @@ Classifier: Operating System :: OS Independent
10
10
  Requires-Python: >=3.10
11
11
  Description-Content-Type: text/markdown
12
12
  License-File: LICENSE
13
- Requires-Dist: numpy>=1.26.1
14
- Requires-Dist: pandas>=2.1.1
15
- Requires-Dist: six>=1.16.0
13
+ Requires-Dist: numpy>=2.2.3
14
+ Requires-Dist: pandas>=2.2.3
15
+ Requires-Dist: six>=1.17.0
16
16
  Requires-Dist: swifter>=1.4.0
17
17
  Requires-Dist: xlrd>=2.0.1
18
- Requires-Dist: XlsxWriter>=3.1.9
19
- Requires-Dist: openpyxl>=3.1.2
20
- Requires-Dist: pydantic>=2.4.2
18
+ Requires-Dist: XlsxWriter>=3.2.2
19
+ Requires-Dist: openpyxl>=3.1.5
20
+ Requires-Dist: pydantic>=2.10.6
21
21
 
22
22
  # snptools
23
23
  <p align="center">
@@ -1,6 +1,6 @@
1
1
  snplib/__init__.py,sha256=xhjj4ZywdwCq91GBh1zfBP_TwFW26-KpHcCUUVvMdgI,196
2
2
  snplib/finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
3
- snplib/finalreport/_finalreport.py,sha256=el_d8MVmpic3wKCRJ-J52VZYSmMuNSf4p_tmPkgh0Z0,5876
3
+ snplib/finalreport/_finalreport.py,sha256=_VXv8ayTIJBGGkpXYtrvBEp2HNuQ8Dh3zqS8HadlnHo,7501
4
4
  snplib/format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
5
5
  snplib/format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
6
6
  snplib/format/_plink.py,sha256=cjT6PkvDJr8KwvQo76i7_Hm1Og4bASYCDN9G7CHsQ00,10372
@@ -15,8 +15,8 @@ snplib/statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw
15
15
  snplib/statistics/_callrate.py,sha256=yfHxnNVpcDfV3qxZVwrk2RWPgy9dTf7NHWczDUORwtY,1866
16
16
  snplib/statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
17
17
  snplib/statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
18
- snplib-1.0.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
19
- snplib-1.0.10.dist-info/METADATA,sha256=pKp9XmfbbogkNsdC_SEpsx1hLTdFxWaD9YgrIq7j0e4,2184
20
- snplib-1.0.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
21
- snplib-1.0.10.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
22
- snplib-1.0.10.dist-info/RECORD,,
18
+ snplib-1.2.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
19
+ snplib-1.2.10.dist-info/METADATA,sha256=ZjG7lspM2kiKEQZCTFHQDNY3yrU6zyfTnLBaYGAzRU4,2184
20
+ snplib-1.2.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
21
+ snplib-1.2.10.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
22
+ snplib-1.2.10.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5