snplib 1.1.10__py3-none-any.whl → 1.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,13 +3,12 @@
3
3
  __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
4
  __all__ = ("FinalReport",)
5
5
 
6
- from pathlib import Path
7
- from functools import reduce
8
-
9
6
  import re
7
+ from functools import reduce
8
+ from pathlib import Path
10
9
 
11
- from numpy import nan
12
10
  import pandas as pd
11
+ from numpy import nan
13
12
 
14
13
 
15
14
  class FinalReport(object):
@@ -17,10 +16,14 @@ class FinalReport(object):
17
16
  handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx
18
17
  conversion file, the processed data will contain NAN values.
19
18
 
20
- :argument allele: A variant form of a single nucleotide polymorphism
21
- (SNP), a specific polymorphic site or a whole gene detectable at
22
- a locus. Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'
23
- :argument sep: Delimiter to use. Default value: "\\t"
19
+ :param allele: A variant form of a single nucleotide polymorphism (SNP), a
20
+ specific polymorphic site or a whole gene detectable at a locus. Type:
21
+ 'AB', 'Forward', 'Top', 'Plus', 'Design'.
22
+ :param sep: Delimiter to use. Default value: "\\t".
23
+ :param usecols: Selection of fields for reading. Accelerates processing
24
+ and reduces memory.
25
+ :param dtype: Data type(s) to apply to either the whole dataset or
26
+ individual columns. E.g., {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
24
27
 
25
28
  Example:
26
29
  [Header]
@@ -38,20 +41,34 @@ class FinalReport(object):
38
41
  ...
39
42
  """
40
43
 
41
- __PATTERN_HEADER = re.compile(r'(^\[Header\])')
42
- __PATTERN_DATA = re.compile(r'(^\[Data\])')
44
+ __PATTERN_HEADER = re.compile(r'(^\[Header])')
45
+ __PATTERN_DATA = re.compile(r'(^\[Data])')
46
+
47
+ __slots__ = (
48
+ "_delimiter",
49
+ "__allele",
50
+ "__usecols",
51
+ "__dtype",
52
+ "__snp_data",
53
+ "__header",
54
+ "_map_rn",
55
+ )
43
56
 
44
57
  def __init__(
45
58
  self,
46
59
  allele: str | list | None = None,
60
+ usecols: list[str] | None = None,
61
+ dtype: dict | None = None,
47
62
  sep: str = "\t"
48
63
  ) -> None:
49
64
  self._delimiter = sep
50
- self._full_data = None
65
+ self.__allele = allele
66
+ self.__usecols = usecols
67
+ self.__dtype = dtype
51
68
 
69
+ # self._full_data = None
70
+ self.__snp_data: pd.DataFrame | None = None
52
71
  self.__header = {}
53
- self.__snp_data = None
54
- self.__allele = allele
55
72
  self._map_rn = None
56
73
 
57
74
  @property
@@ -77,6 +94,9 @@ class FinalReport(object):
77
94
 
78
95
  try:
79
96
 
97
+ if self.__allele is not None and self.__usecols is not None:
98
+ raise Exception("Error. Usecols is used for allele is none.")
99
+
80
100
  if isinstance(file_rep, str):
81
101
  file_rep = Path(file_rep)
82
102
 
@@ -93,17 +113,11 @@ class FinalReport(object):
93
113
 
94
114
  self.__convert_s_id(conv_file)
95
115
 
96
- # Processing report file
97
- if not self.read(file_rep):
98
- return False
99
-
100
- if self._full_data is None:
101
- raise Exception("Not data in file FinalReport.txt")
102
-
103
- self.__handler_header()
104
- self.__handler_data()
116
+ # # Processing report file
117
+ self.__handler_header(file_rep)
118
+ self.__handler_data(file_rep)
105
119
 
106
- if self._map_rn is not None:
120
+ if not self.__snp_data.empty and self._map_rn is not None:
107
121
  self.__snp_data['Sample ID'] = \
108
122
  self.__snp_data['Sample ID'].map(
109
123
  dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
@@ -114,62 +128,99 @@ class FinalReport(object):
114
128
 
115
129
  return True
116
130
 
117
- def read(self, file_rep: Path) -> bool:
118
- """ Reading data from the final_report file
131
+ def __handler_header(self, file_rep: Path) -> None:
132
+ """ Processes data from a file, selects meta-information.
119
133
 
120
134
  :param file_rep: path, pointer to the file to be read.
121
- :return: Returns true if the read was successful, false if it failed.
122
135
  """
123
- try:
124
- if len(data := file_rep.read_text()) != 0:
125
- self._full_data = data.strip().split("\n")
126
- return True
127
136
 
128
- self._full_data = None
137
+ with open(file_rep, 'r') as file:
129
138
 
130
- except Exception as e:
131
- return False
139
+ for line in file:
140
+ if self.__class__.__PATTERN_DATA.findall(line.strip()):
141
+ return
132
142
 
133
- return True
143
+ if self.__class__.__PATTERN_HEADER.findall(line.strip()) or\
144
+ len(line.strip()) == 0:
145
+ continue
146
+
147
+ key = line.strip().split("\t")[0]
148
+ value = line.strip().split("\t")[1]
149
+
150
+ self.__header[key] = value
151
+
152
+ def __handler_data(self, file_rep: Path) -> None:
153
+ """ Processes data and forms an array for further processing.
154
+
155
+ :param file_rep: path, pointer to the file to be read.
156
+ """
134
157
 
135
- def __handler_header(self) -> None:
136
- """ Processes data from a file, selects meta-information. """
158
+ with open(file_rep, 'r') as file:
159
+
160
+ # Search for the data start index and skip
161
+ for line in file:
162
+ if self.__class__.__PATTERN_DATA.findall(line.strip()):
163
+ break
164
+
165
+ # line column
166
+ orig_name_col = file.readline().strip().split(self._delimiter)
167
+
168
+ if self.__allele is None and self.__usecols is None:
169
+ self.__snp_data = pd.read_csv(
170
+ file,
171
+ sep=self._delimiter,
172
+ header=None,
173
+ names=orig_name_col,
174
+ dtype=self.__dtype,
175
+ low_memory=True,
176
+ na_filter=True
177
+ )
137
178
 
138
- for line in self._full_data:
139
- if self.__class__.__PATTERN_DATA.findall(line):
140
179
  return
141
180
 
142
- if self.__class__.__PATTERN_HEADER.findall(line):
143
- continue
181
+ sub_n_col = self.__processing_columns(orig_name_col)
182
+ self.__snp_data = pd.read_csv(
183
+ file,
184
+ sep=self._delimiter,
185
+ header=None,
186
+ names=orig_name_col,
187
+ usecols=sub_n_col,
188
+ dtype=self.__dtype,
189
+ low_memory=True,
190
+ na_filter=True
191
+ )
144
192
 
145
- key = line.strip().split("\t")[0]
146
- value = line.strip().split("\t")[1]
193
+ return
147
194
 
148
- self.__header[key] = value
195
+ def __processing_columns(self, lst_col: list[str]) -> list[str] | None:
196
+ """ Processing the line with all the names of the fields and the
197
+ sample of them.
149
198
 
150
- def __handler_data(self) -> None:
151
- """ Processes data and forms an array for further processing. """
199
+ :param lst_col: List of all fields.
200
+ :return: Returns a tuple with a list of names of selected fields.
201
+ """
152
202
 
153
- temp = 1
154
- for line in self._full_data:
155
- if self.__class__.__PATTERN_DATA.findall(line):
156
- break
157
- temp += 1
203
+ if self.__usecols is not None:
204
+ check_n_col = [
205
+ item for item in self.__usecols if item in lst_col
206
+ ]
158
207
 
159
- names_col = self.__sample_by_allele(
160
- self._full_data[temp].split(f"{self._delimiter}")
161
- )
208
+ # Check on empty list
209
+ if check_n_col:
210
+ return self.__usecols
211
+
212
+ raise Exception(
213
+ f"Error. The USECOLS list contains not true fields."
214
+ )
162
215
 
163
- if names_col is None:
164
- raise Exception(f"Error. Allele {self.__allele} not in data.")
216
+ # processing alleles
217
+ sample_n_col = self.__sample_by_allele(lst_col)
218
+ if sample_n_col is None:
219
+ raise Exception(
220
+ f"Error. Allele {self.__allele} not in data."
221
+ )
165
222
 
166
- self.__snp_data = pd.DataFrame(
167
- [
168
- item_data.split(f"{self._delimiter}")
169
- for item_data in self._full_data[temp + 1:]
170
- ],
171
- columns=self._full_data[temp].split(f"{self._delimiter}")
172
- )[names_col]
223
+ return sample_n_col
173
224
 
174
225
  def __sample_by_allele(self, names: list[str]) -> list[str] | None:
175
226
  """ Method that generates a list of field names choosing which alleles
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: snplib
3
- Version: 1.1.10
3
+ Version: 1.2.10
4
4
  Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
5
5
  Author-email: Igor <igor.loschinin@gmail.com>
6
6
  License: GNU
@@ -1,6 +1,6 @@
1
1
  snplib/__init__.py,sha256=xhjj4ZywdwCq91GBh1zfBP_TwFW26-KpHcCUUVvMdgI,196
2
2
  snplib/finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1g,161
3
- snplib/finalreport/_finalreport.py,sha256=dE44NsJ5ciGyFCh2LpFncf4LwQDI_-l4LatOPg879Sk,6148
3
+ snplib/finalreport/_finalreport.py,sha256=_VXv8ayTIJBGGkpXYtrvBEp2HNuQ8Dh3zqS8HadlnHo,7501
4
4
  snplib/format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
5
5
  snplib/format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
6
6
  snplib/format/_plink.py,sha256=cjT6PkvDJr8KwvQo76i7_Hm1Og4bASYCDN9G7CHsQ00,10372
@@ -15,8 +15,8 @@ snplib/statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw
15
15
  snplib/statistics/_callrate.py,sha256=yfHxnNVpcDfV3qxZVwrk2RWPgy9dTf7NHWczDUORwtY,1866
16
16
  snplib/statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
17
17
  snplib/statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
18
- snplib-1.1.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
19
- snplib-1.1.10.dist-info/METADATA,sha256=hmE1zLFNhDwwbl03PVupafM1n-NGOV2yKYmKzi1a6lw,2184
20
- snplib-1.1.10.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
21
- snplib-1.1.10.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
22
- snplib-1.1.10.dist-info/RECORD,,
18
+ snplib-1.2.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
19
+ snplib-1.2.10.dist-info/METADATA,sha256=ZjG7lspM2kiKEQZCTFHQDNY3yrU6zyfTnLBaYGAzRU4,2184
20
+ snplib-1.2.10.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
21
+ snplib-1.2.10.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
22
+ snplib-1.2.10.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.1)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5