snplib 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. snplib/__init__.py +8 -8
  2. snplib/finalreport/__init__.py +7 -7
  3. snplib/finalreport/_finalreport.py +251 -251
  4. snplib/format/__init__.py +19 -19
  5. snplib/format/__settings.py +7 -7
  6. snplib/format/_plink.py +305 -305
  7. snplib/format/_snp.py +113 -113
  8. snplib/parentage/__init__.py +15 -15
  9. snplib/parentage/_discov.py +102 -102
  10. snplib/parentage/_isagmark.py +15 -15
  11. snplib/parentage/_verif.py +91 -91
  12. snplib/parentage/isag_disc.pl +0 -0
  13. snplib/parentage/isag_verif.pl +0 -0
  14. snplib/statistics/__init__.py +16 -16
  15. snplib/statistics/_callrate.py +59 -59
  16. snplib/statistics/_freq.py +67 -67
  17. snplib/statistics/_snphwe.py +132 -132
  18. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/LICENSE +674 -674
  19. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/METADATA +80 -97
  20. snplib-1.0.8.dist-info/RECORD +22 -0
  21. snplib/finalreport/tests/__init__.py +0 -7
  22. snplib/finalreport/tests/test_finalreport.py +0 -215
  23. snplib/format/tests/__init__.py +0 -7
  24. snplib/format/tests/test_plink_fam.py +0 -121
  25. snplib/format/tests/test_plink_lgen.py +0 -106
  26. snplib/format/tests/test_plink_map.py +0 -42
  27. snplib/format/tests/test_plink_ped.py +0 -136
  28. snplib/format/tests/test_snp.py +0 -128
  29. snplib/parentage/tests/__init__.py +0 -7
  30. snplib/parentage/tests/test_discov.py +0 -164
  31. snplib/parentage/tests/test_verif.py +0 -160
  32. snplib/statistics/tests/__init__.py +0 -7
  33. snplib/statistics/tests/test_callrate.py +0 -171
  34. snplib/statistics/tests/test_freq_allele.py +0 -87
  35. snplib/statistics/tests/test_freq_maf.py +0 -17
  36. snplib/statistics/tests/test_hwe_t.py +0 -41
  37. snplib/statistics/tests/test_snphwe.py +0 -41
  38. snplib-1.0.7.dist-info/RECORD +0 -37
  39. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/WHEEL +0 -0
  40. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/top_level.txt +0 -0
snplib/__init__.py CHANGED
@@ -1,8 +1,8 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from .finalreport import *
6
- from .format import *
7
- from .parentage import *
8
- from .statistics import *
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from .finalreport import *
6
+ from .format import *
7
+ from .parentage import *
8
+ from .statistics import *
@@ -1,7 +1,7 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from ._finalreport import FinalReport
6
-
7
- __all__ = ["FinalReport"]
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._finalreport import FinalReport
6
+
7
+ __all__ = ["FinalReport"]
@@ -1,251 +1,251 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
- __all__ = ("FinalReport",)
5
-
6
- from pathlib import Path
7
- from functools import reduce
8
-
9
- import re
10
- import pandas as pd
11
-
12
-
13
- class FinalReport(object):
14
- """ File that contains SNP information.
15
-
16
- :argument allele: A variant form of a single nucleotide polymorphism
17
- (SNP), a specific polymorphic site or a whole gene detectable at
18
- a locus. Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'
19
- :argument sep: Delimiter to use. Default value: "\\t"
20
-
21
- Example:
22
- [Header]
23
- GSGT Version 2.0.4
24
- Processing Date 10/14/2021 4:02 PM
25
- Content BovineSNP50_v3_A1.bpm
26
- Num SNPs 53218
27
- Total SNPs 53218
28
- Num Samples 3
29
- Total Samples 3
30
- [Data]
31
- SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
32
- ABCA12 1 A A 0.4048 0.8164
33
- APAF1 1 B B 0.9067 0.9155
34
- ...
35
- """
36
-
37
- __PATTERN_HEADER = re.compile(r'(^\[Header\])')
38
- __PATTERN_DATA = re.compile(r'(^\[Data\])')
39
-
40
- def __init__(
41
- self,
42
- allele: str | list | None = None,
43
- sep: str = "\t"
44
- ) -> None:
45
- self._delimiter = sep
46
- self._full_data = None
47
-
48
- self.__header = {}
49
- self.__snp_data = None
50
- self.__allele = allele
51
- self._map_rn = None
52
-
53
- @property
54
- def header(self) -> dict:
55
- return self.__header
56
-
57
- @property
58
- def snp_data(self) -> pd.DataFrame | None:
59
- return self.__snp_data
60
-
61
- def handle(
62
- self, file_rep: Path | str, conv_file: Path | str = None
63
- ) -> bool:
64
- """ Processes the FinalReport.txt file. Highlights meta information
65
- and data.
66
-
67
- :param file_rep: The file FinalReport.txt or another name.
68
- :param conv_file: The file that contains IDs of registration numbers
69
- of animals.
70
- :return: Returns true if file processing was successful, false if
71
- there were errors.
72
- """
73
-
74
- try:
75
-
76
- if isinstance(file_rep, str):
77
- file_rep = Path(file_rep)
78
-
79
- if not file_rep.is_file() and not file_rep.exists():
80
- return False
81
-
82
- # Processing conversion file
83
- if conv_file is not None:
84
- if isinstance(conv_file, str):
85
- conv_file = Path(conv_file)
86
-
87
- if not conv_file.is_file() and not conv_file.exists():
88
- return False
89
-
90
- self.__convert_s_id(conv_file)
91
-
92
- # Processing report file
93
- if not self.read(file_rep):
94
- return False
95
-
96
- if self._full_data is None:
97
- raise Exception("Not data in file FinalReport.txt")
98
-
99
- self.__handler_header()
100
- self.__handler_data()
101
-
102
- if self._map_rn is not None:
103
- self.__snp_data['Sample ID'] = \
104
- self.__snp_data['Sample ID'].map(
105
- dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
106
- )
107
-
108
- except Exception as e:
109
- raise e
110
-
111
- return True
112
-
113
- def read(self, file_rep: Path) -> bool:
114
- """ Reading data from the final_report file
115
-
116
- :param file_rep: path, pointer to the file to be read.
117
- :return: Returns true if the read was successful, false if it failed.
118
- """
119
- try:
120
- if len(data := file_rep.read_text()) != 0:
121
- self._full_data = data.strip().split("\n")
122
- return True
123
-
124
- self._full_data = None
125
-
126
- except Exception as e:
127
- return False
128
-
129
- return True
130
-
131
- def __handler_header(self) -> None:
132
- """ Processes data from a file, selects meta-information. """
133
-
134
- for line in self._full_data:
135
- if self.__class__.__PATTERN_DATA.findall(line):
136
- return
137
-
138
- if self.__class__.__PATTERN_HEADER.findall(line):
139
- continue
140
-
141
- key = line.strip().split("\t")[0]
142
- value = line.strip().split("\t")[1]
143
-
144
- self.__header[key] = value
145
-
146
- def __handler_data(self) -> None:
147
- """ Processes data and forms an array for further processing. """
148
-
149
- temp = 1
150
- for line in self._full_data:
151
- if self.__class__.__PATTERN_DATA.findall(line):
152
- break
153
- temp += 1
154
-
155
- names_col = self.__sample_by_allele(
156
- self._full_data[temp].split(f"{self._delimiter}")
157
- )
158
-
159
- if names_col is None:
160
- raise Exception(f"Error. Allele {self.__allele} not in data.")
161
-
162
- self.__snp_data = pd.DataFrame(
163
- [
164
- item_data.split(f"{self._delimiter}")
165
- for item_data in self._full_data[temp + 1:]
166
- ],
167
- columns=self._full_data[temp].split(f"{self._delimiter}")
168
- )[names_col]
169
-
170
- def __sample_by_allele(self, names: list[str]) -> list[str] | None:
171
- """ Method that generates a list of field names choosing which alleles
172
- to keep
173
-
174
- :param names: List of field names in the report file.
175
- :return: Returns a filtered list of fields by alleles.
176
- """
177
-
178
- allele_templ = r'(^Allele\d\s[:-]\s{}\b)'
179
-
180
- match self.__allele:
181
- case None:
182
- return names
183
-
184
- case str():
185
- allele_pattern = re.compile(
186
- allele_templ.format(self.__allele)
187
- )
188
-
189
- case list() | tuple() | set():
190
- allele_pattern = re.compile(
191
- allele_templ.format("|".join(self.__allele))
192
- )
193
- case _:
194
- return None
195
-
196
- lst_allele = reduce(
197
- lambda i, j: i + j,
198
- [allele_pattern.findall(item) for item in names]
199
- )
200
-
201
- if len(lst_allele) == 0:
202
- return None
203
-
204
- exclude_alleles = [
205
- item for item in names
206
- if item.startswith("Allele") and item not in lst_allele
207
- ]
208
-
209
- return list(filter(
210
- lambda x: True if x not in exclude_alleles else False, names
211
- ))
212
-
213
- def __convert_s_id(self, path_file: Path) -> None:
214
- """Converts sample id which is in FinalReport to animal registration
215
- number.
216
-
217
- :param path_file: xlsx file with animal numbers label
218
- """
219
-
220
- self._map_rn = pd.read_excel(
221
- path_file,
222
- header=None,
223
- names=['SID', 'UNIQ_KEY', 'SEX'],
224
- dtype={'SID': str},
225
- index_col=False
226
- )
227
-
228
- if self._map_rn.empty:
229
- self._map_rn = None
230
- return
231
-
232
- self._map_rn.SID = self._map_rn.SID.str.strip()
233
- self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.str.strip()
234
-
235
- if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
236
- raise Exception("Error. Unique keys contain Cyrillic alphabet.")
237
-
238
- if self._map_rn.UNIQ_KEY.isna().any():
239
- self._map_rn.fillna('unknown', inplace=True)
240
-
241
- @staticmethod
242
- def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
243
- """
244
-
245
- :param seq:
246
- :return:
247
- """
248
-
249
- return any(seq.apply(lambda x: bool(re.search('[а-яА-Я]', x))))
250
-
251
-
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+ __all__ = ("FinalReport",)
5
+
6
+ from pathlib import Path
7
+ from functools import reduce
8
+
9
+ import re
10
+ import pandas as pd
11
+
12
+
13
+ class FinalReport(object):
14
+ """ File that contains SNP information.
15
+
16
+ :argument allele: A variant form of a single nucleotide polymorphism
17
+ (SNP), a specific polymorphic site or a whole gene detectable at
18
+ a locus. Type: 'AB', 'Forward', 'Top', 'Plus', 'Design'
19
+ :argument sep: Delimiter to use. Default value: "\\t"
20
+
21
+ Example:
22
+ [Header]
23
+ GSGT Version 2.0.4
24
+ Processing Date 10/14/2021 4:02 PM
25
+ Content BovineSNP50_v3_A1.bpm
26
+ Num SNPs 53218
27
+ Total SNPs 53218
28
+ Num Samples 3
29
+ Total Samples 3
30
+ [Data]
31
+ SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
32
+ ABCA12 1 A A 0.4048 0.8164
33
+ APAF1 1 B B 0.9067 0.9155
34
+ ...
35
+ """
36
+
37
+ __PATTERN_HEADER = re.compile(r'(^\[Header\])')
38
+ __PATTERN_DATA = re.compile(r'(^\[Data\])')
39
+
40
+ def __init__(
41
+ self,
42
+ allele: str | list | None = None,
43
+ sep: str = "\t"
44
+ ) -> None:
45
+ self._delimiter = sep
46
+ self._full_data = None
47
+
48
+ self.__header = {}
49
+ self.__snp_data = None
50
+ self.__allele = allele
51
+ self._map_rn = None
52
+
53
+ @property
54
+ def header(self) -> dict:
55
+ return self.__header
56
+
57
+ @property
58
+ def snp_data(self) -> pd.DataFrame | None:
59
+ return self.__snp_data
60
+
61
+ def handle(
62
+ self, file_rep: Path | str, conv_file: Path | str = None
63
+ ) -> bool:
64
+ """ Processes the FinalReport.txt file. Highlights meta information
65
+ and data.
66
+
67
+ :param file_rep: The file FinalReport.txt or another name.
68
+ :param conv_file: The file that contains IDs of registration numbers
69
+ of animals.
70
+ :return: Returns true if file processing was successful, false if
71
+ there were errors.
72
+ """
73
+
74
+ try:
75
+
76
+ if isinstance(file_rep, str):
77
+ file_rep = Path(file_rep)
78
+
79
+ if not file_rep.is_file() and not file_rep.exists():
80
+ return False
81
+
82
+ # Processing conversion file
83
+ if conv_file is not None:
84
+ if isinstance(conv_file, str):
85
+ conv_file = Path(conv_file)
86
+
87
+ if not conv_file.is_file() and not conv_file.exists():
88
+ return False
89
+
90
+ self.__convert_s_id(conv_file)
91
+
92
+ # Processing report file
93
+ if not self.read(file_rep):
94
+ return False
95
+
96
+ if self._full_data is None:
97
+ raise Exception("Not data in file FinalReport.txt")
98
+
99
+ self.__handler_header()
100
+ self.__handler_data()
101
+
102
+ if self._map_rn is not None:
103
+ self.__snp_data['Sample ID'] = \
104
+ self.__snp_data['Sample ID'].map(
105
+ dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
106
+ )
107
+
108
+ except Exception as e:
109
+ raise e
110
+
111
+ return True
112
+
113
+ def read(self, file_rep: Path) -> bool:
114
+ """ Reading data from the final_report file
115
+
116
+ :param file_rep: path, pointer to the file to be read.
117
+ :return: Returns true if the read was successful, false if it failed.
118
+ """
119
+ try:
120
+ if len(data := file_rep.read_text()) != 0:
121
+ self._full_data = data.strip().split("\n")
122
+ return True
123
+
124
+ self._full_data = None
125
+
126
+ except Exception as e:
127
+ return False
128
+
129
+ return True
130
+
131
+ def __handler_header(self) -> None:
132
+ """ Processes data from a file, selects meta-information. """
133
+
134
+ for line in self._full_data:
135
+ if self.__class__.__PATTERN_DATA.findall(line):
136
+ return
137
+
138
+ if self.__class__.__PATTERN_HEADER.findall(line):
139
+ continue
140
+
141
+ key = line.strip().split("\t")[0]
142
+ value = line.strip().split("\t")[1]
143
+
144
+ self.__header[key] = value
145
+
146
+ def __handler_data(self) -> None:
147
+ """ Processes data and forms an array for further processing. """
148
+
149
+ temp = 1
150
+ for line in self._full_data:
151
+ if self.__class__.__PATTERN_DATA.findall(line):
152
+ break
153
+ temp += 1
154
+
155
+ names_col = self.__sample_by_allele(
156
+ self._full_data[temp].split(f"{self._delimiter}")
157
+ )
158
+
159
+ if names_col is None:
160
+ raise Exception(f"Error. Allele {self.__allele} not in data.")
161
+
162
+ self.__snp_data = pd.DataFrame(
163
+ [
164
+ item_data.split(f"{self._delimiter}")
165
+ for item_data in self._full_data[temp + 1:]
166
+ ],
167
+ columns=self._full_data[temp].split(f"{self._delimiter}")
168
+ )[names_col]
169
+
170
+ def __sample_by_allele(self, names: list[str]) -> list[str] | None:
171
+ """ Method that generates a list of field names choosing which alleles
172
+ to keep
173
+
174
+ :param names: List of field names in the report file.
175
+ :return: Returns a filtered list of fields by alleles.
176
+ """
177
+
178
+ allele_templ = r'(^Allele\d\s[:-]\s{}\b)'
179
+
180
+ match self.__allele:
181
+ case None:
182
+ return names
183
+
184
+ case str():
185
+ allele_pattern = re.compile(
186
+ allele_templ.format(self.__allele)
187
+ )
188
+
189
+ case list() | tuple() | set():
190
+ allele_pattern = re.compile(
191
+ allele_templ.format("|".join(self.__allele))
192
+ )
193
+ case _:
194
+ return None
195
+
196
+ lst_allele = reduce(
197
+ lambda i, j: i + j,
198
+ [allele_pattern.findall(item) for item in names]
199
+ )
200
+
201
+ if len(lst_allele) == 0:
202
+ return None
203
+
204
+ exclude_alleles = [
205
+ item for item in names
206
+ if item.startswith("Allele") and item not in lst_allele
207
+ ]
208
+
209
+ return list(filter(
210
+ lambda x: True if x not in exclude_alleles else False, names
211
+ ))
212
+
213
+ def __convert_s_id(self, path_file: Path) -> None:
214
+ """Converts sample id which is in FinalReport to animal registration
215
+ number.
216
+
217
+ :param path_file: xlsx file with animal numbers label
218
+ """
219
+
220
+ self._map_rn = pd.read_excel(
221
+ path_file,
222
+ header=None,
223
+ names=['SID', 'UNIQ_KEY', 'SEX'],
224
+ dtype={'SID': str},
225
+ index_col=False
226
+ )
227
+
228
+ if self._map_rn.empty:
229
+ self._map_rn = None
230
+ return
231
+
232
+ self._map_rn.SID = self._map_rn.SID.str.strip()
233
+ self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.str.strip()
234
+
235
+ if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
236
+ raise Exception("Error. Unique keys contain Cyrillic alphabet.")
237
+
238
+ if self._map_rn.UNIQ_KEY.isna().any():
239
+ self._map_rn.fillna('unknown', inplace=True)
240
+
241
+ @staticmethod
242
+ def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
243
+ """
244
+
245
+ :param seq:
246
+ :return:
247
+ """
248
+
249
+ return any(seq.apply(lambda x: bool(re.search('[а-яА-Я]', x))))
250
+
251
+
snplib/format/__init__.py CHANGED
@@ -1,19 +1,19 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- from ._snp import Snp
6
- from ._plink import (
7
- make_map,
8
- make_ped,
9
- make_lgen,
10
- make_fam
11
- )
12
-
13
- __all__ = [
14
- "Snp",
15
- "make_map",
16
- "make_ped",
17
- "make_fam",
18
- "make_lgen"
19
- ]
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from ._snp import Snp
6
+ from ._plink import (
7
+ make_map,
8
+ make_ped,
9
+ make_lgen,
10
+ make_fam
11
+ )
12
+
13
+ __all__ = [
14
+ "Snp",
15
+ "make_map",
16
+ "make_ped",
17
+ "make_fam",
18
+ "make_lgen"
19
+ ]
@@ -1,7 +1,7 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- FIELDS_ILLUMIN = ['SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB']
6
- RENAME_FIELDS = ['SNP_NAME', 'SAMPLE_ID', 'ALLELE1', 'ALLELE2']
7
- MAP_FIELDS = dict(zip(FIELDS_ILLUMIN, RENAME_FIELDS))
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
# Report column names to pick up (presumably the Illumina FinalReport
# naming, judging by the constant's name) and the names they are renamed to.
FIELDS_ILLUMIN = ['SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB']
RENAME_FIELDS = ['SNP_NAME', 'SAMPLE_ID', 'ALLELE1', 'ALLELE2']

# Positional pairing: the n-th report column maps to the n-th new name.
MAP_FIELDS = {
    source: target for source, target in zip(FIELDS_ILLUMIN, RENAME_FIELDS)
}