snplib 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snplib/__init__.py +8 -8
- snplib/finalreport/__init__.py +7 -7
- snplib/finalreport/_finalreport.py +251 -251
- snplib/format/__init__.py +19 -19
- snplib/format/__settings.py +7 -7
- snplib/format/_plink.py +291 -305
- snplib/format/_snp.py +113 -113
- snplib/parentage/__init__.py +15 -15
- snplib/parentage/_discov.py +102 -102
- snplib/parentage/_isagmark.py +15 -15
- snplib/parentage/_verif.py +91 -91
- snplib/parentage/isag_disc.pl +0 -0
- snplib/parentage/isag_verif.pl +0 -0
- snplib/statistics/__init__.py +16 -16
- snplib/statistics/_callrate.py +60 -59
- snplib/statistics/_freq.py +67 -67
- snplib/statistics/_snphwe.py +132 -132
- {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/LICENSE +674 -674
- {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/METADATA +80 -97
- snplib-1.0.9.dist-info/RECORD +22 -0
- snplib/finalreport/tests/__init__.py +0 -7
- snplib/finalreport/tests/test_finalreport.py +0 -215
- snplib/format/tests/__init__.py +0 -7
- snplib/format/tests/test_plink_fam.py +0 -121
- snplib/format/tests/test_plink_lgen.py +0 -106
- snplib/format/tests/test_plink_map.py +0 -42
- snplib/format/tests/test_plink_ped.py +0 -136
- snplib/format/tests/test_snp.py +0 -128
- snplib/parentage/tests/__init__.py +0 -7
- snplib/parentage/tests/test_discov.py +0 -164
- snplib/parentage/tests/test_verif.py +0 -160
- snplib/statistics/tests/__init__.py +0 -7
- snplib/statistics/tests/test_callrate.py +0 -171
- snplib/statistics/tests/test_freq_allele.py +0 -87
- snplib/statistics/tests/test_freq_maf.py +0 -17
- snplib/statistics/tests/test_hwe_t.py +0 -41
- snplib/statistics/tests/test_snphwe.py +0 -41
- snplib-1.0.7.dist-info/RECORD +0 -37
- {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/WHEEL +0 -0
- {snplib-1.0.7.dist-info → snplib-1.0.9.dist-info}/top_level.txt +0 -0
snplib/format/_plink.py
CHANGED
@@ -1,305 +1,291 @@
|
|
1
|
-
#!/usr/bin/env python
|
2
|
-
# coding: utf-8
|
3
|
-
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
-
|
5
|
-
__all__ = [
|
6
|
-
"make_map", "make_ped", "make_fam", "make_lgen"
|
7
|
-
]
|
8
|
-
|
9
|
-
import re
|
10
|
-
import pandas as pd
|
11
|
-
|
12
|
-
|
13
|
-
def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
|
14
|
-
""" PLINK text fileset variant information file
|
15
|
-
https://www.cog-genomics.org/plink/1.9/formats#map
|
16
|
-
|
17
|
-
A text file with no header line, and one line per variant with the
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
2. Variant identifier
|
22
|
-
3. Position in morgans or centimorgans (optional; also safe to use
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
:param
|
94
|
-
|
95
|
-
:param
|
96
|
-
:param
|
97
|
-
:
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
_ped =
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
if
|
117
|
-
|
118
|
-
"
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
_ped[
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
:param
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
return
|
292
|
-
|
293
|
-
|
294
|
-
def _check_underscore(value: str) -> bool:
    """ Report whether a string contains an underscore

    :param value: String to inspect
    :return: True when at least one "_" occurs in the string, False otherwise
    """
    matches = re.findall(r"_", value)
    return len(matches) > 0
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
__all__ = [
|
6
|
+
"make_map", "make_ped", "make_fam", "make_lgen"
|
7
|
+
]
|
8
|
+
|
9
|
+
import re
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
|
13
|
+
def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
    """ PLINK text fileset variant information file
    https://www.cog-genomics.org/plink/1.9/formats#map

    A text file with no header line, and one line per variant with the
    following 3-4 fields:

    1. Chromosome code. PLINK 1.9 also permits contig names here, but most
       older programs do not.
    2. Variant identifier.
    3. Position in morgans or centimorgans (optional; also safe to use dummy
       value of '0').
    4. Base-pair coordinate.

    All lines must have the same number of columns (so either no lines contain
    the morgans/centimorgans column, or all of them do).

    :param manifest: Chip manifest table (e.g. the product files published on
        the Illumina website,
        https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html).
        It must contain the columns 'Chr', 'Name' and 'MapInfo'.
    :return: Data in .map format: columns 'Chr', 'Name', 'morgans', 'MapInfo',
        sorted by 'Name', with rows that have missing values removed.
    :raises KeyError: If any of the required columns is absent.
    """

    fields = ['Chr', 'Name', 'MapInfo']

    # Every field is required: a single missing column is enough to make the
    # map impossible to build, so report it with the dedicated message.
    if any(item not in manifest.columns for item in fields):
        raise KeyError("Manifest has no data to build map format!")

    # Rearrange the columns and replace the names of the sex and mitochondrial
    # chromosomes with their numeric codes.
    permute_cols = (
        manifest[fields]
        .sort_values(by='Name')
        .replace({'X': 30, 'Y': 31, 'MT': 33})
        .dropna(axis=0)
    )

    # Insert the genetic distance column as a dummy zero. A scalar is
    # broadcast to the rows that survived dropna(); sizing a list by
    # len(manifest) fails as soon as any incomplete row has been dropped.
    permute_cols.insert(2, 'morgans', 0)

    return permute_cols
|
54
|
+
|
55
|
+
|
56
|
+
def make_ped(
|
57
|
+
data: pd.DataFrame,
|
58
|
+
sid_col: str,
|
59
|
+
snp_col: str,
|
60
|
+
fid_col: str = None,
|
61
|
+
father_col: str = None,
|
62
|
+
mother_col: str = None,
|
63
|
+
sex_col: str = None,
|
64
|
+
) -> pd.DataFrame | None:
|
65
|
+
""" Original standard text format for sample pedigree information and
|
66
|
+
genotype calls. Normally must be accompanied by a .map file.
|
67
|
+
https://www.cog-genomics.org/plink/1.9/formats#ped
|
68
|
+
|
69
|
+
The PED file has 6 fixed columns at the beginning followed by the SNP
|
70
|
+
information. The columns should be separated by a whitespace or a tab. The
|
71
|
+
first six columns hold the following information:
|
72
|
+
|
73
|
+
1. Family ID (if unknown use the same id as for the sample id in column two)
|
74
|
+
2. Sample ID
|
75
|
+
3. Paternal ID (if unknown use 0)
|
76
|
+
4. Maternal ID (if unknown use 0)
|
77
|
+
5. Sex (1=male; 2=female; 0=unknown)
|
78
|
+
6. Affection (0=unknown; 1=unaffected; 2=affected)
|
79
|
+
7. Genotypes (space or tab separated, 2 for each marker. 0/-9=missing)
|
80
|
+
|
81
|
+
Here is a brief example of a genotype PED file containing 5 samples
|
82
|
+
with 10 homozygous SNPs:
|
83
|
+
|
84
|
+
4304 4304 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
|
85
|
+
6925 6925 0 0 0 0 C C C C T T G G A A C C G G C C T T T T
|
86
|
+
7319 7319 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
|
87
|
+
6963 6963 0 0 0 0 A A C C T T G G A A C C G G C C T T T T
|
88
|
+
6968 6968 0 0 0 0 C C C C G G G G G G G G G G C C T T T T
|
89
|
+
|
90
|
+
:param data: Snp data that contain full or partial information on the animal.
|
91
|
+
:param sid_col: Sample ID. Column name in data.
|
92
|
+
:param snp_col: Snp column name in data.
|
93
|
+
:param fid_col: Family ID column name in data (if unknown use the same id as for the sample id in column two).
|
94
|
+
:param father_col: Paternal ID column name in data (if unknown use 0).
|
95
|
+
:param mother_col: Maternal ID column name in data (if unknown use 0).
|
96
|
+
:param sex_col: Sex column name in data (if unknown use 0).
|
97
|
+
:return: Returns an array of data in ped format to work with the plink program.
|
98
|
+
"""
|
99
|
+
|
100
|
+
_fields = ["fid", "sid", "father", "mother", "sex", "not_used", "snp"]
|
101
|
+
_f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
|
102
|
+
|
103
|
+
_ped = pd.DataFrame(columns=_fields)
|
104
|
+
|
105
|
+
if sid_col not in data.columns or snp_col not in data.columns:
|
106
|
+
raise KeyError(f"Data has not in name columns!")
|
107
|
+
|
108
|
+
# Checked Sample ID on underscope - '_'
|
109
|
+
_ped["sid"] = data[sid_col].astype(str)
|
110
|
+
if _ped["sid"].apply(_check_underscore).any():
|
111
|
+
raise Exception(
|
112
|
+
"Replace in 'Sample ID' columns '_' on another a simbols"
|
113
|
+
)
|
114
|
+
|
115
|
+
# Checked Family ID on underscope - '_'
|
116
|
+
if fid_col is not None:
|
117
|
+
if fid_col not in data.columns:
|
118
|
+
raise KeyError(f"Data has not in name columns {fid_col}!")
|
119
|
+
|
120
|
+
if (data[fid_col].dtype.hasobject and
|
121
|
+
data[fid_col].apply(_check_underscore).any()):
|
122
|
+
raise Exception(
|
123
|
+
"Replace in 'Family ID' columns '_' on another a simbols"
|
124
|
+
)
|
125
|
+
|
126
|
+
_ped["fid"] = data[fid_col]
|
127
|
+
|
128
|
+
else:
|
129
|
+
_ped["fid"] = data[sid_col].astype(str)
|
130
|
+
|
131
|
+
_ped["father"] = data[father_col] if father_col is not None else 0
|
132
|
+
_ped["mother"] = data[mother_col] if mother_col is not None else 0
|
133
|
+
_ped["sex"] = data[sex_col] if sex_col is not None else 0
|
134
|
+
_ped["not_used"] = 0
|
135
|
+
_ped["snp"] = data[snp_col]
|
136
|
+
|
137
|
+
return _ped[_fields].astype(_f_dtype)
|
138
|
+
|
139
|
+
|
140
|
+
def make_fam(
|
141
|
+
data: pd.DataFrame,
|
142
|
+
sid_col: str,
|
143
|
+
fid_col: str = None,
|
144
|
+
father_col: str = None,
|
145
|
+
mother_col: str = None,
|
146
|
+
sex_col: str = None,
|
147
|
+
sex_val: int = 0,
|
148
|
+
pheno_col: str = None,
|
149
|
+
pheno_val: int = -9
|
150
|
+
) -> pd.DataFrame | None:
|
151
|
+
""" PLINK sample information file
|
152
|
+
https://www.cog-genomics.org/plink/1.9/formats#fam
|
153
|
+
|
154
|
+
A text file with no header line, and one line per sample with the following
|
155
|
+
six fields:
|
156
|
+
|
157
|
+
1. Family ID ('FID')
|
158
|
+
2. Within-family ID ('IID'; cannot be '0')
|
159
|
+
3. Within-family ID of father ('0' if father isn't in dataset)
|
160
|
+
4. Within-family ID of mother ('0' if mother isn't in dataset)
|
161
|
+
5. Sex code ('1' = male, '2' = female, '0' = unknown)
|
162
|
+
6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)
|
163
|
+
|
164
|
+
:param data: Snp data that contain full or partial information on the animal
|
165
|
+
:param fid_col: Family ID, default value "1". Must not contain underline - "_"
|
166
|
+
:param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain underline - "_"
|
167
|
+
:param father_col: Within-family ID of father ('0' if father isn't in dataset)
|
168
|
+
:param mother_col: Within-family ID of mother ('0' if mother isn't in dataset)
|
169
|
+
:param sex_col: Sex column name in data
|
170
|
+
:param sex_val: Sex code ('1' = male, '2' = female, '0' = unknown)
|
171
|
+
:param pheno_col: Pheno column name in data
|
172
|
+
:param pheno_val: Phenotype value ('1' = control, '2' = case,'-9'/'0'/non-numeric = missing data if case/control)
|
173
|
+
:return: Return data in formate .fam
|
174
|
+
"""
|
175
|
+
|
176
|
+
_fields = ['fid', 'sid', 'father', 'mother', 'sex', 'pheno']
|
177
|
+
_f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
|
178
|
+
|
179
|
+
_fam = pd.DataFrame(columns=_fields)
|
180
|
+
|
181
|
+
if sid_col not in data.columns:
|
182
|
+
raise KeyError(f"Data has not in name columns {sid_col}!")
|
183
|
+
|
184
|
+
# Checked Sample ID on underscope - '_'
|
185
|
+
_fam["sid"] = data[sid_col].astype(str)
|
186
|
+
if _fam["sid"].apply(_check_underscore).any():
|
187
|
+
raise Exception(
|
188
|
+
"Replace in 'Sample ID' columns '_' on another a simbols"
|
189
|
+
)
|
190
|
+
|
191
|
+
# Checked Family ID on underscope - '_'
|
192
|
+
if fid_col is not None:
|
193
|
+
if fid_col not in data.columns:
|
194
|
+
raise KeyError(f"Data has not in name columns {fid_col}!")
|
195
|
+
|
196
|
+
if (data[fid_col].dtype.hasobject and
|
197
|
+
data[fid_col].apply(_check_underscore).any()):
|
198
|
+
raise Exception(
|
199
|
+
"Replace in 'Family ID' columns '_' on another a simbols"
|
200
|
+
)
|
201
|
+
|
202
|
+
_fam["fid"] = data[fid_col]
|
203
|
+
|
204
|
+
else:
|
205
|
+
_fam["fid"] = 1
|
206
|
+
|
207
|
+
_fam["father"] = data[father_col] if father_col is not None else 0
|
208
|
+
_fam["mother"] = data[mother_col] if mother_col is not None else 0
|
209
|
+
_fam["sex"] = data[sex_col] if sex_col is not None else sex_val
|
210
|
+
_fam['pheno'] = data[pheno_col] if pheno_col is not None else pheno_val
|
211
|
+
|
212
|
+
return _fam[_fields].astype(_f_dtype)
|
213
|
+
|
214
|
+
|
215
|
+
def make_lgen(
|
216
|
+
data: pd.DataFrame,
|
217
|
+
sid_col: str,
|
218
|
+
snp_name: str,
|
219
|
+
alleles: list[str],
|
220
|
+
fid_col: str = None
|
221
|
+
) -> pd.DataFrame | None:
|
222
|
+
""" PLINK long-format genotype file
|
223
|
+
https://www.cog-genomics.org/plink/1.9/formats#lgen
|
224
|
+
|
225
|
+
A text file with no header line, and one line per genotype call (or
|
226
|
+
just not-homozygous-major calls if 'lgen-ref' was invoked) usually with
|
227
|
+
the following five fields:
|
228
|
+
|
229
|
+
1. Family ID
|
230
|
+
2. Within-family ID
|
231
|
+
3. Variant identifier
|
232
|
+
4. Allele call 1 ('0' for missing)
|
233
|
+
5. Allele call 2
|
234
|
+
|
235
|
+
There are several variations which are also handled by PLINK; see the
|
236
|
+
original discussion for details.
|
237
|
+
|
238
|
+
:param data: Data the after parsing FinalReport.txt
|
239
|
+
:param sid_col:
|
240
|
+
:param snp_name:
|
241
|
+
:param fid_col: Family ID, default value "1"
|
242
|
+
:param alleles:
|
243
|
+
:return: Return data in formate .lgen
|
244
|
+
"""
|
245
|
+
_fields = ['fid', 'sid', 'snp_name', 'allele1', 'allele2']
|
246
|
+
_f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
|
247
|
+
|
248
|
+
_lgen = pd.DataFrame(columns=_fields)
|
249
|
+
|
250
|
+
try:
|
251
|
+
# Checked Sample ID on underscope - '_'
|
252
|
+
_lgen["sid"] = data[sid_col].astype(str)
|
253
|
+
if _lgen["sid"].apply(_check_underscore).any():
|
254
|
+
raise Exception(
|
255
|
+
"Replace in 'Sample ID' columns '_' on another a simbols"
|
256
|
+
)
|
257
|
+
|
258
|
+
# Checked Family ID on underscope - '_'
|
259
|
+
if fid_col is not None:
|
260
|
+
if (data[fid_col].dtype.hasobject and
|
261
|
+
data[fid_col].apply(_check_underscore).any()):
|
262
|
+
raise Exception(
|
263
|
+
"Replace in 'Family ID' columns '_' on another a simbols"
|
264
|
+
)
|
265
|
+
|
266
|
+
_lgen["fid"] = data[fid_col]
|
267
|
+
|
268
|
+
else:
|
269
|
+
_lgen["fid"] = 1
|
270
|
+
|
271
|
+
_lgen["snp_name"] = data[snp_name]
|
272
|
+
_lgen[["allele1", "allele2"]] = data[alleles].replace({'-': 0})
|
273
|
+
|
274
|
+
except Exception as e:
|
275
|
+
raise e
|
276
|
+
|
277
|
+
return _lgen[_fields].astype(_f_dtype)
|
278
|
+
|
279
|
+
|
280
|
+
def _check_underscore(value: str) -> bool:
    """ Tell whether the given string has an underscore in it

    :param value: String to be checked
    :return: True if the string contains an underscore, False if it does not.
    """
    return re.search(r"_", value) is not None
|