snplib 1.0.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- finalreport/__init__.py +7 -0
- finalreport/_finalreport.py +251 -0
- finalreport/tests/__init__.py +7 -0
- finalreport/tests/test_finalreport.py +215 -0
- format/__init__.py +19 -0
- format/__settings.py +7 -0
- format/_plink.py +305 -0
- format/_snp.py +113 -0
- format/tests/__init__.py +7 -0
- format/tests/test_plink_fam.py +121 -0
- format/tests/test_plink_lgen.py +106 -0
- format/tests/test_plink_map.py +42 -0
- format/tests/test_plink_ped.py +136 -0
- format/tests/test_snp.py +128 -0
- parentage/__init__.py +15 -0
- parentage/_discov.py +102 -0
- parentage/_isagmark.py +15 -0
- parentage/_verif.py +91 -0
- parentage/tests/__init__.py +7 -0
- parentage/tests/test_discov.py +164 -0
- parentage/tests/test_verif.py +160 -0
- snplib-1.0.0.dist-info/LICENSE +674 -0
- snplib-1.0.0.dist-info/METADATA +89 -0
- snplib-1.0.0.dist-info/RECORD +36 -0
- snplib-1.0.0.dist-info/WHEEL +5 -0
- snplib-1.0.0.dist-info/top_level.txt +4 -0
- statistics/__init__.py +16 -0
- statistics/_callrate.py +59 -0
- statistics/_freq.py +67 -0
- statistics/_snphwe.py +132 -0
- statistics/tests/__init__.py +7 -0
- statistics/tests/test_callrate.py +171 -0
- statistics/tests/test_freq_allele.py +87 -0
- statistics/tests/test_freq_maf.py +17 -0
- statistics/tests/test_hwe_t.py +41 -0
- statistics/tests/test_snphwe.py +41 -0
format/_plink.py
ADDED
@@ -0,0 +1,305 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
__all__ = [
|
6
|
+
"make_map", "make_ped", "make_fam", "make_lgen"
|
7
|
+
]
|
8
|
+
|
9
|
+
import re
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
|
13
|
+
def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
    """ PLINK text fileset variant information file
    https://www.cog-genomics.org/plink/1.9/formats#map

    A text file with no header line, and one line per variant with the
    following 3-4 fields:

    1. Chromosome code. PLINK 1.9 also permits contig names here, but most
        older programs do not.
    2. Variant identifier
    3. Position in morgans or centimorgans (optional; also safe to use
        dummy value of '0')
    4. Base-pair coordinate

    All lines must have the same number of columns (so either no lines
    contain the morgans/centimorgans column, or all of them do).

    :param manifest: The file that is taken on the Illumina website with full
        information about the chip
        https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html
        It must contain the columns 'Chr', 'Name' and 'MapInfo'.

    :return: Data in .map format with the columns 'Chr', 'Name', 'morgans'
        and 'MapInfo', sorted by 'Name'. Rows with a missing value in any of
        the required columns are dropped.
    :raises KeyError: If at least one of the required columns is missing
        from the manifest.
    """

    fields = ['Chr', 'Name', 'MapInfo']

    # Every required column has to be present; a partially complete
    # manifest is reported with the same message as an empty one.
    if any(item not in manifest.columns for item in fields):
        raise KeyError("Manifest has no data to build map format!")

    # Rearrange the columns and replace the names of the sex and mitochondrial
    # chromosomes
    permute_cols = manifest[fields].\
        sort_values(by='Name').\
        replace({'X': 30, 'Y': 31, 'MT': 33}).\
        dropna(axis=0)

    # Insert the dummy distance in centimorgans. The column is sized from the
    # filtered frame because dropna() may have removed rows of the manifest.
    permute_cols.insert(2, 'morgans', [0] * len(permute_cols))

    return permute_cols
|
57
|
+
|
58
|
+
|
59
|
+
def make_ped(
|
60
|
+
data: pd.DataFrame,
|
61
|
+
sid_col: str,
|
62
|
+
snp_col: str,
|
63
|
+
fid_col: str = None,
|
64
|
+
father_col: str = None,
|
65
|
+
mother_col: str = None,
|
66
|
+
sex_col: str = None,
|
67
|
+
) -> pd.DataFrame | None:
|
68
|
+
""" Original standard text format for sample pedigree information and
|
69
|
+
genotype calls. Normally must be accompanied by a .map file.
|
70
|
+
https://www.cog-genomics.org/plink/1.9/formats#ped
|
71
|
+
|
72
|
+
The PED file has 6 fixed columns at the beginning followed by the SNP
|
73
|
+
information. The columns should be separated by a whitespace or a tab. The
|
74
|
+
first six columns hold the following information:
|
75
|
+
|
76
|
+
1. Family ID (if unknown use the same id as for the sample id in
|
77
|
+
column two)
|
78
|
+
2. Sample ID
|
79
|
+
3. Paternal ID (if unknown use 0)
|
80
|
+
4. Maternal ID (if unknown use 0)
|
81
|
+
5. Sex (1=male; 2=female; 0=unknown)
|
82
|
+
6. Affection (0=unknown; 1=unaffected; 2=affected)
|
83
|
+
7. Genotypes (space or tab separated, 2 for each marker. 0/-9=missing)
|
84
|
+
|
85
|
+
Here is a brief example of a genotype PED file containing 5 samples
|
86
|
+
with 10 homozygous SNPs:
|
87
|
+
4304 4304 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
|
88
|
+
6925 6925 0 0 0 0 C C C C T T G G A A C C G G C C T T T T
|
89
|
+
7319 7319 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
|
90
|
+
6963 6963 0 0 0 0 A A C C T T G G A A C C G G C C T T T T
|
91
|
+
6968 6968 0 0 0 0 C C C C G G G G G G G G G G C C T T T T
|
92
|
+
|
93
|
+
:param data: Snp data that contain full or partial information on the
|
94
|
+
animal
|
95
|
+
:param sid_col: Sample ID. Column name in data
|
96
|
+
:param snp_col: Snp column name in data
|
97
|
+
:param fid_col: Family ID column name in data (if unknown use the same
|
98
|
+
id as for the sample id in column two)
|
99
|
+
:param father_col: Paternal ID column name in data (if unknown use 0)
|
100
|
+
:param mother_col: Maternal ID column name in data (if unknown use 0)
|
101
|
+
:param sex_col: Sex column name in data (if unknown use 0)
|
102
|
+
:return: Returns an array of data in ped format to work with the plink
|
103
|
+
program
|
104
|
+
"""
|
105
|
+
|
106
|
+
_fields = ["fid", "sid", "father", "mother", "sex", "not_used", "snp"]
|
107
|
+
_f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
|
108
|
+
|
109
|
+
_ped = pd.DataFrame(columns=_fields)
|
110
|
+
|
111
|
+
if sid_col not in data.columns or snp_col not in data.columns:
|
112
|
+
raise KeyError(f"Data has not in name columns!")
|
113
|
+
|
114
|
+
# Checked Sample ID on underscope - '_'
|
115
|
+
_ped["sid"] = data[sid_col].astype(str)
|
116
|
+
if _ped["sid"].apply(_check_underscore).any():
|
117
|
+
raise Exception(
|
118
|
+
"Replace in 'Sample ID' columns '_' on another a simbols"
|
119
|
+
)
|
120
|
+
|
121
|
+
# Checked Family ID on underscope - '_'
|
122
|
+
if fid_col is not None:
|
123
|
+
if fid_col not in data.columns:
|
124
|
+
raise KeyError(f"Data has not in name columns {fid_col}!")
|
125
|
+
|
126
|
+
if (data[fid_col].dtype.hasobject and
|
127
|
+
data[fid_col].apply(_check_underscore).any()):
|
128
|
+
raise Exception(
|
129
|
+
"Replace in 'Family ID' columns '_' on another a simbols"
|
130
|
+
)
|
131
|
+
|
132
|
+
_ped["fid"] = data[fid_col]
|
133
|
+
|
134
|
+
else:
|
135
|
+
_ped["fid"] = data[sid_col].astype(str)
|
136
|
+
|
137
|
+
_ped["father"] = data[father_col] if father_col is not None else 0
|
138
|
+
_ped["mother"] = data[mother_col] if mother_col is not None else 0
|
139
|
+
_ped["sex"] = data[sex_col] if sex_col is not None else 0
|
140
|
+
_ped["not_used"] = 0
|
141
|
+
_ped["snp"] = data[snp_col]
|
142
|
+
|
143
|
+
return _ped[_fields].astype(_f_dtype)
|
144
|
+
|
145
|
+
|
146
|
+
def make_fam(
|
147
|
+
data: pd.DataFrame,
|
148
|
+
sid_col: str,
|
149
|
+
fid_col: str = None,
|
150
|
+
father_col: str = None,
|
151
|
+
mother_col: str = None,
|
152
|
+
sex_col: str = None,
|
153
|
+
sex_val: int = 0,
|
154
|
+
pheno_col: str = None,
|
155
|
+
pheno_val: int = -9
|
156
|
+
|
157
|
+
) -> pd.DataFrame | None:
|
158
|
+
""" PLINK sample information file
|
159
|
+
https://www.cog-genomics.org/plink/1.9/formats#fam
|
160
|
+
|
161
|
+
A text file with no header line, and one line per sample with the
|
162
|
+
following six fields:
|
163
|
+
|
164
|
+
1. Family ID ('FID')
|
165
|
+
2. Within-family ID ('IID'; cannot be '0')
|
166
|
+
3. Within-family ID of father ('0' if father isn't in dataset)
|
167
|
+
4. Within-family ID of mother ('0' if mother isn't in dataset)
|
168
|
+
5. Sex code ('1' = male, '2' = female, '0' = unknown)
|
169
|
+
6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric =
|
170
|
+
missing data if case/control)
|
171
|
+
|
172
|
+
:param data: Snp data that contain full or partial information on the
|
173
|
+
animal
|
174
|
+
:param fid_col: Family ID, default value "1". Must not contain
|
175
|
+
underline - "_"
|
176
|
+
:param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain
|
177
|
+
underline - "_"
|
178
|
+
:param father_col: Within-family ID of father ('0' if father isn't in
|
179
|
+
dataset)
|
180
|
+
:param mother_col: Within-family ID of mother ('0' if mother isn't in
|
181
|
+
dataset)
|
182
|
+
:param sex_col: Sex column name in data
|
183
|
+
:param sex_val: Sex code ('1' = male, '2' = female, '0' = unknown)
|
184
|
+
:param pheno_col: Pheno column name in data
|
185
|
+
:param pheno_val: Phenotype value ('1' = control, '2' = case,
|
186
|
+
'-9'/'0'/non-numeric = missing data if case/control)
|
187
|
+
:return: Return data in formate .fam
|
188
|
+
"""
|
189
|
+
|
190
|
+
_fields = ['fid', 'sid', 'father', 'mother', 'sex', 'pheno']
|
191
|
+
_f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
|
192
|
+
|
193
|
+
_fam = pd.DataFrame(columns=_fields)
|
194
|
+
|
195
|
+
if sid_col not in data.columns:
|
196
|
+
raise KeyError(f"Data has not in name columns {sid_col}!")
|
197
|
+
|
198
|
+
# Checked Sample ID on underscope - '_'
|
199
|
+
_fam["sid"] = data[sid_col].astype(str)
|
200
|
+
if _fam["sid"].apply(_check_underscore).any():
|
201
|
+
raise Exception(
|
202
|
+
"Replace in 'Sample ID' columns '_' on another a simbols"
|
203
|
+
)
|
204
|
+
|
205
|
+
# Checked Family ID on underscope - '_'
|
206
|
+
if fid_col is not None:
|
207
|
+
if fid_col not in data.columns:
|
208
|
+
raise KeyError(f"Data has not in name columns {fid_col}!")
|
209
|
+
|
210
|
+
if (data[fid_col].dtype.hasobject and
|
211
|
+
data[fid_col].apply(_check_underscore).any()):
|
212
|
+
raise Exception(
|
213
|
+
"Replace in 'Family ID' columns '_' on another a simbols"
|
214
|
+
)
|
215
|
+
|
216
|
+
_fam["fid"] = data[fid_col]
|
217
|
+
|
218
|
+
else:
|
219
|
+
_fam["fid"] = 1
|
220
|
+
|
221
|
+
_fam["father"] = data[father_col] if father_col is not None else 0
|
222
|
+
_fam["mother"] = data[mother_col] if mother_col is not None else 0
|
223
|
+
_fam["sex"] = data[sex_col] if sex_col is not None else sex_val
|
224
|
+
_fam['pheno'] = data[pheno_col] if pheno_col is not None else pheno_val
|
225
|
+
|
226
|
+
return _fam[_fields].astype(_f_dtype)
|
227
|
+
|
228
|
+
|
229
|
+
def make_lgen(
|
230
|
+
data: pd.DataFrame,
|
231
|
+
sid_col: str,
|
232
|
+
snp_name: str,
|
233
|
+
alleles: list[str],
|
234
|
+
fid_col: str = None
|
235
|
+
) -> pd.DataFrame | None:
|
236
|
+
""" PLINK long-format genotype file
|
237
|
+
https://www.cog-genomics.org/plink/1.9/formats#lgen
|
238
|
+
|
239
|
+
A text file with no header line, and one line per genotype call (or
|
240
|
+
just not-homozygous-major calls if 'lgen-ref' was invoked) usually with
|
241
|
+
the following five fields:
|
242
|
+
|
243
|
+
1. Family ID
|
244
|
+
2. Within-family ID
|
245
|
+
3. Variant identifier
|
246
|
+
4. Allele call 1 ('0' for missing)
|
247
|
+
5. Allele call 2
|
248
|
+
|
249
|
+
There are several variations which are also handled by PLINK; see the
|
250
|
+
original discussion for details.
|
251
|
+
|
252
|
+
:param data: Data the after parsing FinalReport.txt
|
253
|
+
:param sid_col:
|
254
|
+
:param snp_name:
|
255
|
+
:param fid_col: Family ID, default value "1"
|
256
|
+
:param alleles:
|
257
|
+
:return: - Return data in formate .lgen
|
258
|
+
"""
|
259
|
+
_fields = ['fid', 'sid', 'snp_name', 'allele1', 'allele2']
|
260
|
+
_f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
|
261
|
+
|
262
|
+
_lgen = pd.DataFrame(columns=_fields)
|
263
|
+
|
264
|
+
try:
|
265
|
+
# Checked Sample ID on underscope - '_'
|
266
|
+
_lgen["sid"] = data[sid_col].astype(str)
|
267
|
+
if _lgen["sid"].apply(_check_underscore).any():
|
268
|
+
raise Exception(
|
269
|
+
"Replace in 'Sample ID' columns '_' on another a simbols"
|
270
|
+
)
|
271
|
+
|
272
|
+
# Checked Family ID on underscope - '_'
|
273
|
+
if fid_col is not None:
|
274
|
+
if (data[fid_col].dtype.hasobject and
|
275
|
+
data[fid_col].apply(_check_underscore).any()):
|
276
|
+
raise Exception(
|
277
|
+
"Replace in 'Family ID' columns '_' on another a simbols"
|
278
|
+
)
|
279
|
+
|
280
|
+
_lgen["fid"] = data[fid_col]
|
281
|
+
|
282
|
+
else:
|
283
|
+
_lgen["fid"] = 1
|
284
|
+
|
285
|
+
_lgen["snp_name"] = data[snp_name]
|
286
|
+
_lgen[["allele1", "allele2"]] = data[alleles].replace({'-': 0})
|
287
|
+
|
288
|
+
except Exception as e:
|
289
|
+
raise e
|
290
|
+
|
291
|
+
return _lgen[_fields].astype(_f_dtype)
|
292
|
+
|
293
|
+
|
294
|
+
def _check_underscore(value: str) -> bool:
|
295
|
+
""" Checking for underscore in a string
|
296
|
+
|
297
|
+
:param value: String for checked
|
298
|
+
:return: Return True if there is an underline in the string, False if not
|
299
|
+
"""
|
300
|
+
_under_l = re.compile(r"_")
|
301
|
+
|
302
|
+
if _under_l.findall(value):
|
303
|
+
return True
|
304
|
+
|
305
|
+
return False
|
format/_snp.py
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
# !/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
5
|
+
|
6
|
+
from pathlib import Path
|
7
|
+
from .__settings import FIELDS_ILLUMIN, MAP_FIELDS
|
8
|
+
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
class Snp(object):
|
13
|
+
""" The process of converting genomic map data - FinalReport.txt obtained
|
14
|
+
from Illumin. Recoding allele data into quantitative data, saving in the
|
15
|
+
format necessary for calculating gblup on blupf90.
|
16
|
+
|
17
|
+
:argument fmt: Data format to use snp in plink and blupf90. Default
|
18
|
+
value "uga". """
|
19
|
+
|
20
|
+
_ALLELE_CODE = {
|
21
|
+
'AA': 0, 'AB': 1, 'BA': 1, 'BB': 2, '--': 5
|
22
|
+
}
|
23
|
+
|
24
|
+
_FIELDS = ['SNP_NAME', 'SAMPLE_ID', 'SNP']
|
25
|
+
_F_DTYPE = dict(zip(_FIELDS, (str for _ in range(len(_FIELDS)))))
|
26
|
+
|
27
|
+
def __init__(self, fmt: str | None = "uga") -> None:
|
28
|
+
self._format_data = fmt
|
29
|
+
self.__data_snp = None
|
30
|
+
|
31
|
+
@property
|
32
|
+
def data(self) -> pd.DataFrame | None:
|
33
|
+
return self.__data_snp
|
34
|
+
|
35
|
+
def process(self, data: pd.DataFrame) -> None:
|
36
|
+
""" Data processing and formatting. Calculation of statistical
|
37
|
+
information
|
38
|
+
|
39
|
+
:param data: Data from FinalReport file. Example:
|
40
|
+
SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
|
41
|
+
ABCA12 14814 A A 0.4048 0.8164
|
42
|
+
ARS-BFGL-BAC-13031 14814 B B 0.9083 0.8712
|
43
|
+
ARS-BFGL-BAC-13039 14814 A A 0.9005 0.9096
|
44
|
+
ARS-BFGL-BAC-13049 14814 A B 0.9295 0.8926
|
45
|
+
|
46
|
+
:return: Returns true if the data was formatted successfully and
|
47
|
+
statistical information was calculated, false if an error.
|
48
|
+
"""
|
49
|
+
|
50
|
+
if not all(list(map(lambda x: x in data.columns, FIELDS_ILLUMIN))):
|
51
|
+
raise KeyError(
|
52
|
+
'The name of the fields does not match the finalreport.txt '
|
53
|
+
'file from Illumina'
|
54
|
+
)
|
55
|
+
|
56
|
+
self.__data_snp = data.rename(columns=MAP_FIELDS)
|
57
|
+
self.__data_snp['SNP'] = \
|
58
|
+
self.__data_snp[['ALLELE1', 'ALLELE2']].\
|
59
|
+
sum(axis=1).\
|
60
|
+
map(Snp._ALLELE_CODE)
|
61
|
+
|
62
|
+
self.__data_snp = self.__data_snp[Snp._FIELDS].astype(Snp._F_DTYPE)
|
63
|
+
|
64
|
+
if self._format_data is not None and self._format_data == "uga":
|
65
|
+
self.__data_snp = self._format_uga(
|
66
|
+
self.__data_snp[['SAMPLE_ID', 'SNP']]
|
67
|
+
)
|
68
|
+
|
69
|
+
@staticmethod
|
70
|
+
def _format_uga(data: pd.DataFrame) -> pd.DataFrame:
|
71
|
+
""" Data format to use snp in plink and blupf90. """
|
72
|
+
|
73
|
+
return data.groupby(by='SAMPLE_ID').sum().reset_index()
|
74
|
+
|
75
|
+
def to_file(self, file_path: str | Path) -> None:
|
76
|
+
""" Saving data to a file.
|
77
|
+
|
78
|
+
:param file_path: Path to file
|
79
|
+
"""
|
80
|
+
|
81
|
+
if isinstance(file_path, str):
|
82
|
+
file_path = Path(file_path)
|
83
|
+
|
84
|
+
if self._format_data is not None and self._format_data == "uga":
|
85
|
+
|
86
|
+
max_len = self.__data_snp["SAMPLE_ID"].str.len().max()
|
87
|
+
|
88
|
+
self.__data_snp.\
|
89
|
+
apply(
|
90
|
+
lambda x: " ".join([
|
91
|
+
self._add_space(x.iloc[0], max_len), x.iloc[1]
|
92
|
+
]),
|
93
|
+
axis=1
|
94
|
+
).\
|
95
|
+
to_csv(file_path, index=False, header=False)
|
96
|
+
|
97
|
+
self.__data_snp["SAMPLE_ID"] = \
|
98
|
+
self.__data_snp["SAMPLE_ID"].str.strip()
|
99
|
+
|
100
|
+
return None
|
101
|
+
|
102
|
+
self.__data_snp.to_csv(file_path, sep=" ", index=False)
|
103
|
+
|
104
|
+
@staticmethod
|
105
|
+
def _add_space(value: str, max_len: int) -> str:
|
106
|
+
""" Adding spaces up to the maximum length of the value in the
|
107
|
+
sample_id data.
|
108
|
+
|
109
|
+
:param value: Sample_id value
|
110
|
+
:param max_len: Max len sample_id value
|
111
|
+
:return: Return replacing value
|
112
|
+
"""
|
113
|
+
return "".join([value, " " * (max_len - len(value))])
|
format/tests/__init__.py
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_FILES
|
6
|
+
from .. import make_fam
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data_fam(request) -> pd.DataFrame | None:
    """ Load the pickled frame from ``fplink/fam`` under DIR_FILES whose
    file name is supplied through indirect parametrization. """
    pickle_path = DIR_FILES / f"fplink/fam/{request.param}"
    return pd.read_pickle(pickle_path)
|
15
|
+
|
16
|
+
|
17
|
+
class TestPlinkFormatPed(object):
    """ Tests of make_fam (PLINK .fam sample information). """

    @pytest.mark.parametrize("data_fam", ["file.pl"], indirect=True)
    def test_fam_true(self, data_fam: pd.DataFrame) -> None:
        # Default family id (fid_col omitted)
        assert not make_fam(
            data_fam,
            "SAMPLE_ID",
        ).empty

        # Family id taken from a column of the data
        assert not make_fam(
            data_fam,
            "SAMPLE_ID",
            "SAMPLE_ID"
        ).empty

    def test_fam_empty(self) -> None:
        assert make_fam(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
        ).empty

        assert make_fam(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
            "SAMPLE_ID",
        ).empty

    @pytest.mark.parametrize("data_fam", ["file.pl"], indirect=True)
    def test_fam_raise_columns(self, data_fam: pd.DataFrame) -> None:
        # SID_COL
        with pytest.raises(
                KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID1",
                "SAMPLE_ID",
            )

        # FID_COL
        with pytest.raises(
                KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "SAMPLE_ID1"
            )

    @pytest.mark.parametrize("data_fam", ["file2.pl"], indirect=True)
    def test_fam_raises_underscope_sid(self, data_fam: pd.DataFrame) -> None:

        # SID_COL
        with pytest.raises(
                Exception,
                match="Replace in 'Sample ID' columns '_' on another a simbols"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "SAMPLE_ID"
            )

    @pytest.mark.parametrize("data_fam", ["file3.pl"], indirect=True)
    def test_fam_raises_underscope_fid(self, data_fam: pd.DataFrame) -> None:

        # FID_COL
        with pytest.raises(
                Exception,
                match="Replace in 'Family ID' columns '_' on another a simbols"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "FAMILY_ID"
            )

    @pytest.mark.parametrize("data_fam", ["file4.pl"], indirect=True)
    def test_fam_check_data(self, data_fam: pd.DataFrame) -> None:
        # All optional columns supplied from the data
        res = make_fam(
            data_fam,
            "SAMPLE_ID",
            "FAMILY_ID",
            father_col="father",
            mother_col="mother",
            sex_col="sex",
            pheno_col="pheno"
        )

        # Optional columns omitted: the defaults have to be used
        res2 = make_fam(
            data_fam,
            "SAMPLE_ID",
            "FAMILY_ID",
        )

        assert all(res.father.values == list('1234'))
        assert all(res.mother.values == list('5678'))
        assert all(res.sex.values == list('1210'))
        assert all(res.pheno.values == ['12', '13', '14', '15'])

        assert all(res2.father.values == list('0000'))
        assert all(res2.mother.values == list('0000'))
        assert all(res2.sex.values == list('0000'))
        assert all(res2.pheno.values == ['-9', '-9', '-9', '-9'])
|
@@ -0,0 +1,106 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_FILES
|
6
|
+
from .. import make_lgen
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data_lgen(request) -> pd.DataFrame:
    """ Load the pickled frame from ``fplink/lgen`` under DIR_FILES whose
    file name is supplied through indirect parametrization. """
    pickle_path = DIR_FILES / f"fplink/lgen/{request.param}"
    return pd.read_pickle(pickle_path)
|
15
|
+
|
16
|
+
|
17
|
+
class TestPlinkFormatLgen(object):
    """ Tests of make_lgen (PLINK long-format genotype file). """

    @pytest.mark.parametrize("data_lgen", ["file.pl"], indirect=True)
    def test_lgen_true(self, data_lgen: pd.DataFrame) -> None:
        assert not make_lgen(
            data_lgen,
            "Sample ID",
            "SNP Name",
            ["Allele1 - AB", "Allele2 - AB"]
        ).empty

    def test_lgen_empty(self) -> None:
        assert make_lgen(
            pd.DataFrame(columns=[
                "Sample ID", "SNP Name", "Allele1 - AB", "Allele2 - AB"
            ]),
            "Sample ID",
            "SNP Name",
            ["Allele1 - AB", "Allele2 - AB"]
        ).empty

    @pytest.mark.parametrize("data_lgen", ["file.pl"], indirect=True)
    def test_lgen_raise_columns(self, data_lgen: pd.DataFrame) -> None:

        # The input frames are prepared outside of the pytest.raises blocks
        # so that only a failure of make_lgen itself can satisfy them.
        bad_sid = data_lgen.copy(deep=True)
        bad_sid["Sample ID"] = bad_sid["Sample ID"] + "_"

        with pytest.raises(
                Exception,
                match="Replace in 'Sample ID' columns '_' on another a simbols"
        ):
            make_lgen(
                bad_sid,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"]
            )

        bad_fid = data_lgen.copy(deep=True)
        bad_fid["Family ID"] = bad_fid["Sample ID"] + "_"

        with pytest.raises(
                Exception,
                match="Replace in 'Family ID' columns '_' on another a simbols"
        ):
            make_lgen(
                bad_fid,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # SID
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID1",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # FID_COL
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # SNP name
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name1",
                ["Allele1 - AB", "Allele2 - AB"]
            )

        # Alleles
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB1", "Allele2 - AB1"]
            )
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_FILES
|
6
|
+
from .. import make_map
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data_map() -> pd.DataFrame:
    """ Read the manifest ``fplink/map/file_bovinesnp50.csv`` found under
    DIR_FILES. """
    manifest_path = DIR_FILES / "fplink/map/file_bovinesnp50.csv"
    return pd.read_csv(manifest_path)
|
15
|
+
|
16
|
+
|
17
|
+
class TestPlinkFormatMap(object):
    """ Tests of make_map (PLINK .map variant information). """

    def test_map_true(self, data_map) -> None:

        res = make_map(data_map)
        assert not res.empty

    def test_map_raise(self, data_map) -> None:
        # A complete manifest must be accepted without an error ...
        make_map(data_map)

        # ... while a manifest without the required columns is rejected.
        # Only the single raising call is placed in the context: statements
        # after the first exception would never be executed.
        with pytest.raises(
                KeyError, match="Manifest has no data to build map format!"
        ):
            make_map(pd.DataFrame())

    def test_map_empty(self) -> None:
        assert make_map(
            pd.DataFrame(columns=['Chr', 'Name', 'MapInfo', 'morgans'])
        ).empty
|