snplib 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- finalreport/__init__.py +7 -0
- finalreport/_finalreport.py +251 -0
- finalreport/tests/__init__.py +7 -0
- finalreport/tests/test_finalreport.py +215 -0
- format/__init__.py +19 -0
- format/__settings.py +7 -0
- format/_plink.py +305 -0
- format/_snp.py +113 -0
- format/tests/__init__.py +7 -0
- format/tests/test_plink_fam.py +121 -0
- format/tests/test_plink_lgen.py +106 -0
- format/tests/test_plink_map.py +42 -0
- format/tests/test_plink_ped.py +136 -0
- format/tests/test_snp.py +128 -0
- parentage/__init__.py +15 -0
- parentage/_discov.py +102 -0
- parentage/_isagmark.py +15 -0
- parentage/_verif.py +91 -0
- parentage/tests/__init__.py +7 -0
- parentage/tests/test_discov.py +164 -0
- parentage/tests/test_verif.py +160 -0
- snplib-1.0.0.dist-info/LICENSE +674 -0
- snplib-1.0.0.dist-info/METADATA +89 -0
- snplib-1.0.0.dist-info/RECORD +36 -0
- snplib-1.0.0.dist-info/WHEEL +5 -0
- snplib-1.0.0.dist-info/top_level.txt +4 -0
- statistics/__init__.py +16 -0
- statistics/_callrate.py +59 -0
- statistics/_freq.py +67 -0
- statistics/_snphwe.py +132 -0
- statistics/tests/__init__.py +7 -0
- statistics/tests/test_callrate.py +171 -0
- statistics/tests/test_freq_allele.py +87 -0
- statistics/tests/test_freq_maf.py +17 -0
- statistics/tests/test_hwe_t.py +41 -0
- statistics/tests/test_snphwe.py +41 -0
format/_plink.py
ADDED
@@ -0,0 +1,305 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
__all__ = [
|
6
|
+
"make_map", "make_ped", "make_fam", "make_lgen"
|
7
|
+
]
|
8
|
+
|
9
|
+
import re
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
|
13
|
+
def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
    """ PLINK text fileset variant information file
    https://www.cog-genomics.org/plink/1.9/formats#map

    A text file with no header line, and one line per variant with the
    following 3-4 fields:

    1. Chromosome code. PLINK 1.9 also permits contig names here, but most
        older programs do not.
    2. Variant identifier
    3. Position in morgans or centimorgans (optional; also safe to use
        dummy value of '0')
    4. Base-pair coordinate

    All lines must have the same number of columns (so either no lines
    contain the morgans/centimorgans column, or all of them do).

    :param manifest: The file that is taken on the Illumina website with full
        information about the chip
        https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html
    :return: Data in .map layout with the columns 'Chr', 'Name', 'morgans',
        'MapInfo', sorted by 'Name'; rows holding a missing value are
        dropped
    :raises KeyError: If any of the columns 'Chr', 'Name' or 'MapInfo' is
        absent from the manifest
    """

    fields = ['Chr', 'Name', 'MapInfo']

    # Every field is required, so a single missing one is already an error.
    if any(item not in manifest.columns for item in fields):
        raise KeyError("Manifest has no data to build map format!")

    # Rearrange the columns and replace the names of the sex and mitochondrial
    # chromosomes. The recoding is limited to 'Chr' so that a variant that
    # happens to be named 'X', 'Y' or 'MT' keeps its identifier.
    permute_cols = manifest[fields].\
        sort_values(by='Name').\
        replace({'Chr': {'X': 30, 'Y': 31, 'MT': 33}}).\
        dropna(axis=0)

    # Insert the genetic distance column. A scalar is broadcast over the rows
    # that survived dropna(); a list sized from the manifest no longer fits
    # once rows have been dropped.
    permute_cols.insert(2, 'morgans', 0)

    return permute_cols
|
57
|
+
|
58
|
+
|
59
|
+
def make_ped(
        data: pd.DataFrame,
        sid_col: str,
        snp_col: str,
        fid_col: str = None,
        father_col: str = None,
        mother_col: str = None,
        sex_col: str = None,
) -> pd.DataFrame | None:
    """ Original standard text format for sample pedigree information and
    genotype calls. Normally must be accompanied by a .map file.
    https://www.cog-genomics.org/plink/1.9/formats#ped

    The PED file has 6 fixed columns at the beginning followed by the SNP
    information. The columns should be separated by a whitespace or a tab. The
    first six columns hold the following information:

    1. Family ID (if unknown use the same id as for the sample id in
        column two)
    2. Sample ID
    3. Paternal ID (if unknown use 0)
    4. Maternal ID (if unknown use 0)
    5. Sex (1=male; 2=female; 0=unknown)
    6. Affection (0=unknown; 1=unaffected; 2=affected)
    7. Genotypes (space or tab separated, 2 for each marker. 0/-9=missing)

    Here is a brief example of a genotype PED file containing 5 samples
    with 10 homozygous SNPs:
    4304 4304 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
    6925 6925 0 0 0 0 C C C C T T G G A A C C G G C C T T T T
    7319 7319 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
    6963 6963 0 0 0 0 A A C C T T G G A A C C G G C C T T T T
    6968 6968 0 0 0 0 C C C C G G G G G G G G G G C C T T T T

    :param data: Snp data that contain full or partial information on the
        animal
    :param sid_col: Sample ID. Column name in data
    :param snp_col: Snp column name in data
    :param fid_col: Family ID column name in data (if omitted, the sample
        id is used as the family id)
    :param father_col: Paternal ID column name in data (if omitted, 0 is
        used)
    :param mother_col: Maternal ID column name in data (if omitted, 0 is
        used)
    :param sex_col: Sex column name in data (if omitted, 0 is used)
    :return: Returns an array of data in ped format to work with the plink
        program. Every column is converted to str
    :raises KeyError: If sid_col, snp_col or a given fid_col is not a column
        of data
    :raises Exception: If a sample ID or family ID contains an underscore
    """

    _fields = ["fid", "sid", "father", "mother", "sex", "not_used", "snp"]
    _f_dtype = dict.fromkeys(_fields, str)

    _ped = pd.DataFrame(columns=_fields)

    if sid_col not in data.columns or snp_col not in data.columns:
        raise KeyError("Data has not in name columns!")

    # Checked Sample ID on underscore - '_'
    _ped["sid"] = data[sid_col].astype(str)
    if _ped["sid"].apply(_check_underscore).any():
        raise Exception(
            "Replace in 'Sample ID' columns '_' on another a simbols"
        )

    # Checked Family ID on underscore - '_'
    if fid_col is not None:
        if fid_col not in data.columns:
            raise KeyError(f"Data has not in name columns {fid_col}!")

        # The check runs on the text form of the IDs so that it works for
        # any column dtype: `dtype.hasobject` exists only on NumPy dtypes
        # and mixed object columns may hold non-string values.
        if data[fid_col].astype(str).apply(_check_underscore).any():
            raise Exception(
                "Replace in 'Family ID' columns '_' on another a simbols"
            )

        _ped["fid"] = data[fid_col]

    else:
        _ped["fid"] = data[sid_col].astype(str)

    _ped["father"] = data[father_col] if father_col is not None else 0
    _ped["mother"] = data[mother_col] if mother_col is not None else 0
    _ped["sex"] = data[sex_col] if sex_col is not None else 0
    _ped["not_used"] = 0
    _ped["snp"] = data[snp_col]

    return _ped[_fields].astype(_f_dtype)
|
144
|
+
|
145
|
+
|
146
|
+
def make_fam(
        data: pd.DataFrame,
        sid_col: str,
        fid_col: str = None,
        father_col: str = None,
        mother_col: str = None,
        sex_col: str = None,
        sex_val: int = 0,
        pheno_col: str = None,
        pheno_val: int = -9

) -> pd.DataFrame | None:
    """ PLINK sample information file
    https://www.cog-genomics.org/plink/1.9/formats#fam

    A text file with no header line, and one line per sample with the
    following six fields:

    1. Family ID ('FID')
    2. Within-family ID ('IID'; cannot be '0')
    3. Within-family ID of father ('0' if father isn't in dataset)
    4. Within-family ID of mother ('0' if mother isn't in dataset)
    5. Sex code ('1' = male, '2' = female, '0' = unknown)
    6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric =
        missing data if case/control)

    :param data: Snp data that contain full or partial information on the
        animal
    :param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain
        underline - "_"
    :param fid_col: Family ID column name in data; when omitted the family
        ID is "1" for every sample. Must not contain underline - "_"
    :param father_col: Within-family ID of father ('0' if father isn't in
        dataset)
    :param mother_col: Within-family ID of mother ('0' if mother isn't in
        dataset)
    :param sex_col: Sex column name in data
    :param sex_val: Sex code used for every sample when sex_col is omitted
        ('1' = male, '2' = female, '0' = unknown)
    :param pheno_col: Pheno column name in data
    :param pheno_val: Phenotype value used for every sample when pheno_col
        is omitted ('1' = control, '2' = case,
        '-9'/'0'/non-numeric = missing data if case/control)
    :return: Return data in .fam format. Every column is converted to str
    :raises KeyError: If sid_col or a given fid_col is not a column of data
    :raises Exception: If a sample ID or family ID contains an underscore
    """

    _fields = ['fid', 'sid', 'father', 'mother', 'sex', 'pheno']
    _f_dtype = dict.fromkeys(_fields, str)

    _fam = pd.DataFrame(columns=_fields)

    if sid_col not in data.columns:
        raise KeyError(f"Data has not in name columns {sid_col}!")

    # Checked Sample ID on underscore - '_'
    _fam["sid"] = data[sid_col].astype(str)
    if _fam["sid"].apply(_check_underscore).any():
        raise Exception(
            "Replace in 'Sample ID' columns '_' on another a simbols"
        )

    # Checked Family ID on underscore - '_'
    if fid_col is not None:
        if fid_col not in data.columns:
            raise KeyError(f"Data has not in name columns {fid_col}!")

        # The check runs on the text form of the IDs so that it works for
        # any column dtype: `dtype.hasobject` exists only on NumPy dtypes
        # and mixed object columns may hold non-string values.
        if data[fid_col].astype(str).apply(_check_underscore).any():
            raise Exception(
                "Replace in 'Family ID' columns '_' on another a simbols"
            )

        _fam["fid"] = data[fid_col]

    else:
        _fam["fid"] = 1

    _fam["father"] = data[father_col] if father_col is not None else 0
    _fam["mother"] = data[mother_col] if mother_col is not None else 0
    _fam["sex"] = data[sex_col] if sex_col is not None else sex_val
    _fam['pheno'] = data[pheno_col] if pheno_col is not None else pheno_val

    return _fam[_fields].astype(_f_dtype)
|
227
|
+
|
228
|
+
|
229
|
+
def make_lgen(
        data: pd.DataFrame,
        sid_col: str,
        snp_name: str,
        alleles: list[str],
        fid_col: str = None
) -> pd.DataFrame | None:
    """ PLINK long-format genotype file
    https://www.cog-genomics.org/plink/1.9/formats#lgen

    A text file with no header line, and one line per genotype call (or
    just not-homozygous-major calls if 'lgen-ref' was invoked) usually with
    the following five fields:

    1. Family ID
    2. Within-family ID
    3. Variant identifier
    4. Allele call 1 ('0' for missing)
    5. Allele call 2

    There are several variations which are also handled by PLINK; see the
    original discussion for details.

    :param data: Data the after parsing FinalReport.txt
    :param sid_col: Sample ID column name in data. Values must not contain
        underline - "_"
    :param snp_name: Variant identifier (SNP name) column name in data
    :param alleles: Names of the two allele call columns in data, first
        allele first. The value '-' in them is written as 0
    :param fid_col: Family ID column name in data; when omitted the family
        ID is "1" for every row. Values must not contain underline - "_"
    :return: Return data in .lgen format. Every column is converted to str
    :raises KeyError: If sid_col, snp_name, one of alleles or a given
        fid_col is not a column of data
    :raises Exception: If a sample ID or family ID contains an underscore
    """
    _fields = ['fid', 'sid', 'snp_name', 'allele1', 'allele2']
    _f_dtype = dict.fromkeys(_fields, str)

    _lgen = pd.DataFrame(columns=_fields)

    if sid_col not in data.columns:
        raise KeyError(f"Data has not in name columns {sid_col}!")

    # Checked Sample ID on underscore - '_'
    _lgen["sid"] = data[sid_col].astype(str)
    if _lgen["sid"].apply(_check_underscore).any():
        raise Exception(
            "Replace in 'Sample ID' columns '_' on another a simbols"
        )

    # Checked Family ID on underscore - '_'
    if fid_col is not None:
        if fid_col not in data.columns:
            raise KeyError(f"Data has not in name columns {fid_col}!")

        # The check runs on the text form of the IDs so that it works for
        # any column dtype: `dtype.hasobject` exists only on NumPy dtypes
        # and mixed object columns may hold non-string values.
        if data[fid_col].astype(str).apply(_check_underscore).any():
            raise Exception(
                "Replace in 'Family ID' columns '_' on another a simbols"
            )

        _lgen["fid"] = data[fid_col]

    else:
        _lgen["fid"] = 1

    if snp_name not in data.columns:
        raise KeyError(f"Data has not in name columns {snp_name}!")

    _lgen["snp_name"] = data[snp_name]

    missing = [name for name in alleles if name not in data.columns]
    if missing:
        raise KeyError(f"Data has not in name columns {missing}!")

    # A missing call is marked '-' in the source data and 0 in .lgen
    _lgen[["allele1", "allele2"]] = data[alleles].replace({'-': 0})

    return _lgen[_fields].astype(_f_dtype)
|
292
|
+
|
293
|
+
|
294
|
+
def _check_underscore(value: str) -> bool:
    """ Checking for underscore in a string

    :param value: String for checked
    :return: Return True if there is an underline in the string, False if not
    """
    # A plain containment test finds a fixed character without building a
    # regular expression on every call.
    return "_" in value
|
format/_snp.py
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
# !/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
5
|
+
|
6
|
+
from pathlib import Path
|
7
|
+
from .__settings import FIELDS_ILLUMIN, MAP_FIELDS
|
8
|
+
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
class Snp(object):
|
13
|
+
""" The process of converting genomic map data - FinalReport.txt obtained
|
14
|
+
from Illumin. Recoding allele data into quantitative data, saving in the
|
15
|
+
format necessary for calculating gblup on blupf90.
|
16
|
+
|
17
|
+
:argument fmt: Data format to use snp in plink and blupf90. Default
|
18
|
+
value "uga". """
|
19
|
+
|
20
|
+
_ALLELE_CODE = {
|
21
|
+
'AA': 0, 'AB': 1, 'BA': 1, 'BB': 2, '--': 5
|
22
|
+
}
|
23
|
+
|
24
|
+
_FIELDS = ['SNP_NAME', 'SAMPLE_ID', 'SNP']
|
25
|
+
_F_DTYPE = dict(zip(_FIELDS, (str for _ in range(len(_FIELDS)))))
|
26
|
+
|
27
|
+
def __init__(self, fmt: str | None = "uga") -> None:
|
28
|
+
self._format_data = fmt
|
29
|
+
self.__data_snp = None
|
30
|
+
|
31
|
+
@property
|
32
|
+
def data(self) -> pd.DataFrame | None:
|
33
|
+
return self.__data_snp
|
34
|
+
|
35
|
+
def process(self, data: pd.DataFrame) -> None:
|
36
|
+
""" Data processing and formatting. Calculation of statistical
|
37
|
+
information
|
38
|
+
|
39
|
+
:param data: Data from FinalReport file. Example:
|
40
|
+
SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
|
41
|
+
ABCA12 14814 A A 0.4048 0.8164
|
42
|
+
ARS-BFGL-BAC-13031 14814 B B 0.9083 0.8712
|
43
|
+
ARS-BFGL-BAC-13039 14814 A A 0.9005 0.9096
|
44
|
+
ARS-BFGL-BAC-13049 14814 A B 0.9295 0.8926
|
45
|
+
|
46
|
+
:return: Returns true if the data was formatted successfully and
|
47
|
+
statistical information was calculated, false if an error.
|
48
|
+
"""
|
49
|
+
|
50
|
+
if not all(list(map(lambda x: x in data.columns, FIELDS_ILLUMIN))):
|
51
|
+
raise KeyError(
|
52
|
+
'The name of the fields does not match the finalreport.txt '
|
53
|
+
'file from Illumina'
|
54
|
+
)
|
55
|
+
|
56
|
+
self.__data_snp = data.rename(columns=MAP_FIELDS)
|
57
|
+
self.__data_snp['SNP'] = \
|
58
|
+
self.__data_snp[['ALLELE1', 'ALLELE2']].\
|
59
|
+
sum(axis=1).\
|
60
|
+
map(Snp._ALLELE_CODE)
|
61
|
+
|
62
|
+
self.__data_snp = self.__data_snp[Snp._FIELDS].astype(Snp._F_DTYPE)
|
63
|
+
|
64
|
+
if self._format_data is not None and self._format_data == "uga":
|
65
|
+
self.__data_snp = self._format_uga(
|
66
|
+
self.__data_snp[['SAMPLE_ID', 'SNP']]
|
67
|
+
)
|
68
|
+
|
69
|
+
@staticmethod
|
70
|
+
def _format_uga(data: pd.DataFrame) -> pd.DataFrame:
|
71
|
+
""" Data format to use snp in plink and blupf90. """
|
72
|
+
|
73
|
+
return data.groupby(by='SAMPLE_ID').sum().reset_index()
|
74
|
+
|
75
|
+
def to_file(self, file_path: str | Path) -> None:
|
76
|
+
""" Saving data to a file.
|
77
|
+
|
78
|
+
:param file_path: Path to file
|
79
|
+
"""
|
80
|
+
|
81
|
+
if isinstance(file_path, str):
|
82
|
+
file_path = Path(file_path)
|
83
|
+
|
84
|
+
if self._format_data is not None and self._format_data == "uga":
|
85
|
+
|
86
|
+
max_len = self.__data_snp["SAMPLE_ID"].str.len().max()
|
87
|
+
|
88
|
+
self.__data_snp.\
|
89
|
+
apply(
|
90
|
+
lambda x: " ".join([
|
91
|
+
self._add_space(x.iloc[0], max_len), x.iloc[1]
|
92
|
+
]),
|
93
|
+
axis=1
|
94
|
+
).\
|
95
|
+
to_csv(file_path, index=False, header=False)
|
96
|
+
|
97
|
+
self.__data_snp["SAMPLE_ID"] = \
|
98
|
+
self.__data_snp["SAMPLE_ID"].str.strip()
|
99
|
+
|
100
|
+
return None
|
101
|
+
|
102
|
+
self.__data_snp.to_csv(file_path, sep=" ", index=False)
|
103
|
+
|
104
|
+
@staticmethod
|
105
|
+
def _add_space(value: str, max_len: int) -> str:
|
106
|
+
""" Adding spaces up to the maximum length of the value in the
|
107
|
+
sample_id data.
|
108
|
+
|
109
|
+
:param value: Sample_id value
|
110
|
+
:param max_len: Max len sample_id value
|
111
|
+
:return: Return replacing value
|
112
|
+
"""
|
113
|
+
return "".join([value, " " * (max_len - len(value))])
|
format/tests/__init__.py
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_FILES
|
6
|
+
from .. import make_fam
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data_fam(request) -> pd.DataFrame | None:
    """ Load the pickled frame named by the (indirect) test parameter from
    the fplink/fam directory under DIR_FILES. """
    pickle_path = DIR_FILES / f"fplink/fam/{request.param}"
    return pd.read_pickle(pickle_path)
|
15
|
+
|
16
|
+
|
17
|
+
class TestPlinkFormatPed(object):
    """ Tests for building PLINK .fam data with make_fam. """

    @pytest.mark.parametrize("data_fam", ["file.pl"], indirect=True)
    def test_fam_true(self, data_fam: pd.DataFrame) -> None:
        """ Populated data gives a non-empty result, both with the default
        family ID and with the family ID read from a column. """
        # Default family ID (fid_col omitted)
        assert not make_fam(
            data_fam,
            "SAMPLE_ID",
        ).empty

        # Family ID taken from a column of the data
        assert not make_fam(
            data_fam,
            "SAMPLE_ID",
            "SAMPLE_ID"
        ).empty

    def test_fam_empty(self) -> None:
        """ Data without rows gives an empty result. """
        assert make_fam(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
        ).empty

        assert make_fam(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
            "SAMPLE_ID",
        ).empty

    @pytest.mark.parametrize("data_fam", ["file.pl"], indirect=True)
    def test_fam_raise_columns(self, data_fam: pd.DataFrame) -> None:
        """ An unknown sample or family column name raises KeyError. """
        # SID_COL
        with pytest.raises(
                KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID1",
                "SAMPLE_ID",
            )

        # FID_COL
        with pytest.raises(
                KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "SAMPLE_ID1"
            )

    @pytest.mark.parametrize("data_fam", ["file2.pl"], indirect=True)
    def test_fam_raises_underscope_sid(self, data_fam: pd.DataFrame) -> None:
        """ An underscore in the sample IDs is rejected. """
        # SID_COL
        with pytest.raises(
                Exception,
                match="Replace in 'Sample ID' columns '_' on another a simbols"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "SAMPLE_ID"
            )

    @pytest.mark.parametrize("data_fam", ["file3.pl"], indirect=True)
    def test_fam_raises_underscope_fid(self, data_fam: pd.DataFrame) -> None:
        """ An underscore in the family IDs is rejected. """
        # FID_COL
        with pytest.raises(
                Exception,
                match="Replace in 'Family ID' columns '_' on another a simbols"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "FAMILY_ID"
            )

    @pytest.mark.parametrize("data_fam", ["file4.pl"], indirect=True)
    def test_fam_check_data(self, data_fam: pd.DataFrame) -> None:
        """ Optional columns are copied as text when named, and replaced by
        the documented defaults when omitted. """
        res = make_fam(
            data_fam,
            "SAMPLE_ID",
            "FAMILY_ID",
            father_col="father",
            mother_col="mother",
            sex_col="sex",
            pheno_col="pheno"
        )

        res2 = make_fam(
            data_fam,
            "SAMPLE_ID",
            "FAMILY_ID",
        )

        assert all(res.father.values == list('1234'))
        assert all(res.mother.values == list('5678'))
        assert all(res.sex.values == list('1210'))
        assert all(res.pheno.values == ['12', '13', '14', '15'])

        assert all(res2.father.values == list('0000'))
        assert all(res2.mother.values == list('0000'))
        assert all(res2.sex.values == list('0000'))
        assert all(res2.pheno.values == ['-9', '-9', '-9', '-9'])
|
@@ -0,0 +1,106 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_FILES
|
6
|
+
from .. import make_lgen
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data_lgen(request) -> pd.DataFrame:
    """ Load the pickled frame named by the (indirect) test parameter from
    the fplink/lgen directory under DIR_FILES. """
    pickle_path = DIR_FILES / f"fplink/lgen/{request.param}"
    return pd.read_pickle(pickle_path)
|
15
|
+
|
16
|
+
|
17
|
+
class TestPlinkFormatLgen(object):
    """ Tests for building PLINK .lgen data with make_lgen. """

    @pytest.mark.parametrize("data_lgen", ["file.pl"], indirect=True)
    def test_lgen_true(self, data_lgen: pd.DataFrame) -> None:
        """ Populated data gives a non-empty result. """
        allele_cols = ["Allele1 - AB", "Allele2 - AB"]
        lgen = make_lgen(data_lgen, "Sample ID", "SNP Name", allele_cols)

        assert not lgen.empty

    def test_lgen_empty(self) -> None:
        """ Data without rows gives an empty result. """
        no_rows = pd.DataFrame(columns=[
            "Sample ID", "SNP Name", "Allele1 - AB", "Allele2 - AB"
        ])
        allele_cols = ["Allele1 - AB", "Allele2 - AB"]
        lgen = make_lgen(no_rows, "Sample ID", "SNP Name", allele_cols)

        assert lgen.empty

    @pytest.mark.parametrize("data_lgen", ["file.pl"], indirect=True)
    def test_lgen_raise_columns(self, data_lgen: pd.DataFrame) -> None:
        """ Underscores in the IDs and unknown column names are rejected. """
        sid_message = "Replace in 'Sample ID' columns '_' on another a simbols"
        fid_message = "Replace in 'Family ID' columns '_' on another a simbols"

        # Underscore in the sample IDs
        bad_sid = data_lgen.copy(deep=True)
        bad_sid["Sample ID"] = bad_sid["Sample ID"] + "_"
        with pytest.raises(Exception, match=sid_message):
            make_lgen(
                bad_sid,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"]
            )

        # Underscore in the family IDs
        bad_fid = data_lgen.copy(deep=True)
        bad_fid["Family ID"] = bad_fid["Sample ID"] + "_"
        with pytest.raises(Exception, match=fid_message):
            make_lgen(
                bad_fid,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # SID
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID1",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # FID_COL
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # SNP name
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name1",
                ["Allele1 - AB", "Allele2 - AB"]
            )

        # Alleles
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB1", "Allele2 - AB1"]
            )
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
|
5
|
+
from . import DIR_FILES
|
6
|
+
from .. import make_map
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def data_map() -> pd.DataFrame:
    """ Read the manifest CSV fplink/map/file_bovinesnp50.csv located under
    DIR_FILES. """
    manifest_path = DIR_FILES / "fplink/map/file_bovinesnp50.csv"
    return pd.read_csv(manifest_path)
|
15
|
+
|
16
|
+
|
17
|
+
class TestPlinkFormatMap(object):
    """ Tests for building PLINK .map data with make_map. """

    def test_map_true(self, data_map) -> None:
        """ A manifest with data gives a non-empty result. """

        res = make_map(data_map)
        assert not res.empty

    def test_map_raise(self, data_map) -> None:
        """ A manifest without the required columns raises KeyError, while
        a manifest that has them does not. """
        # Must not raise. It is called outside pytest.raises because the
        # context manager is left at the first exception, so anything placed
        # after a raising call inside it would never run.
        make_map(data_map)

        with pytest.raises(
                KeyError, match="Manifest has no data to build map format!"
        ):
            make_map(pd.DataFrame())

    def test_map_empty(self) -> None:
        """ A manifest that has the columns but no rows gives an empty
        result. """
        assert make_map(
            pd.DataFrame(columns=['Chr', 'Name', 'MapInfo', 'morgans'])
        ).empty
|