snplib 1.0.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
format/_plink.py ADDED
@@ -0,0 +1,305 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ __all__ = [
6
+ "make_map", "make_ped", "make_fam", "make_lgen"
7
+ ]
8
+
9
+ import re
10
+ import pandas as pd
11
+
12
+
13
def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
    """ PLINK text fileset variant information file
    https://www.cog-genomics.org/plink/1.9/formats#map

    A text file with no header line, and one line per variant with the
    following 3-4 fields:

    1. Chromosome code. PLINK 1.9 also permits contig names here, but most
        older programs do not.
    2. Variant identifier
    3. Position in morgans or centimorgans (optional; also safe to use
        dummy value of '0')
    4. Base-pair coordinate

    All lines must have the same number of columns (so either no lines
    contain the morgans/centimorgans column, or all of them do).

    :param manifest: The file that is taken on the Illumina website with full
        information about the chip
        https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html

    :return: Data in .map layout with the columns ``Chr``, ``Name``,
        ``morgans`` and ``MapInfo``, sorted by ``Name``; rows that contain a
        missing value are dropped
    :raises KeyError: If any of the columns ``Chr``, ``Name`` or ``MapInfo``
        is absent from the manifest
    """

    fields = ['Chr', 'Name', 'MapInfo']

    # All three columns are required, so a single missing one is already an
    # error (the check must be "any", not "all").
    if any(item not in manifest.columns for item in fields):
        raise KeyError("Manifest has no data to build map format!")

    # Rearrange the columns and replace the names of the sex and mitochondrial
    # chromosomes with their numeric codes
    permute_cols = (
        manifest[fields]
        .sort_values(by='Name')
        .replace({'X': 30, 'Y': 31, 'MT': 33})
        .dropna(axis=0)
    )

    # Insert the dummy genetic distance. The length is taken from the frame
    # after dropna, otherwise insert() fails as soon as a row was dropped.
    permute_cols.insert(2, 'morgans', [0] * len(permute_cols))

    return permute_cols
57
+
58
+
59
+ def make_ped(
60
+ data: pd.DataFrame,
61
+ sid_col: str,
62
+ snp_col: str,
63
+ fid_col: str = None,
64
+ father_col: str = None,
65
+ mother_col: str = None,
66
+ sex_col: str = None,
67
+ ) -> pd.DataFrame | None:
68
+ """ Original standard text format for sample pedigree information and
69
+ genotype calls. Normally must be accompanied by a .map file.
70
+ https://www.cog-genomics.org/plink/1.9/formats#ped
71
+
72
+ The PED file has 6 fixed columns at the beginning followed by the SNP
73
+ information. The columns should be separated by a whitespace or a tab. The
74
+ first six columns hold the following information:
75
+
76
+ 1. Family ID (if unknown use the same id as for the sample id in
77
+ column two)
78
+ 2. Sample ID
79
+ 3. Paternal ID (if unknown use 0)
80
+ 4. Maternal ID (if unknown use 0)
81
+ 5. Sex (1=male; 2=female; 0=unknown)
82
+ 6. Affection (0=unknown; 1=unaffected; 2=affected)
83
+ 7. Genotypes (space or tab separated, 2 for each marker. 0/-9=missing)
84
+
85
+ Here is a brief example of a genotype PED file containing 5 samples
86
+ with 10 homozygous SNPs:
87
+ 4304 4304 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
88
+ 6925 6925 0 0 0 0 C C C C T T G G A A C C G G C C T T T T
89
+ 7319 7319 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
90
+ 6963 6963 0 0 0 0 A A C C T T G G A A C C G G C C T T T T
91
+ 6968 6968 0 0 0 0 C C C C G G G G G G G G G G C C T T T T
92
+
93
+ :param data: Snp data that contain full or partial information on the
94
+ animal
95
+ :param sid_col: Sample ID. Column name in data
96
+ :param snp_col: Snp column name in data
97
+ :param fid_col: Family ID column name in data (if unknown use the same
98
+ id as for the sample id in column two)
99
+ :param father_col: Paternal ID column name in data (if unknown use 0)
100
+ :param mother_col: Maternal ID column name in data (if unknown use 0)
101
+ :param sex_col: Sex column name in data (if unknown use 0)
102
+ :return: Returns an array of data in ped format to work with the plink
103
+ program
104
+ """
105
+
106
+ _fields = ["fid", "sid", "father", "mother", "sex", "not_used", "snp"]
107
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
108
+
109
+ _ped = pd.DataFrame(columns=_fields)
110
+
111
+ if sid_col not in data.columns or snp_col not in data.columns:
112
+ raise KeyError(f"Data has not in name columns!")
113
+
114
+ # Checked Sample ID on underscope - '_'
115
+ _ped["sid"] = data[sid_col].astype(str)
116
+ if _ped["sid"].apply(_check_underscore).any():
117
+ raise Exception(
118
+ "Replace in 'Sample ID' columns '_' on another a simbols"
119
+ )
120
+
121
+ # Checked Family ID on underscope - '_'
122
+ if fid_col is not None:
123
+ if fid_col not in data.columns:
124
+ raise KeyError(f"Data has not in name columns {fid_col}!")
125
+
126
+ if (data[fid_col].dtype.hasobject and
127
+ data[fid_col].apply(_check_underscore).any()):
128
+ raise Exception(
129
+ "Replace in 'Family ID' columns '_' on another a simbols"
130
+ )
131
+
132
+ _ped["fid"] = data[fid_col]
133
+
134
+ else:
135
+ _ped["fid"] = data[sid_col].astype(str)
136
+
137
+ _ped["father"] = data[father_col] if father_col is not None else 0
138
+ _ped["mother"] = data[mother_col] if mother_col is not None else 0
139
+ _ped["sex"] = data[sex_col] if sex_col is not None else 0
140
+ _ped["not_used"] = 0
141
+ _ped["snp"] = data[snp_col]
142
+
143
+ return _ped[_fields].astype(_f_dtype)
144
+
145
+
146
+ def make_fam(
147
+ data: pd.DataFrame,
148
+ sid_col: str,
149
+ fid_col: str = None,
150
+ father_col: str = None,
151
+ mother_col: str = None,
152
+ sex_col: str = None,
153
+ sex_val: int = 0,
154
+ pheno_col: str = None,
155
+ pheno_val: int = -9
156
+
157
+ ) -> pd.DataFrame | None:
158
+ """ PLINK sample information file
159
+ https://www.cog-genomics.org/plink/1.9/formats#fam
160
+
161
+ A text file with no header line, and one line per sample with the
162
+ following six fields:
163
+
164
+ 1. Family ID ('FID')
165
+ 2. Within-family ID ('IID'; cannot be '0')
166
+ 3. Within-family ID of father ('0' if father isn't in dataset)
167
+ 4. Within-family ID of mother ('0' if mother isn't in dataset)
168
+ 5. Sex code ('1' = male, '2' = female, '0' = unknown)
169
+ 6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric =
170
+ missing data if case/control)
171
+
172
+ :param data: Snp data that contain full or partial information on the
173
+ animal
174
+ :param fid_col: Family ID, default value "1". Must not contain
175
+ underline - "_"
176
+ :param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain
177
+ underline - "_"
178
+ :param father_col: Within-family ID of father ('0' if father isn't in
179
+ dataset)
180
+ :param mother_col: Within-family ID of mother ('0' if mother isn't in
181
+ dataset)
182
+ :param sex_col: Sex column name in data
183
+ :param sex_val: Sex code ('1' = male, '2' = female, '0' = unknown)
184
+ :param pheno_col: Pheno column name in data
185
+ :param pheno_val: Phenotype value ('1' = control, '2' = case,
186
+ '-9'/'0'/non-numeric = missing data if case/control)
187
+ :return: Return data in formate .fam
188
+ """
189
+
190
+ _fields = ['fid', 'sid', 'father', 'mother', 'sex', 'pheno']
191
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
192
+
193
+ _fam = pd.DataFrame(columns=_fields)
194
+
195
+ if sid_col not in data.columns:
196
+ raise KeyError(f"Data has not in name columns {sid_col}!")
197
+
198
+ # Checked Sample ID on underscope - '_'
199
+ _fam["sid"] = data[sid_col].astype(str)
200
+ if _fam["sid"].apply(_check_underscore).any():
201
+ raise Exception(
202
+ "Replace in 'Sample ID' columns '_' on another a simbols"
203
+ )
204
+
205
+ # Checked Family ID on underscope - '_'
206
+ if fid_col is not None:
207
+ if fid_col not in data.columns:
208
+ raise KeyError(f"Data has not in name columns {fid_col}!")
209
+
210
+ if (data[fid_col].dtype.hasobject and
211
+ data[fid_col].apply(_check_underscore).any()):
212
+ raise Exception(
213
+ "Replace in 'Family ID' columns '_' on another a simbols"
214
+ )
215
+
216
+ _fam["fid"] = data[fid_col]
217
+
218
+ else:
219
+ _fam["fid"] = 1
220
+
221
+ _fam["father"] = data[father_col] if father_col is not None else 0
222
+ _fam["mother"] = data[mother_col] if mother_col is not None else 0
223
+ _fam["sex"] = data[sex_col] if sex_col is not None else sex_val
224
+ _fam['pheno'] = data[pheno_col] if pheno_col is not None else pheno_val
225
+
226
+ return _fam[_fields].astype(_f_dtype)
227
+
228
+
229
+ def make_lgen(
230
+ data: pd.DataFrame,
231
+ sid_col: str,
232
+ snp_name: str,
233
+ alleles: list[str],
234
+ fid_col: str = None
235
+ ) -> pd.DataFrame | None:
236
+ """ PLINK long-format genotype file
237
+ https://www.cog-genomics.org/plink/1.9/formats#lgen
238
+
239
+ A text file with no header line, and one line per genotype call (or
240
+ just not-homozygous-major calls if 'lgen-ref' was invoked) usually with
241
+ the following five fields:
242
+
243
+ 1. Family ID
244
+ 2. Within-family ID
245
+ 3. Variant identifier
246
+ 4. Allele call 1 ('0' for missing)
247
+ 5. Allele call 2
248
+
249
+ There are several variations which are also handled by PLINK; see the
250
+ original discussion for details.
251
+
252
+ :param data: Data the after parsing FinalReport.txt
253
+ :param sid_col:
254
+ :param snp_name:
255
+ :param fid_col: Family ID, default value "1"
256
+ :param alleles:
257
+ :return: - Return data in formate .lgen
258
+ """
259
+ _fields = ['fid', 'sid', 'snp_name', 'allele1', 'allele2']
260
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
261
+
262
+ _lgen = pd.DataFrame(columns=_fields)
263
+
264
+ try:
265
+ # Checked Sample ID on underscope - '_'
266
+ _lgen["sid"] = data[sid_col].astype(str)
267
+ if _lgen["sid"].apply(_check_underscore).any():
268
+ raise Exception(
269
+ "Replace in 'Sample ID' columns '_' on another a simbols"
270
+ )
271
+
272
+ # Checked Family ID on underscope - '_'
273
+ if fid_col is not None:
274
+ if (data[fid_col].dtype.hasobject and
275
+ data[fid_col].apply(_check_underscore).any()):
276
+ raise Exception(
277
+ "Replace in 'Family ID' columns '_' on another a simbols"
278
+ )
279
+
280
+ _lgen["fid"] = data[fid_col]
281
+
282
+ else:
283
+ _lgen["fid"] = 1
284
+
285
+ _lgen["snp_name"] = data[snp_name]
286
+ _lgen[["allele1", "allele2"]] = data[alleles].replace({'-': 0})
287
+
288
+ except Exception as e:
289
+ raise e
290
+
291
+ return _lgen[_fields].astype(_f_dtype)
292
+
293
+
294
+ def _check_underscore(value: str) -> bool:
295
+ """ Checking for underscore in a string
296
+
297
+ :param value: String for checked
298
+ :return: Return True if there is an underline in the string, False if not
299
+ """
300
+ _under_l = re.compile(r"_")
301
+
302
+ if _under_l.findall(value):
303
+ return True
304
+
305
+ return False
format/_snp.py ADDED
@@ -0,0 +1,113 @@
1
+ # !/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
5
+
6
+ from pathlib import Path
7
+ from .__settings import FIELDS_ILLUMIN, MAP_FIELDS
8
+
9
+ import pandas as pd
10
+
11
+
12
+ class Snp(object):
13
+ """ The process of converting genomic map data - FinalReport.txt obtained
14
+ from Illumin. Recoding allele data into quantitative data, saving in the
15
+ format necessary for calculating gblup on blupf90.
16
+
17
+ :argument fmt: Data format to use snp in plink and blupf90. Default
18
+ value "uga". """
19
+
20
+ _ALLELE_CODE = {
21
+ 'AA': 0, 'AB': 1, 'BA': 1, 'BB': 2, '--': 5
22
+ }
23
+
24
+ _FIELDS = ['SNP_NAME', 'SAMPLE_ID', 'SNP']
25
+ _F_DTYPE = dict(zip(_FIELDS, (str for _ in range(len(_FIELDS)))))
26
+
27
+ def __init__(self, fmt: str | None = "uga") -> None:
28
+ self._format_data = fmt
29
+ self.__data_snp = None
30
+
31
+ @property
32
+ def data(self) -> pd.DataFrame | None:
33
+ return self.__data_snp
34
+
35
+ def process(self, data: pd.DataFrame) -> None:
36
+ """ Data processing and formatting. Calculation of statistical
37
+ information
38
+
39
+ :param data: Data from FinalReport file. Example:
40
+ SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
41
+ ABCA12 14814 A A 0.4048 0.8164
42
+ ARS-BFGL-BAC-13031 14814 B B 0.9083 0.8712
43
+ ARS-BFGL-BAC-13039 14814 A A 0.9005 0.9096
44
+ ARS-BFGL-BAC-13049 14814 A B 0.9295 0.8926
45
+
46
+ :return: Returns true if the data was formatted successfully and
47
+ statistical information was calculated, false if an error.
48
+ """
49
+
50
+ if not all(list(map(lambda x: x in data.columns, FIELDS_ILLUMIN))):
51
+ raise KeyError(
52
+ 'The name of the fields does not match the finalreport.txt '
53
+ 'file from Illumina'
54
+ )
55
+
56
+ self.__data_snp = data.rename(columns=MAP_FIELDS)
57
+ self.__data_snp['SNP'] = \
58
+ self.__data_snp[['ALLELE1', 'ALLELE2']].\
59
+ sum(axis=1).\
60
+ map(Snp._ALLELE_CODE)
61
+
62
+ self.__data_snp = self.__data_snp[Snp._FIELDS].astype(Snp._F_DTYPE)
63
+
64
+ if self._format_data is not None and self._format_data == "uga":
65
+ self.__data_snp = self._format_uga(
66
+ self.__data_snp[['SAMPLE_ID', 'SNP']]
67
+ )
68
+
69
+ @staticmethod
70
+ def _format_uga(data: pd.DataFrame) -> pd.DataFrame:
71
+ """ Data format to use snp in plink and blupf90. """
72
+
73
+ return data.groupby(by='SAMPLE_ID').sum().reset_index()
74
+
75
+ def to_file(self, file_path: str | Path) -> None:
76
+ """ Saving data to a file.
77
+
78
+ :param file_path: Path to file
79
+ """
80
+
81
+ if isinstance(file_path, str):
82
+ file_path = Path(file_path)
83
+
84
+ if self._format_data is not None and self._format_data == "uga":
85
+
86
+ max_len = self.__data_snp["SAMPLE_ID"].str.len().max()
87
+
88
+ self.__data_snp.\
89
+ apply(
90
+ lambda x: " ".join([
91
+ self._add_space(x.iloc[0], max_len), x.iloc[1]
92
+ ]),
93
+ axis=1
94
+ ).\
95
+ to_csv(file_path, index=False, header=False)
96
+
97
+ self.__data_snp["SAMPLE_ID"] = \
98
+ self.__data_snp["SAMPLE_ID"].str.strip()
99
+
100
+ return None
101
+
102
+ self.__data_snp.to_csv(file_path, sep=" ", index=False)
103
+
104
+ @staticmethod
105
+ def _add_space(value: str, max_len: int) -> str:
106
+ """ Adding spaces up to the maximum length of the value in the
107
+ sample_id data.
108
+
109
+ :param value: Sample_id value
110
+ :param max_len: Max len sample_id value
111
+ :return: Return replacing value
112
+ """
113
+ return "".join([value, " " * (max_len - len(value))])
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from pathlib import Path
6
+
7
# Directory named "files" that sits next to this module
DIR_FILES = Path(__file__).parent / "files"
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_FILES
6
+ from .. import make_fam
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data_fam(request) -> pd.DataFrame | None:
    """ Load the pickled object whose file name is passed as the indirect
    fixture parameter from the ``fplink/fam`` directory under DIR_FILES. """
    pickle_path = DIR_FILES / f"fplink/fam/{request.param}"
    return pd.read_pickle(pickle_path)
15
+
16
+
17
class TestPlinkFormatPed(object):
    """ Tests for ``make_fam`` (the class name is kept as it was, although
    every test here exercises the .fam builder). """

    @pytest.mark.parametrize("data_fam", ["file.pl"], indirect=True)
    def test_fam_true(self, data_fam: pd.DataFrame) -> None:
        """ A non-empty input gives a non-empty result, with an explicit
        family id column and with the default family id. """
        assert not make_fam(
            data_fam,
            "SAMPLE_ID",
            "SAMPLE_ID"
        ).empty

        # Default family id path (mirrors the pair of calls in
        # test_fam_empty); previously the call above was simply repeated.
        assert not make_fam(
            data_fam,
            "SAMPLE_ID",
        ).empty

    def test_fam_empty(self) -> None:
        """ An input without rows gives an empty result. """
        assert make_fam(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
        ).empty

        assert make_fam(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
            "SAMPLE_ID",
        ).empty

    @pytest.mark.parametrize("data_fam", ["file.pl"], indirect=True)
    def test_fam_raise_columns(self, data_fam: pd.DataFrame) -> None:
        """ An unknown sample id or family id column raises KeyError. """
        # SID_COL
        with pytest.raises(
                KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID1",
                "SAMPLE_ID",
            )

        # FID_COL
        with pytest.raises(
                KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "SAMPLE_ID1"
            )

    @pytest.mark.parametrize("data_fam", ["file2.pl"], indirect=True)
    def test_fam_raises_underscope_sid(self, data_fam: pd.DataFrame) -> None:
        """ An underscore in the sample id is rejected. """

        # SID_COL
        with pytest.raises(
                Exception,
                match="Replace in 'Sample ID' columns '_' on another a simbols"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "SAMPLE_ID"
            )

    @pytest.mark.parametrize("data_fam", ["file3.pl"], indirect=True)
    def test_fam_raises_underscope_fid(self, data_fam: pd.DataFrame) -> None:
        """ An underscore in the family id is rejected. """

        # FID_COL
        with pytest.raises(
                Exception,
                match="Replace in 'Family ID' columns '_' on another a simbols"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "FAMILY_ID"
            )

    @pytest.mark.parametrize("data_fam", ["file4.pl"], indirect=True)
    def test_fam_check_data(self, data_fam: pd.DataFrame) -> None:
        """ Optional columns are copied when named and replaced by the
        default values otherwise; everything is returned as str. """
        res = make_fam(
            data_fam,
            "SAMPLE_ID",
            "FAMILY_ID",
            father_col="father",
            mother_col="mother",
            sex_col="sex",
            pheno_col="pheno"
        )

        res2 = make_fam(
            data_fam,
            "SAMPLE_ID",
            "FAMILY_ID",
        )

        assert res.father.tolist() == list('1234')
        assert res.mother.tolist() == list('5678')
        assert res.sex.tolist() == list('1210')
        assert res.pheno.tolist() == ['12', '13', '14', '15']

        assert res2.father.tolist() == list('0000')
        assert res2.mother.tolist() == list('0000')
        assert res2.sex.tolist() == list('0000')
        assert res2.pheno.tolist() == ['-9', '-9', '-9', '-9']
@@ -0,0 +1,106 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_FILES
6
+ from .. import make_lgen
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data_lgen(request) -> pd.DataFrame:
    """ Load the pickled object whose file name is passed as the indirect
    fixture parameter from the ``fplink/lgen`` directory under DIR_FILES. """
    pickle_path = DIR_FILES / f"fplink/lgen/{request.param}"
    return pd.read_pickle(pickle_path)
15
+
16
+
17
class TestPlinkFormatLgen(object):
    """ Tests for ``make_lgen``. """

    @pytest.mark.parametrize("data_lgen", ["file.pl"], indirect=True)
    def test_lgen_true(self, data_lgen: pd.DataFrame) -> None:
        """ A non-empty input gives a non-empty result. """
        assert not make_lgen(
            data_lgen,
            "Sample ID",
            "SNP Name",
            ["Allele1 - AB", "Allele2 - AB"]
        ).empty

    def test_lgen_empty(self) -> None:
        """ An input without rows gives an empty result. """
        assert make_lgen(
            pd.DataFrame(columns=[
                "Sample ID", "SNP Name", "Allele1 - AB", "Allele2 - AB"
            ]),
            "Sample ID",
            "SNP Name",
            ["Allele1 - AB", "Allele2 - AB"]
        ).empty

    @pytest.mark.parametrize("data_lgen", ["file.pl"], indirect=True)
    def test_lgen_raise_columns(self, data_lgen: pd.DataFrame) -> None:
        """ Underscores in ids and unknown column names are rejected. """

        # The input is prepared outside of pytest.raises so that only the
        # make_lgen call itself can satisfy the expected exception.
        bad_sid = data_lgen.copy(deep=True)
        bad_sid["Sample ID"] = bad_sid["Sample ID"] + "_"

        with pytest.raises(
                Exception,
                match="Replace in 'Sample ID' columns '_' on another a simbols"
        ):
            make_lgen(
                bad_sid,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"]
            )

        bad_fid = data_lgen.copy(deep=True)
        bad_fid["Family ID"] = bad_fid["Sample ID"] + "_"

        with pytest.raises(
                Exception,
                match="Replace in 'Family ID' columns '_' on another a simbols"
        ):
            make_lgen(
                bad_fid,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # SID
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID1",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # FID_COL
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # SNP name
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name1",
                ["Allele1 - AB", "Allele2 - AB"]
            )

        # Alleles
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB1", "Allele2 - AB1"]
            )
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_FILES
6
+ from .. import make_map
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data_map() -> pd.DataFrame:
    """ Read the manifest csv ``fplink/map/file_bovinesnp50.csv`` located
    under DIR_FILES. """
    manifest_path = DIR_FILES / "fplink/map/file_bovinesnp50.csv"
    return pd.read_csv(manifest_path)
15
+
16
+
17
class TestPlinkFormatMap(object):
    """ Tests for ``make_map``. """

    def test_map_true(self, data_map) -> None:
        """ A manifest with data gives a non-empty result. """

        res = make_map(data_map)
        assert not res.empty

    def test_map_raise(self, data_map) -> None:
        """ Missing manifest columns raise KeyError; a valid manifest does
        not. """
        # A valid manifest must pass without raising. It is called outside
        # of pytest.raises so it cannot be mistaken for the failing call.
        make_map(data_map)

        # Each expectation wraps exactly one call: anything placed after a
        # raising statement inside the same block would never be executed.
        with pytest.raises(
                KeyError, match="Manifest has no data to build map format!"
        ):
            make_map(pd.DataFrame())

        # Only a part of the required columns is present
        with pytest.raises(KeyError):
            make_map(pd.DataFrame(columns=['Chr', 'Name']))

    def test_map_empty(self) -> None:
        """ A manifest with the required columns but no rows gives an empty
        result. """
        assert make_map(
            pd.DataFrame(columns=['Chr', 'Name', 'MapInfo', 'morgans'])
        ).empty