snplib 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
format/_plink.py ADDED
@@ -0,0 +1,305 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ __all__ = [
6
+ "make_map", "make_ped", "make_fam", "make_lgen"
7
+ ]
8
+
9
+ import re
10
+ import pandas as pd
11
+
12
+
13
def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
    """ PLINK text fileset variant information file
    https://www.cog-genomics.org/plink/1.9/formats#map

    A text file with no header line, and one line per variant with the
    following 3-4 fields:

    1. Chromosome code. PLINK 1.9 also permits contig names here, but most
        older programs do not.
    2. Variant identifier
    3. Position in morgans or centimorgans (optional; also safe to use
        dummy value of '0')
    4. Base-pair coordinate

    All lines must have the same number of columns (so either no lines
    contain the morgans/centimorgans column, or all of them do).

    :param manifest: The file that is taken on the Illumina website with full
        information about the chip
        https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html

    :return: Data frame with the columns 'Chr', 'Name', 'morgans',
        'MapInfo' (in that order), sorted by 'Name', with rows containing
        missing values removed
    :raises KeyError: If any of the columns 'Chr', 'Name', 'MapInfo' is
        absent from the manifest
    """

    fields = ['Chr', 'Name', 'MapInfo']

    # Every one of the three columns is needed below, so a single missing
    # column is already an error.
    if any(item not in manifest.columns for item in fields):
        raise KeyError("Manifest has no data to build map format!")

    # Rearrange the columns and replace the names of the sex and mitochondrial
    # chromosomes
    permute_cols = (
        manifest[fields]
        .sort_values(by='Name')
        .replace({'X': 30, 'Y': 31, 'MT': 33})
        .dropna(axis=0)
    )

    # Insert the genetic distance column with the dummy value 0. A scalar is
    # broadcast to the current number of rows, which may be smaller than
    # len(manifest) after dropna().
    permute_cols.insert(2, 'morgans', 0)

    return permute_cols
57
+
58
+
59
+ def make_ped(
60
+ data: pd.DataFrame,
61
+ sid_col: str,
62
+ snp_col: str,
63
+ fid_col: str = None,
64
+ father_col: str = None,
65
+ mother_col: str = None,
66
+ sex_col: str = None,
67
+ ) -> pd.DataFrame | None:
68
+ """ Original standard text format for sample pedigree information and
69
+ genotype calls. Normally must be accompanied by a .map file.
70
+ https://www.cog-genomics.org/plink/1.9/formats#ped
71
+
72
+ The PED file has 6 fixed columns at the beginning followed by the SNP
73
+ information. The columns should be separated by a whitespace or a tab. The
74
+ first six columns hold the following information:
75
+
76
+ 1. Family ID (if unknown use the same id as for the sample id in
77
+ column two)
78
+ 2. Sample ID
79
+ 3. Paternal ID (if unknown use 0)
80
+ 4. Maternal ID (if unknown use 0)
81
+ 5. Sex (1=male; 2=female; 0=unknown)
82
+ 6. Affection (0=unknown; 1=unaffected; 2=affected)
83
+ 7. Genotypes (space or tab separated, 2 for each marker. 0/-9=missing)
84
+
85
+ Here is a brief example of a genotype PED file containing 5 samples
86
+ with 10 homozygous SNPs:
87
+ 4304 4304 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
88
+ 6925 6925 0 0 0 0 C C C C T T G G A A C C G G C C T T T T
89
+ 7319 7319 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
90
+ 6963 6963 0 0 0 0 A A C C T T G G A A C C G G C C T T T T
91
+ 6968 6968 0 0 0 0 C C C C G G G G G G G G G G C C T T T T
92
+
93
+ :param data: Snp data that contain full or partial information on the
94
+ animal
95
+ :param sid_col: Sample ID. Column name in data
96
+ :param snp_col: Snp column name in data
97
+ :param fid_col: Family ID column name in data (if unknown use the same
98
+ id as for the sample id in column two)
99
+ :param father_col: Paternal ID column name in data (if unknown use 0)
100
+ :param mother_col: Maternal ID column name in data (if unknown use 0)
101
+ :param sex_col: Sex column name in data (if unknown use 0)
102
+ :return: Returns an array of data in ped format to work with the plink
103
+ program
104
+ """
105
+
106
+ _fields = ["fid", "sid", "father", "mother", "sex", "not_used", "snp"]
107
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
108
+
109
+ _ped = pd.DataFrame(columns=_fields)
110
+
111
+ if sid_col not in data.columns or snp_col not in data.columns:
112
+ raise KeyError(f"Data has not in name columns!")
113
+
114
+ # Checked Sample ID on underscope - '_'
115
+ _ped["sid"] = data[sid_col].astype(str)
116
+ if _ped["sid"].apply(_check_underscore).any():
117
+ raise Exception(
118
+ "Replace in 'Sample ID' columns '_' on another a simbols"
119
+ )
120
+
121
+ # Checked Family ID on underscope - '_'
122
+ if fid_col is not None:
123
+ if fid_col not in data.columns:
124
+ raise KeyError(f"Data has not in name columns {fid_col}!")
125
+
126
+ if (data[fid_col].dtype.hasobject and
127
+ data[fid_col].apply(_check_underscore).any()):
128
+ raise Exception(
129
+ "Replace in 'Family ID' columns '_' on another a simbols"
130
+ )
131
+
132
+ _ped["fid"] = data[fid_col]
133
+
134
+ else:
135
+ _ped["fid"] = data[sid_col].astype(str)
136
+
137
+ _ped["father"] = data[father_col] if father_col is not None else 0
138
+ _ped["mother"] = data[mother_col] if mother_col is not None else 0
139
+ _ped["sex"] = data[sex_col] if sex_col is not None else 0
140
+ _ped["not_used"] = 0
141
+ _ped["snp"] = data[snp_col]
142
+
143
+ return _ped[_fields].astype(_f_dtype)
144
+
145
+
146
+ def make_fam(
147
+ data: pd.DataFrame,
148
+ sid_col: str,
149
+ fid_col: str = None,
150
+ father_col: str = None,
151
+ mother_col: str = None,
152
+ sex_col: str = None,
153
+ sex_val: int = 0,
154
+ pheno_col: str = None,
155
+ pheno_val: int = -9
156
+
157
+ ) -> pd.DataFrame | None:
158
+ """ PLINK sample information file
159
+ https://www.cog-genomics.org/plink/1.9/formats#fam
160
+
161
+ A text file with no header line, and one line per sample with the
162
+ following six fields:
163
+
164
+ 1. Family ID ('FID')
165
+ 2. Within-family ID ('IID'; cannot be '0')
166
+ 3. Within-family ID of father ('0' if father isn't in dataset)
167
+ 4. Within-family ID of mother ('0' if mother isn't in dataset)
168
+ 5. Sex code ('1' = male, '2' = female, '0' = unknown)
169
+ 6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric =
170
+ missing data if case/control)
171
+
172
+ :param data: Snp data that contain full or partial information on the
173
+ animal
174
+ :param fid_col: Family ID, default value "1". Must not contain
175
+ underline - "_"
176
+ :param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain
177
+ underline - "_"
178
+ :param father_col: Within-family ID of father ('0' if father isn't in
179
+ dataset)
180
+ :param mother_col: Within-family ID of mother ('0' if mother isn't in
181
+ dataset)
182
+ :param sex_col: Sex column name in data
183
+ :param sex_val: Sex code ('1' = male, '2' = female, '0' = unknown)
184
+ :param pheno_col: Pheno column name in data
185
+ :param pheno_val: Phenotype value ('1' = control, '2' = case,
186
+ '-9'/'0'/non-numeric = missing data if case/control)
187
+ :return: Return data in formate .fam
188
+ """
189
+
190
+ _fields = ['fid', 'sid', 'father', 'mother', 'sex', 'pheno']
191
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
192
+
193
+ _fam = pd.DataFrame(columns=_fields)
194
+
195
+ if sid_col not in data.columns:
196
+ raise KeyError(f"Data has not in name columns {sid_col}!")
197
+
198
+ # Checked Sample ID on underscope - '_'
199
+ _fam["sid"] = data[sid_col].astype(str)
200
+ if _fam["sid"].apply(_check_underscore).any():
201
+ raise Exception(
202
+ "Replace in 'Sample ID' columns '_' on another a simbols"
203
+ )
204
+
205
+ # Checked Family ID on underscope - '_'
206
+ if fid_col is not None:
207
+ if fid_col not in data.columns:
208
+ raise KeyError(f"Data has not in name columns {fid_col}!")
209
+
210
+ if (data[fid_col].dtype.hasobject and
211
+ data[fid_col].apply(_check_underscore).any()):
212
+ raise Exception(
213
+ "Replace in 'Family ID' columns '_' on another a simbols"
214
+ )
215
+
216
+ _fam["fid"] = data[fid_col]
217
+
218
+ else:
219
+ _fam["fid"] = 1
220
+
221
+ _fam["father"] = data[father_col] if father_col is not None else 0
222
+ _fam["mother"] = data[mother_col] if mother_col is not None else 0
223
+ _fam["sex"] = data[sex_col] if sex_col is not None else sex_val
224
+ _fam['pheno'] = data[pheno_col] if pheno_col is not None else pheno_val
225
+
226
+ return _fam[_fields].astype(_f_dtype)
227
+
228
+
229
+ def make_lgen(
230
+ data: pd.DataFrame,
231
+ sid_col: str,
232
+ snp_name: str,
233
+ alleles: list[str],
234
+ fid_col: str = None
235
+ ) -> pd.DataFrame | None:
236
+ """ PLINK long-format genotype file
237
+ https://www.cog-genomics.org/plink/1.9/formats#lgen
238
+
239
+ A text file with no header line, and one line per genotype call (or
240
+ just not-homozygous-major calls if 'lgen-ref' was invoked) usually with
241
+ the following five fields:
242
+
243
+ 1. Family ID
244
+ 2. Within-family ID
245
+ 3. Variant identifier
246
+ 4. Allele call 1 ('0' for missing)
247
+ 5. Allele call 2
248
+
249
+ There are several variations which are also handled by PLINK; see the
250
+ original discussion for details.
251
+
252
+ :param data: Data the after parsing FinalReport.txt
253
+ :param sid_col:
254
+ :param snp_name:
255
+ :param fid_col: Family ID, default value "1"
256
+ :param alleles:
257
+ :return: - Return data in formate .lgen
258
+ """
259
+ _fields = ['fid', 'sid', 'snp_name', 'allele1', 'allele2']
260
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
261
+
262
+ _lgen = pd.DataFrame(columns=_fields)
263
+
264
+ try:
265
+ # Checked Sample ID on underscope - '_'
266
+ _lgen["sid"] = data[sid_col].astype(str)
267
+ if _lgen["sid"].apply(_check_underscore).any():
268
+ raise Exception(
269
+ "Replace in 'Sample ID' columns '_' on another a simbols"
270
+ )
271
+
272
+ # Checked Family ID on underscope - '_'
273
+ if fid_col is not None:
274
+ if (data[fid_col].dtype.hasobject and
275
+ data[fid_col].apply(_check_underscore).any()):
276
+ raise Exception(
277
+ "Replace in 'Family ID' columns '_' on another a simbols"
278
+ )
279
+
280
+ _lgen["fid"] = data[fid_col]
281
+
282
+ else:
283
+ _lgen["fid"] = 1
284
+
285
+ _lgen["snp_name"] = data[snp_name]
286
+ _lgen[["allele1", "allele2"]] = data[alleles].replace({'-': 0})
287
+
288
+ except Exception as e:
289
+ raise e
290
+
291
+ return _lgen[_fields].astype(_f_dtype)
292
+
293
+
294
def _check_underscore(value: str) -> bool:
    """ Checking for underscore in a string

    :param value: String for checked
    :return: Return True if there is an underline in the string, False if not
    """
    # A plain substring test; no regular expression has to be compiled on
    # every call for a single literal character.
    return "_" in value
format/_snp.py ADDED
@@ -0,0 +1,113 @@
1
+ # !/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
5
+
6
+ from pathlib import Path
7
+ from .__settings import FIELDS_ILLUMIN, MAP_FIELDS
8
+
9
+ import pandas as pd
10
+
11
+
12
+ class Snp(object):
13
+ """ The process of converting genomic map data - FinalReport.txt obtained
14
+ from Illumin. Recoding allele data into quantitative data, saving in the
15
+ format necessary for calculating gblup on blupf90.
16
+
17
+ :argument fmt: Data format to use snp in plink and blupf90. Default
18
+ value "uga". """
19
+
20
+ _ALLELE_CODE = {
21
+ 'AA': 0, 'AB': 1, 'BA': 1, 'BB': 2, '--': 5
22
+ }
23
+
24
+ _FIELDS = ['SNP_NAME', 'SAMPLE_ID', 'SNP']
25
+ _F_DTYPE = dict(zip(_FIELDS, (str for _ in range(len(_FIELDS)))))
26
+
27
+ def __init__(self, fmt: str | None = "uga") -> None:
28
+ self._format_data = fmt
29
+ self.__data_snp = None
30
+
31
+ @property
32
+ def data(self) -> pd.DataFrame | None:
33
+ return self.__data_snp
34
+
35
+ def process(self, data: pd.DataFrame) -> None:
36
+ """ Data processing and formatting. Calculation of statistical
37
+ information
38
+
39
+ :param data: Data from FinalReport file. Example:
40
+ SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
41
+ ABCA12 14814 A A 0.4048 0.8164
42
+ ARS-BFGL-BAC-13031 14814 B B 0.9083 0.8712
43
+ ARS-BFGL-BAC-13039 14814 A A 0.9005 0.9096
44
+ ARS-BFGL-BAC-13049 14814 A B 0.9295 0.8926
45
+
46
+ :return: Returns true if the data was formatted successfully and
47
+ statistical information was calculated, false if an error.
48
+ """
49
+
50
+ if not all(list(map(lambda x: x in data.columns, FIELDS_ILLUMIN))):
51
+ raise KeyError(
52
+ 'The name of the fields does not match the finalreport.txt '
53
+ 'file from Illumina'
54
+ )
55
+
56
+ self.__data_snp = data.rename(columns=MAP_FIELDS)
57
+ self.__data_snp['SNP'] = \
58
+ self.__data_snp[['ALLELE1', 'ALLELE2']].\
59
+ sum(axis=1).\
60
+ map(Snp._ALLELE_CODE)
61
+
62
+ self.__data_snp = self.__data_snp[Snp._FIELDS].astype(Snp._F_DTYPE)
63
+
64
+ if self._format_data is not None and self._format_data == "uga":
65
+ self.__data_snp = self._format_uga(
66
+ self.__data_snp[['SAMPLE_ID', 'SNP']]
67
+ )
68
+
69
+ @staticmethod
70
+ def _format_uga(data: pd.DataFrame) -> pd.DataFrame:
71
+ """ Data format to use snp in plink and blupf90. """
72
+
73
+ return data.groupby(by='SAMPLE_ID').sum().reset_index()
74
+
75
+ def to_file(self, file_path: str | Path) -> None:
76
+ """ Saving data to a file.
77
+
78
+ :param file_path: Path to file
79
+ """
80
+
81
+ if isinstance(file_path, str):
82
+ file_path = Path(file_path)
83
+
84
+ if self._format_data is not None and self._format_data == "uga":
85
+
86
+ max_len = self.__data_snp["SAMPLE_ID"].str.len().max()
87
+
88
+ self.__data_snp.\
89
+ apply(
90
+ lambda x: " ".join([
91
+ self._add_space(x.iloc[0], max_len), x.iloc[1]
92
+ ]),
93
+ axis=1
94
+ ).\
95
+ to_csv(file_path, index=False, header=False)
96
+
97
+ self.__data_snp["SAMPLE_ID"] = \
98
+ self.__data_snp["SAMPLE_ID"].str.strip()
99
+
100
+ return None
101
+
102
+ self.__data_snp.to_csv(file_path, sep=" ", index=False)
103
+
104
+ @staticmethod
105
+ def _add_space(value: str, max_len: int) -> str:
106
+ """ Adding spaces up to the maximum length of the value in the
107
+ sample_id data.
108
+
109
+ :param value: Sample_id value
110
+ :param max_len: Max len sample_id value
111
+ :return: Return replacing value
112
+ """
113
+ return "".join([value, " " * (max_len - len(value))])
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from pathlib import Path
6
+
7
# Directory with the data files used by the tests, located next to this module
DIR_FILES = Path(__file__).parent / "files"
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_FILES
6
+ from .. import make_fam
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data_fam(request) -> pd.DataFrame | None:
    """ Load the pickled data frame named by the indirect parameter from
    the ``fplink/fam`` directory of the test files. """
    pickle_path = DIR_FILES.joinpath(f"fplink/fam/{request.param}")
    return pd.read_pickle(pickle_path)
15
+
16
+
17
class TestPlinkFormatPed(object):
    """ Tests for ``make_fam``.

    NOTE(review): the class name says "Ped", but every test in it exercises
    ``make_fam``; the name is kept so that existing test ids do not change.
    """

    @pytest.mark.parametrize("data_fam", ["file.pl"], indirect=True)
    def test_fam_true(self, data_fam: pd.DataFrame) -> None:
        # Without a family id column
        assert not make_fam(
            data_fam,
            "SAMPLE_ID",
        ).empty

        # With a family id column
        assert not make_fam(
            data_fam,
            "SAMPLE_ID",
            "SAMPLE_ID"
        ).empty

    def test_fam_empty(self) -> None:
        assert make_fam(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
        ).empty

        assert make_fam(
            pd.DataFrame(columns=["SAMPLE_ID", "SNP"]),
            "SAMPLE_ID",
            "SAMPLE_ID",
        ).empty

    @pytest.mark.parametrize("data_fam", ["file.pl"], indirect=True)
    def test_fam_raise_columns(self, data_fam: pd.DataFrame) -> None:
        # SID_COL
        with pytest.raises(
                KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID1",
                "SAMPLE_ID",
            )

        # FID_COL
        with pytest.raises(
                KeyError, match="Data has not in name columns SAMPLE_ID1!"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "SAMPLE_ID1"
            )

    @pytest.mark.parametrize("data_fam", ["file2.pl"], indirect=True)
    def test_fam_raises_underscope_sid(self, data_fam: pd.DataFrame) -> None:

        # SID_COL
        with pytest.raises(
                Exception,
                match="Replace in 'Sample ID' columns '_' on another a simbols"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "SAMPLE_ID"
            )

    @pytest.mark.parametrize("data_fam", ["file3.pl"], indirect=True)
    def test_fam_raises_underscope_fid(self, data_fam: pd.DataFrame) -> None:

        # FID_COL
        with pytest.raises(
                Exception,
                match="Replace in 'Family ID' columns '_' on another a simbols"
        ):
            make_fam(
                data_fam,
                "SAMPLE_ID",
                "FAMILY_ID"
            )

    @pytest.mark.parametrize("data_fam", ["file4.pl"], indirect=True)
    def test_fam_check_data(self, data_fam: pd.DataFrame) -> None:
        # Every optional column supplied
        res = make_fam(
            data_fam,
            "SAMPLE_ID",
            "FAMILY_ID",
            father_col="father",
            mother_col="mother",
            sex_col="sex",
            pheno_col="pheno"
        )

        # Only the defaults for the optional columns
        res2 = make_fam(
            data_fam,
            "SAMPLE_ID",
            "FAMILY_ID",
        )

        assert all(res.father.values == list('1234'))
        assert all(res.mother.values == list('5678'))
        assert all(res.sex.values == list('1210'))
        assert all(res.pheno.values == ['12', '13', '14', '15'])

        assert all(res2.father.values == list('0000'))
        assert all(res2.mother.values == list('0000'))
        assert all(res2.sex.values == list('0000'))
        assert all(res2.pheno.values == ['-9', '-9', '-9', '-9'])
@@ -0,0 +1,106 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_FILES
6
+ from .. import make_lgen
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data_lgen(request) -> pd.DataFrame:
    """ Load the pickled data frame named by the indirect parameter from
    the ``fplink/lgen`` directory of the test files. """
    pickle_path = DIR_FILES.joinpath(f"fplink/lgen/{request.param}")
    return pd.read_pickle(pickle_path)
15
+
16
+
17
class TestPlinkFormatLgen(object):
    """ Tests for ``make_lgen``. """

    @pytest.mark.parametrize("data_lgen", ["file.pl"], indirect=True)
    def test_lgen_true(self, data_lgen: pd.DataFrame) -> None:
        assert not make_lgen(
            data_lgen,
            "Sample ID",
            "SNP Name",
            ["Allele1 - AB", "Allele2 - AB"]
        ).empty

    def test_lgen_empty(self) -> None:
        assert make_lgen(
            pd.DataFrame(columns=[
                "Sample ID", "SNP Name", "Allele1 - AB", "Allele2 - AB"
            ]),
            "Sample ID",
            "SNP Name",
            ["Allele1 - AB", "Allele2 - AB"]
        ).empty

    @pytest.mark.parametrize("data_lgen", ["file.pl"], indirect=True)
    def test_lgen_raise_columns(self, data_lgen: pd.DataFrame) -> None:

        # The input frames are prepared outside the ``raises`` contexts so
        # that only the make_lgen call itself can satisfy the expectation.

        # Underscore in the sample id
        bad_sid = data_lgen.copy(deep=True)
        bad_sid["Sample ID"] = bad_sid["Sample ID"] + "_"

        with pytest.raises(
                Exception,
                match="Replace in 'Sample ID' columns '_' on another a simbols"
        ):
            make_lgen(
                bad_sid,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"]
            )

        # Underscore in the family id
        bad_fid = data_lgen.copy(deep=True)
        bad_fid["Family ID"] = bad_fid["Sample ID"] + "_"

        with pytest.raises(
                Exception,
                match="Replace in 'Family ID' columns '_' on another a simbols"
        ):
            make_lgen(
                bad_fid,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # SID
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID1",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # FID_COL
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB", "Allele2 - AB"],
                fid_col="Family ID"
            )

        # SNP name
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name1",
                ["Allele1 - AB", "Allele2 - AB"]
            )

        # Alleles
        with pytest.raises(KeyError):
            make_lgen(
                data_lgen,
                "Sample ID",
                "SNP Name",
                ["Allele1 - AB1", "Allele2 - AB1"]
            )
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ from . import DIR_FILES
6
+ from .. import make_map
7
+
8
+ import pytest
9
+ import pandas as pd
10
+
11
+
12
@pytest.fixture
def data_map() -> pd.DataFrame:
    """ Read ``fplink/map/file_bovinesnp50.csv`` from the test files into
    a data frame. """
    csv_path = DIR_FILES.joinpath("fplink/map/file_bovinesnp50.csv")
    return pd.read_csv(csv_path)
15
+
16
+
17
class TestPlinkFormatMap(object):
    """ Tests for ``make_map``. """

    def test_map_true(self, data_map) -> None:

        res = make_map(data_map)
        assert not res.empty

    def test_map_raise(self) -> None:
        # Each ``raises`` context holds exactly one call: a statement placed
        # after the raising call would never run, and a non-raising call in
        # front of it would not be checked by the context at all.

        # No columns at all
        with pytest.raises(
                KeyError, match="Manifest has no data to build map format!"
        ):
            make_map(pd.DataFrame())

        # Columns present, but none of the required ones
        with pytest.raises(
                KeyError, match="Manifest has no data to build map format!"
        ):
            make_map(pd.DataFrame(columns=['morgans']))

    def test_map_empty(self) -> None:
        assert make_map(
            pd.DataFrame(columns=['Chr', 'Name', 'MapInfo', 'morgans'])
        ).empty