snplib 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. snplib/__init__.py +8 -8
  2. snplib/finalreport/__init__.py +7 -7
  3. snplib/finalreport/_finalreport.py +251 -251
  4. snplib/format/__init__.py +19 -19
  5. snplib/format/__settings.py +7 -7
  6. snplib/format/_plink.py +305 -305
  7. snplib/format/_snp.py +113 -113
  8. snplib/parentage/__init__.py +15 -15
  9. snplib/parentage/_discov.py +102 -102
  10. snplib/parentage/_isagmark.py +15 -15
  11. snplib/parentage/_verif.py +91 -91
  12. snplib/parentage/isag_disc.pl +0 -0
  13. snplib/parentage/isag_verif.pl +0 -0
  14. snplib/statistics/__init__.py +16 -16
  15. snplib/statistics/_callrate.py +59 -59
  16. snplib/statistics/_freq.py +67 -67
  17. snplib/statistics/_snphwe.py +132 -132
  18. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/LICENSE +674 -674
  19. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/METADATA +80 -97
  20. snplib-1.0.8.dist-info/RECORD +22 -0
  21. snplib/finalreport/tests/__init__.py +0 -7
  22. snplib/finalreport/tests/test_finalreport.py +0 -215
  23. snplib/format/tests/__init__.py +0 -7
  24. snplib/format/tests/test_plink_fam.py +0 -121
  25. snplib/format/tests/test_plink_lgen.py +0 -106
  26. snplib/format/tests/test_plink_map.py +0 -42
  27. snplib/format/tests/test_plink_ped.py +0 -136
  28. snplib/format/tests/test_snp.py +0 -128
  29. snplib/parentage/tests/__init__.py +0 -7
  30. snplib/parentage/tests/test_discov.py +0 -164
  31. snplib/parentage/tests/test_verif.py +0 -160
  32. snplib/statistics/tests/__init__.py +0 -7
  33. snplib/statistics/tests/test_callrate.py +0 -171
  34. snplib/statistics/tests/test_freq_allele.py +0 -87
  35. snplib/statistics/tests/test_freq_maf.py +0 -17
  36. snplib/statistics/tests/test_hwe_t.py +0 -41
  37. snplib/statistics/tests/test_snphwe.py +0 -41
  38. snplib-1.0.7.dist-info/RECORD +0 -37
  39. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/WHEEL +0 -0
  40. {snplib-1.0.7.dist-info → snplib-1.0.8.dist-info}/top_level.txt +0 -0
snplib/format/_plink.py CHANGED
@@ -1,305 +1,305 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
- __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
-
5
- __all__ = [
6
- "make_map", "make_ped", "make_fam", "make_lgen"
7
- ]
8
-
9
- import re
10
- import pandas as pd
11
-
12
-
13
- def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
14
- """ PLINK text fileset variant information file
15
- https://www.cog-genomics.org/plink/1.9/formats#map
16
-
17
- A text file with no header line, and one line per variant with the following 3-4 fields:
18
-
19
- 1. Chromosome code. PLINK 1.9 also permits contig names here, but most
20
- older programs do not.
21
- 2. Variant identifier
22
- 3. Position in morgans or centimorgans (optional; also safe to use
23
- dummy value of '0')
24
- 4. Base-pair coordinate
25
-
26
- All lines must have the same number of columns (so either no lines
27
- contain the morgans/centimorgans column, or all of them do).
28
-
29
- :param manifest: The file that is taken on the Illumina website with full
30
- information about the chip
31
- https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html
32
-
33
- :return: Return data in formate .map
34
- """
35
-
36
- fields = ['Chr', 'Name', 'MapInfo']
37
-
38
- if all([
39
- True
40
- if item not in manifest.columns
41
- else False
42
- for item in fields
43
- ]):
44
- raise KeyError("Manifest has no data to build map format!")
45
-
46
- # Rearrange the columns and replace the names of the sex and mitochondrial
47
- # chromosomes
48
- permute_cols = manifest[fields].\
49
- sort_values(by='Name').\
50
- replace({'X': 30, 'Y': 31, 'MT': 33}).\
51
- dropna(axis=0)
52
-
53
- # Insert distances in centimorganides
54
- permute_cols.insert(2, 'morgans', [0] * len(manifest))
55
-
56
- return permute_cols
57
-
58
-
59
- def make_ped(
60
- data: pd.DataFrame,
61
- sid_col: str,
62
- snp_col: str,
63
- fid_col: str = None,
64
- father_col: str = None,
65
- mother_col: str = None,
66
- sex_col: str = None,
67
- ) -> pd.DataFrame | None:
68
- """ Original standard text format for sample pedigree information and
69
- genotype calls. Normally must be accompanied by a .map file.
70
- https://www.cog-genomics.org/plink/1.9/formats#ped
71
-
72
- The PED file has 6 fixed columns at the beginning followed by the SNP
73
- information. The columns should be separated by a whitespace or a tab. The
74
- first six columns hold the following information:
75
-
76
- 1. Family ID (if unknown use the same id as for the sample id in
77
- column two)
78
- 2. Sample ID
79
- 3. Paternal ID (if unknown use 0)
80
- 4. Maternal ID (if unknown use 0)
81
- 5. Sex (1=male; 2=female; 0=unknown)
82
- 6. Affection (0=unknown; 1=unaffected; 2=affected)
83
- 7. Genotypes (space or tab separated, 2 for each marker. 0/-9=missing)
84
-
85
- Here is a brief example of a genotype PED file containing 5 samples
86
- with 10 homozygous SNPs:
87
- 4304 4304 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
88
- 6925 6925 0 0 0 0 C C C C T T G G A A C C G G C C T T T T
89
- 7319 7319 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
90
- 6963 6963 0 0 0 0 A A C C T T G G A A C C G G C C T T T T
91
- 6968 6968 0 0 0 0 C C C C G G G G G G G G G G C C T T T T
92
-
93
- :param data: Snp data that contain full or partial information on the
94
- animal
95
- :param sid_col: Sample ID. Column name in data
96
- :param snp_col: Snp column name in data
97
- :param fid_col: Family ID column name in data (if unknown use the same
98
- id as for the sample id in column two)
99
- :param father_col: Paternal ID column name in data (if unknown use 0)
100
- :param mother_col: Maternal ID column name in data (if unknown use 0)
101
- :param sex_col: Sex column name in data (if unknown use 0)
102
- :return: Returns an array of data in ped format to work with the plink
103
- program
104
- """
105
-
106
- _fields = ["fid", "sid", "father", "mother", "sex", "not_used", "snp"]
107
- _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
108
-
109
- _ped = pd.DataFrame(columns=_fields)
110
-
111
- if sid_col not in data.columns or snp_col not in data.columns:
112
- raise KeyError(f"Data has not in name columns!")
113
-
114
- # Checked Sample ID on underscope - '_'
115
- _ped["sid"] = data[sid_col].astype(str)
116
- if _ped["sid"].apply(_check_underscore).any():
117
- raise Exception(
118
- "Replace in 'Sample ID' columns '_' on another a simbols"
119
- )
120
-
121
- # Checked Family ID on underscope - '_'
122
- if fid_col is not None:
123
- if fid_col not in data.columns:
124
- raise KeyError(f"Data has not in name columns {fid_col}!")
125
-
126
- if (data[fid_col].dtype.hasobject and
127
- data[fid_col].apply(_check_underscore).any()):
128
- raise Exception(
129
- "Replace in 'Family ID' columns '_' on another a simbols"
130
- )
131
-
132
- _ped["fid"] = data[fid_col]
133
-
134
- else:
135
- _ped["fid"] = data[sid_col].astype(str)
136
-
137
- _ped["father"] = data[father_col] if father_col is not None else 0
138
- _ped["mother"] = data[mother_col] if mother_col is not None else 0
139
- _ped["sex"] = data[sex_col] if sex_col is not None else 0
140
- _ped["not_used"] = 0
141
- _ped["snp"] = data[snp_col]
142
-
143
- return _ped[_fields].astype(_f_dtype)
144
-
145
-
146
- def make_fam(
147
- data: pd.DataFrame,
148
- sid_col: str,
149
- fid_col: str = None,
150
- father_col: str = None,
151
- mother_col: str = None,
152
- sex_col: str = None,
153
- sex_val: int = 0,
154
- pheno_col: str = None,
155
- pheno_val: int = -9
156
-
157
- ) -> pd.DataFrame | None:
158
- """ PLINK sample information file
159
- https://www.cog-genomics.org/plink/1.9/formats#fam
160
-
161
- A text file with no header line, and one line per sample with the
162
- following six fields:
163
-
164
- 1. Family ID ('FID')
165
- 2. Within-family ID ('IID'; cannot be '0')
166
- 3. Within-family ID of father ('0' if father isn't in dataset)
167
- 4. Within-family ID of mother ('0' if mother isn't in dataset)
168
- 5. Sex code ('1' = male, '2' = female, '0' = unknown)
169
- 6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric =
170
- missing data if case/control)
171
-
172
- :param data: Snp data that contain full or partial information on the
173
- animal
174
- :param fid_col: Family ID, default value "1". Must not contain
175
- underline - "_"
176
- :param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain
177
- underline - "_"
178
- :param father_col: Within-family ID of father ('0' if father isn't in
179
- dataset)
180
- :param mother_col: Within-family ID of mother ('0' if mother isn't in
181
- dataset)
182
- :param sex_col: Sex column name in data
183
- :param sex_val: Sex code ('1' = male, '2' = female, '0' = unknown)
184
- :param pheno_col: Pheno column name in data
185
- :param pheno_val: Phenotype value ('1' = control, '2' = case,
186
- '-9'/'0'/non-numeric = missing data if case/control)
187
- :return: Return data in formate .fam
188
- """
189
-
190
- _fields = ['fid', 'sid', 'father', 'mother', 'sex', 'pheno']
191
- _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
192
-
193
- _fam = pd.DataFrame(columns=_fields)
194
-
195
- if sid_col not in data.columns:
196
- raise KeyError(f"Data has not in name columns {sid_col}!")
197
-
198
- # Checked Sample ID on underscope - '_'
199
- _fam["sid"] = data[sid_col].astype(str)
200
- if _fam["sid"].apply(_check_underscore).any():
201
- raise Exception(
202
- "Replace in 'Sample ID' columns '_' on another a simbols"
203
- )
204
-
205
- # Checked Family ID on underscope - '_'
206
- if fid_col is not None:
207
- if fid_col not in data.columns:
208
- raise KeyError(f"Data has not in name columns {fid_col}!")
209
-
210
- if (data[fid_col].dtype.hasobject and
211
- data[fid_col].apply(_check_underscore).any()):
212
- raise Exception(
213
- "Replace in 'Family ID' columns '_' on another a simbols"
214
- )
215
-
216
- _fam["fid"] = data[fid_col]
217
-
218
- else:
219
- _fam["fid"] = 1
220
-
221
- _fam["father"] = data[father_col] if father_col is not None else 0
222
- _fam["mother"] = data[mother_col] if mother_col is not None else 0
223
- _fam["sex"] = data[sex_col] if sex_col is not None else sex_val
224
- _fam['pheno'] = data[pheno_col] if pheno_col is not None else pheno_val
225
-
226
- return _fam[_fields].astype(_f_dtype)
227
-
228
-
229
- def make_lgen(
230
- data: pd.DataFrame,
231
- sid_col: str,
232
- snp_name: str,
233
- alleles: list[str],
234
- fid_col: str = None
235
- ) -> pd.DataFrame | None:
236
- """ PLINK long-format genotype file
237
- https://www.cog-genomics.org/plink/1.9/formats#lgen
238
-
239
- A text file with no header line, and one line per genotype call (or
240
- just not-homozygous-major calls if 'lgen-ref' was invoked) usually with
241
- the following five fields:
242
-
243
- 1. Family ID
244
- 2. Within-family ID
245
- 3. Variant identifier
246
- 4. Allele call 1 ('0' for missing)
247
- 5. Allele call 2
248
-
249
- There are several variations which are also handled by PLINK; see the
250
- original discussion for details.
251
-
252
- :param data: Data the after parsing FinalReport.txt
253
- :param sid_col:
254
- :param snp_name:
255
- :param fid_col: Family ID, default value "1"
256
- :param alleles:
257
- :return: - Return data in formate .lgen
258
- """
259
- _fields = ['fid', 'sid', 'snp_name', 'allele1', 'allele2']
260
- _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
261
-
262
- _lgen = pd.DataFrame(columns=_fields)
263
-
264
- try:
265
- # Checked Sample ID on underscope - '_'
266
- _lgen["sid"] = data[sid_col].astype(str)
267
- if _lgen["sid"].apply(_check_underscore).any():
268
- raise Exception(
269
- "Replace in 'Sample ID' columns '_' on another a simbols"
270
- )
271
-
272
- # Checked Family ID on underscope - '_'
273
- if fid_col is not None:
274
- if (data[fid_col].dtype.hasobject and
275
- data[fid_col].apply(_check_underscore).any()):
276
- raise Exception(
277
- "Replace in 'Family ID' columns '_' on another a simbols"
278
- )
279
-
280
- _lgen["fid"] = data[fid_col]
281
-
282
- else:
283
- _lgen["fid"] = 1
284
-
285
- _lgen["snp_name"] = data[snp_name]
286
- _lgen[["allele1", "allele2"]] = data[alleles].replace({'-': 0})
287
-
288
- except Exception as e:
289
- raise e
290
-
291
- return _lgen[_fields].astype(_f_dtype)
292
-
293
-
294
- def _check_underscore(value: str) -> bool:
295
- """ Checking for underscore in a string
296
-
297
- :param value: String for checked
298
- :return: Return True if there is an underline in the string, False if not
299
- """
300
- _under_l = re.compile(r"_")
301
-
302
- if _under_l.findall(value):
303
- return True
304
-
305
- return False
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ __author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
4
+
5
+ __all__ = [
6
+ "make_map", "make_ped", "make_fam", "make_lgen"
7
+ ]
8
+
9
+ import re
10
+ import pandas as pd
11
+
12
+
13
def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
    """ PLINK text fileset variant information file
    https://www.cog-genomics.org/plink/1.9/formats#map

    A text file with no header line, and one line per variant with the
    following 3-4 fields:

        1. Chromosome code. PLINK 1.9 also permits contig names here, but
            most older programs do not.
        2. Variant identifier
        3. Position in morgans or centimorgans (optional; also safe to use
            dummy value of '0')
        4. Base-pair coordinate

    All lines must have the same number of columns (so either no lines
    contain the morgans/centimorgans column, or all of them do).

    :param manifest: The file that is taken on the Illumina website with
        full information about the chip. It must contain the columns
        'Chr', 'Name' and 'MapInfo'.
        https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html

    :return: Data in .map layout: the columns 'Chr', 'Name', 'morgans',
        'MapInfo', sorted by 'Name', with rows holding missing values
        removed and the 'morgans' column filled with 0.
    :raises KeyError: If at least one of the required columns is absent
        from the manifest.
    """

    fields = ['Chr', 'Name', 'MapInfo']

    # Every field is required, so a single missing column is already fatal.
    if any(item not in manifest.columns for item in fields):
        raise KeyError("Manifest has no data to build map format!")

    # Rearrange the columns and replace the names of the sex and
    # mitochondrial chromosomes with numeric codes. Note that the replacement
    # is applied to all three selected columns, not only to 'Chr'.
    permute_cols = (
        manifest[fields]
        .sort_values(by='Name')
        .replace({'X': 30, 'Y': 31, 'MT': 33})
        .dropna(axis=0)
    )

    # Insert the dummy genetic distance. A scalar is broadcast to the rows
    # that survived dropna(); a list sized by the original manifest would not
    # fit once any row has been dropped.
    permute_cols.insert(2, 'morgans', 0)

    return permute_cols
57
+
58
+
59
+ def make_ped(
60
+ data: pd.DataFrame,
61
+ sid_col: str,
62
+ snp_col: str,
63
+ fid_col: str = None,
64
+ father_col: str = None,
65
+ mother_col: str = None,
66
+ sex_col: str = None,
67
+ ) -> pd.DataFrame | None:
68
+ """ Original standard text format for sample pedigree information and
69
+ genotype calls. Normally must be accompanied by a .map file.
70
+ https://www.cog-genomics.org/plink/1.9/formats#ped
71
+
72
+ The PED file has 6 fixed columns at the beginning followed by the SNP
73
+ information. The columns should be separated by a whitespace or a tab. The
74
+ first six columns hold the following information:
75
+
76
+ 1. Family ID (if unknown use the same id as for the sample id in
77
+ column two)
78
+ 2. Sample ID
79
+ 3. Paternal ID (if unknown use 0)
80
+ 4. Maternal ID (if unknown use 0)
81
+ 5. Sex (1=male; 2=female; 0=unknown)
82
+ 6. Affection (0=unknown; 1=unaffected; 2=affected)
83
+ 7. Genotypes (space or tab separated, 2 for each marker. 0/-9=missing)
84
+
85
+ Here is a brief example of a genotype PED file containing 5 samples
86
+ with 10 homozygous SNPs:
87
+ 4304 4304 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
88
+ 6925 6925 0 0 0 0 C C C C T T G G A A C C G G C C T T T T
89
+ 7319 7319 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
90
+ 6963 6963 0 0 0 0 A A C C T T G G A A C C G G C C T T T T
91
+ 6968 6968 0 0 0 0 C C C C G G G G G G G G G G C C T T T T
92
+
93
+ :param data: Snp data that contain full or partial information on the
94
+ animal
95
+ :param sid_col: Sample ID. Column name in data
96
+ :param snp_col: Snp column name in data
97
+ :param fid_col: Family ID column name in data (if unknown use the same
98
+ id as for the sample id in column two)
99
+ :param father_col: Paternal ID column name in data (if unknown use 0)
100
+ :param mother_col: Maternal ID column name in data (if unknown use 0)
101
+ :param sex_col: Sex column name in data (if unknown use 0)
102
+ :return: Returns an array of data in ped format to work with the plink
103
+ program
104
+ """
105
+
106
+ _fields = ["fid", "sid", "father", "mother", "sex", "not_used", "snp"]
107
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
108
+
109
+ _ped = pd.DataFrame(columns=_fields)
110
+
111
+ if sid_col not in data.columns or snp_col not in data.columns:
112
+ raise KeyError(f"Data has not in name columns!")
113
+
114
+ # Checked Sample ID on underscope - '_'
115
+ _ped["sid"] = data[sid_col].astype(str)
116
+ if _ped["sid"].apply(_check_underscore).any():
117
+ raise Exception(
118
+ "Replace in 'Sample ID' columns '_' on another a simbols"
119
+ )
120
+
121
+ # Checked Family ID on underscope - '_'
122
+ if fid_col is not None:
123
+ if fid_col not in data.columns:
124
+ raise KeyError(f"Data has not in name columns {fid_col}!")
125
+
126
+ if (data[fid_col].dtype.hasobject and
127
+ data[fid_col].apply(_check_underscore).any()):
128
+ raise Exception(
129
+ "Replace in 'Family ID' columns '_' on another a simbols"
130
+ )
131
+
132
+ _ped["fid"] = data[fid_col]
133
+
134
+ else:
135
+ _ped["fid"] = data[sid_col].astype(str)
136
+
137
+ _ped["father"] = data[father_col] if father_col is not None else 0
138
+ _ped["mother"] = data[mother_col] if mother_col is not None else 0
139
+ _ped["sex"] = data[sex_col] if sex_col is not None else 0
140
+ _ped["not_used"] = 0
141
+ _ped["snp"] = data[snp_col]
142
+
143
+ return _ped[_fields].astype(_f_dtype)
144
+
145
+
146
+ def make_fam(
147
+ data: pd.DataFrame,
148
+ sid_col: str,
149
+ fid_col: str = None,
150
+ father_col: str = None,
151
+ mother_col: str = None,
152
+ sex_col: str = None,
153
+ sex_val: int = 0,
154
+ pheno_col: str = None,
155
+ pheno_val: int = -9
156
+
157
+ ) -> pd.DataFrame | None:
158
+ """ PLINK sample information file
159
+ https://www.cog-genomics.org/plink/1.9/formats#fam
160
+
161
+ A text file with no header line, and one line per sample with the
162
+ following six fields:
163
+
164
+ 1. Family ID ('FID')
165
+ 2. Within-family ID ('IID'; cannot be '0')
166
+ 3. Within-family ID of father ('0' if father isn't in dataset)
167
+ 4. Within-family ID of mother ('0' if mother isn't in dataset)
168
+ 5. Sex code ('1' = male, '2' = female, '0' = unknown)
169
+ 6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric =
170
+ missing data if case/control)
171
+
172
+ :param data: Snp data that contain full or partial information on the
173
+ animal
174
+ :param fid_col: Family ID, default value "1". Must not contain
175
+ underline - "_"
176
+ :param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain
177
+ underline - "_"
178
+ :param father_col: Within-family ID of father ('0' if father isn't in
179
+ dataset)
180
+ :param mother_col: Within-family ID of mother ('0' if mother isn't in
181
+ dataset)
182
+ :param sex_col: Sex column name in data
183
+ :param sex_val: Sex code ('1' = male, '2' = female, '0' = unknown)
184
+ :param pheno_col: Pheno column name in data
185
+ :param pheno_val: Phenotype value ('1' = control, '2' = case,
186
+ '-9'/'0'/non-numeric = missing data if case/control)
187
+ :return: Return data in formate .fam
188
+ """
189
+
190
+ _fields = ['fid', 'sid', 'father', 'mother', 'sex', 'pheno']
191
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
192
+
193
+ _fam = pd.DataFrame(columns=_fields)
194
+
195
+ if sid_col not in data.columns:
196
+ raise KeyError(f"Data has not in name columns {sid_col}!")
197
+
198
+ # Checked Sample ID on underscope - '_'
199
+ _fam["sid"] = data[sid_col].astype(str)
200
+ if _fam["sid"].apply(_check_underscore).any():
201
+ raise Exception(
202
+ "Replace in 'Sample ID' columns '_' on another a simbols"
203
+ )
204
+
205
+ # Checked Family ID on underscope - '_'
206
+ if fid_col is not None:
207
+ if fid_col not in data.columns:
208
+ raise KeyError(f"Data has not in name columns {fid_col}!")
209
+
210
+ if (data[fid_col].dtype.hasobject and
211
+ data[fid_col].apply(_check_underscore).any()):
212
+ raise Exception(
213
+ "Replace in 'Family ID' columns '_' on another a simbols"
214
+ )
215
+
216
+ _fam["fid"] = data[fid_col]
217
+
218
+ else:
219
+ _fam["fid"] = 1
220
+
221
+ _fam["father"] = data[father_col] if father_col is not None else 0
222
+ _fam["mother"] = data[mother_col] if mother_col is not None else 0
223
+ _fam["sex"] = data[sex_col] if sex_col is not None else sex_val
224
+ _fam['pheno'] = data[pheno_col] if pheno_col is not None else pheno_val
225
+
226
+ return _fam[_fields].astype(_f_dtype)
227
+
228
+
229
+ def make_lgen(
230
+ data: pd.DataFrame,
231
+ sid_col: str,
232
+ snp_name: str,
233
+ alleles: list[str],
234
+ fid_col: str = None
235
+ ) -> pd.DataFrame | None:
236
+ """ PLINK long-format genotype file
237
+ https://www.cog-genomics.org/plink/1.9/formats#lgen
238
+
239
+ A text file with no header line, and one line per genotype call (or
240
+ just not-homozygous-major calls if 'lgen-ref' was invoked) usually with
241
+ the following five fields:
242
+
243
+ 1. Family ID
244
+ 2. Within-family ID
245
+ 3. Variant identifier
246
+ 4. Allele call 1 ('0' for missing)
247
+ 5. Allele call 2
248
+
249
+ There are several variations which are also handled by PLINK; see the
250
+ original discussion for details.
251
+
252
+ :param data: Data the after parsing FinalReport.txt
253
+ :param sid_col:
254
+ :param snp_name:
255
+ :param fid_col: Family ID, default value "1"
256
+ :param alleles:
257
+ :return: - Return data in formate .lgen
258
+ """
259
+ _fields = ['fid', 'sid', 'snp_name', 'allele1', 'allele2']
260
+ _f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
261
+
262
+ _lgen = pd.DataFrame(columns=_fields)
263
+
264
+ try:
265
+ # Checked Sample ID on underscope - '_'
266
+ _lgen["sid"] = data[sid_col].astype(str)
267
+ if _lgen["sid"].apply(_check_underscore).any():
268
+ raise Exception(
269
+ "Replace in 'Sample ID' columns '_' on another a simbols"
270
+ )
271
+
272
+ # Checked Family ID on underscope - '_'
273
+ if fid_col is not None:
274
+ if (data[fid_col].dtype.hasobject and
275
+ data[fid_col].apply(_check_underscore).any()):
276
+ raise Exception(
277
+ "Replace in 'Family ID' columns '_' on another a simbols"
278
+ )
279
+
280
+ _lgen["fid"] = data[fid_col]
281
+
282
+ else:
283
+ _lgen["fid"] = 1
284
+
285
+ _lgen["snp_name"] = data[snp_name]
286
+ _lgen[["allele1", "allele2"]] = data[alleles].replace({'-': 0})
287
+
288
+ except Exception as e:
289
+ raise e
290
+
291
+ return _lgen[_fields].astype(_f_dtype)
292
+
293
+
294
def _check_underscore(value: str) -> bool:
    """ Checking for underscore in a string

    :param value: String for checked
    :return: Return True if there is an underline in the string, False if not
    """
    # A plain containment test is enough for a single literal character.
    return "_" in value