snplib 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snplib/format/_plink.py +42 -56
- snplib/statistics/_callrate.py +5 -4
- {snplib-1.0.8.dist-info → snplib-1.0.9.dist-info}/METADATA +2 -2
- {snplib-1.0.8.dist-info → snplib-1.0.9.dist-info}/RECORD +7 -7
- {snplib-1.0.8.dist-info → snplib-1.0.9.dist-info}/LICENSE +0 -0
- {snplib-1.0.8.dist-info → snplib-1.0.9.dist-info}/WHEEL +0 -0
- {snplib-1.0.8.dist-info → snplib-1.0.9.dist-info}/top_level.txt +0 -0
snplib/format/_plink.py
CHANGED
@@ -14,23 +14,20 @@ def make_map(manifest: pd.DataFrame) -> pd.DataFrame:
|
|
14
14
|
""" PLINK text fileset variant information file
|
15
15
|
https://www.cog-genomics.org/plink/1.9/formats#map
|
16
16
|
|
17
|
-
A text file with no header line, and one line per variant with the
|
17
|
+
A text file with no header line, and one line per variant with the
|
18
|
+
following 3-4 fields:
|
18
19
|
|
19
|
-
1. Chromosome code. PLINK 1.9 also permits contig names here, but most
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
dummy value of '0')
|
24
|
-
4. Base-pair coordinate
|
20
|
+
1. Chromosome code. PLINK 1.9 also permits contig names here, but most older programs do not.
|
21
|
+
2. Variant identifier.
|
22
|
+
3. Position in morgans or centimorgans (optional; also safe to use dummy value of '0').
|
23
|
+
4. Base-pair coordinate.
|
25
24
|
|
26
|
-
|
27
|
-
|
25
|
+
All lines must have the same number of columns (so either no lines contain
|
26
|
+
the morgans/centimorgans column, or all of them do).
|
28
27
|
|
29
|
-
:param manifest: The file that is taken on the Illumina website with full
|
30
|
-
information about the chip
|
31
|
-
https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html
|
28
|
+
:param manifest: The manifest file, downloaded from the Illumina website, with full information about the chip (https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html).
|
32
29
|
|
33
|
-
:return: Return data in formate .map
|
30
|
+
:return: Return data in .map format.
|
34
31
|
"""
|
35
32
|
|
36
33
|
fields = ['Chr', 'Name', 'MapInfo']
|
@@ -69,12 +66,11 @@ def make_ped(
|
|
69
66
|
genotype calls. Normally must be accompanied by a .map file.
|
70
67
|
https://www.cog-genomics.org/plink/1.9/formats#ped
|
71
68
|
|
72
|
-
|
69
|
+
The PED file has 6 fixed columns at the beginning followed by the SNP
|
73
70
|
information. The columns should be separated by a whitespace or a tab. The
|
74
71
|
first six columns hold the following information:
|
75
72
|
|
76
|
-
1. Family ID (if unknown use the same id as for the sample id in
|
77
|
-
column two)
|
73
|
+
1. Family ID (if unknown use the same id as for the sample id in column two)
|
78
74
|
2. Sample ID
|
79
75
|
3. Paternal ID (if unknown use 0)
|
80
76
|
4. Maternal ID (if unknown use 0)
|
@@ -82,25 +78,23 @@ def make_ped(
|
|
82
78
|
6. Affection (0=unknown; 1=unaffected; 2=affected)
|
83
79
|
7. Genotypes (space or tab separated, 2 for each marker. 0/-9=missing)
|
84
80
|
|
85
|
-
|
81
|
+
Here is a brief example of a genotype PED file containing 5 samples
|
86
82
|
with 10 homozygous SNPs:
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
:param sid_col: Sample ID. Column name in data
|
96
|
-
:param snp_col: Snp column name in data
|
97
|
-
:param fid_col: Family ID column name in data (if unknown use the same
|
98
|
-
|
99
|
-
:param
|
100
|
-
:param
|
101
|
-
:
|
102
|
-
:return: Returns an array of data in ped format to work with the plink
|
103
|
-
program
|
83
|
+
|
84
|
+
4304 4304 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
|
85
|
+
6925 6925 0 0 0 0 C C C C T T G G A A C C G G C C T T T T
|
86
|
+
7319 7319 0 0 0 0 C C C C G G G G G G C C G G C C T T T T
|
87
|
+
6963 6963 0 0 0 0 A A C C T T G G A A C C G G C C T T T T
|
88
|
+
6968 6968 0 0 0 0 C C C C G G G G G G G G G G C C T T T T
|
89
|
+
|
90
|
+
:param data: Snp data that contain full or partial information on the animal.
|
91
|
+
:param sid_col: Sample ID. Column name in data.
|
92
|
+
:param snp_col: Snp column name in data.
|
93
|
+
:param fid_col: Family ID column name in data (if unknown use the same id as for the sample id in column two).
|
94
|
+
:param father_col: Paternal ID column name in data (if unknown use 0).
|
95
|
+
:param mother_col: Maternal ID column name in data (if unknown use 0).
|
96
|
+
:param sex_col: Sex column name in data (if unknown use 0).
|
97
|
+
:return: Returns an array of data in ped format to work with the plink program.
|
104
98
|
"""
|
105
99
|
|
106
100
|
_fields = ["fid", "sid", "father", "mother", "sex", "not_used", "snp"]
|
@@ -153,37 +147,29 @@ def make_fam(
|
|
153
147
|
sex_val: int = 0,
|
154
148
|
pheno_col: str = None,
|
155
149
|
pheno_val: int = -9
|
156
|
-
|
157
150
|
) -> pd.DataFrame | None:
|
158
151
|
""" PLINK sample information file
|
159
152
|
https://www.cog-genomics.org/plink/1.9/formats#fam
|
160
153
|
|
161
|
-
|
162
|
-
|
154
|
+
A text file with no header line, and one line per sample with the following
|
155
|
+
six fields:
|
163
156
|
|
164
157
|
1. Family ID ('FID')
|
165
158
|
2. Within-family ID ('IID'; cannot be '0')
|
166
159
|
3. Within-family ID of father ('0' if father isn't in dataset)
|
167
160
|
4. Within-family ID of mother ('0' if mother isn't in dataset)
|
168
161
|
5. Sex code ('1' = male, '2' = female, '0' = unknown)
|
169
|
-
6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric =
|
170
|
-
|
171
|
-
|
172
|
-
:param
|
173
|
-
|
174
|
-
:param
|
175
|
-
|
176
|
-
:param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain
|
177
|
-
underline - "_"
|
178
|
-
:param father_col: Within-family ID of father ('0' if father isn't in
|
179
|
-
dataset)
|
180
|
-
:param mother_col: Within-family ID of mother ('0' if mother isn't in
|
181
|
-
dataset)
|
162
|
+
6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)
|
163
|
+
|
164
|
+
:param data: Snp data that contain full or partial information on the animal
|
165
|
+
:param fid_col: Family ID, default value "1". Must not contain an underscore ("_")
|
166
|
+
:param sid_col: Within-family ID ('IID'; cannot be '0'). Must not contain an underscore ("_")
|
167
|
+
:param father_col: Within-family ID of father ('0' if father isn't in dataset)
|
168
|
+
:param mother_col: Within-family ID of mother ('0' if mother isn't in dataset)
|
182
169
|
:param sex_col: Sex column name in data
|
183
170
|
:param sex_val: Sex code ('1' = male, '2' = female, '0' = unknown)
|
184
171
|
:param pheno_col: Pheno column name in data
|
185
|
-
:param pheno_val: Phenotype value ('1' = control, '2' = case,
|
186
|
-
'-9'/'0'/non-numeric = missing data if case/control)
|
172
|
+
:param pheno_val: Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)
|
187
173
|
:return: Return data in .fam format
|
188
174
|
"""
|
189
175
|
|
@@ -236,7 +222,7 @@ def make_lgen(
|
|
236
222
|
""" PLINK long-format genotype file
|
237
223
|
https://www.cog-genomics.org/plink/1.9/formats#lgen
|
238
224
|
|
239
|
-
|
225
|
+
A text file with no header line, and one line per genotype call (or
|
240
226
|
just not-homozygous-major calls if 'lgen-ref' was invoked) usually with
|
241
227
|
the following five fields:
|
242
228
|
|
@@ -246,7 +232,7 @@ def make_lgen(
|
|
246
232
|
4. Allele call 1 ('0' for missing)
|
247
233
|
5. Allele call 2
|
248
234
|
|
249
|
-
|
235
|
+
There are several variations which are also handled by PLINK; see the
|
250
236
|
original discussion for details.
|
251
237
|
|
252
238
|
:param data: Data obtained after parsing FinalReport.txt
|
@@ -254,7 +240,7 @@ def make_lgen(
|
|
254
240
|
:param snp_name:
|
255
241
|
:param fid_col: Family ID, default value "1"
|
256
242
|
:param alleles:
|
257
|
-
:return:
|
243
|
+
:return: Return data in .lgen format
|
258
244
|
"""
|
259
245
|
_fields = ['fid', 'sid', 'snp_name', 'allele1', 'allele2']
|
260
246
|
_f_dtype = dict(zip(_fields, (str for _ in range(len(_fields)))))
|
@@ -295,7 +281,7 @@ def _check_underscore(value: str) -> bool:
|
|
295
281
|
""" Checking for underscore in a string
|
296
282
|
|
297
283
|
:param value: String to check
|
298
|
-
:return: Return True if there is an underline in the string, False if not
|
284
|
+
:return: Return True if there is an underscore in the string, False if not.
|
299
285
|
"""
|
300
286
|
_under_l = re.compile(r"_")
|
301
287
|
|
snplib/statistics/_callrate.py
CHANGED
@@ -15,11 +15,12 @@ def call_rate(
|
|
15
15
|
not missing. In the following example, we filter using a call rate of 95%,
|
16
16
|
meaning we retain SNPs for which there is less than 5% missing data.
|
17
17
|
|
18
|
-
|
19
|
-
particular animal, the “call rate animal” is 50K/54K=93
|
20
|
-
|
18
|
+
If, say, 50K of the 54K markers on the chip have been genotyped for a
|
19
|
+
particular animal, the “call rate animal” is 50K/54K=93%.
|
20
|
+
|
21
|
+
Of the, say, 900 animals genotyped for marker CL635944_160.1, how many
|
21
22
|
have actually been successfully read? Assume that 600 have been read, then
|
22
|
-
the “call rate marker” is 600/900 = 67
|
23
|
+
the “call rate marker” is 600/900 = 67%.
|
23
24
|
|
24
25
|
:param data: Pre-processed data on which the call rate is calculated.
|
25
26
|
:param id_col: The name of the column with the id of the animals or
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: snplib
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.9
|
4
4
|
Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
|
5
5
|
Author-email: Igor <igor.loschinin@gmail.com>
|
6
6
|
License: GNU
|
@@ -73,7 +73,7 @@ from snplib import (
|
|
73
73
|
```
|
74
74
|
|
75
75
|
## Documentation
|
76
|
-
Detailed documentation on how to use SNPTools is available see the [docs](
|
76
|
+
Detailed documentation on how to use SNPTools is available; see the [docs](docs/_build/index.html).
|
77
77
|
|
78
78
|
## License
|
79
79
|
This project is licensed under the GNU General Public License - see the
|
@@ -3,7 +3,7 @@ snplib/finalreport/__init__.py,sha256=Yk49x8t-STIfsdP6QLMtaGm1gTj_n-XS8kchPguvW1
|
|
3
3
|
snplib/finalreport/_finalreport.py,sha256=el_d8MVmpic3wKCRJ-J52VZYSmMuNSf4p_tmPkgh0Z0,5876
|
4
4
|
snplib/format/__init__.py,sha256=3W_l_sP1u9HV3HWwnsJxPGw9anrVknstqLaJmWQaG0k,261
|
5
5
|
snplib/format/__settings.py,sha256=kyAVZ4tiU61sNr3jQhjXbLXRyBA3pjFfCw3fOfSkY14,289
|
6
|
-
snplib/format/_plink.py,sha256=
|
6
|
+
snplib/format/_plink.py,sha256=cjT6PkvDJr8KwvQo76i7_Hm1Og4bASYCDN9G7CHsQ00,10372
|
7
7
|
snplib/format/_snp.py,sha256=oI-V4-_w28aX-VxoimywLDDnX6owhdjLbqt9a54_ouU,3172
|
8
8
|
snplib/parentage/__init__.py,sha256=bN3mWTxmaFQ1qzRtyMLaAoxfomz6jnoWa-kmnJ9q_fE,280
|
9
9
|
snplib/parentage/_discov.py,sha256=qGlNzpl4xKOWRr6-fi1osylzizgiPO8vCus8VE56nec,3180
|
@@ -12,11 +12,11 @@ snplib/parentage/_verif.py,sha256=VbX46dC4tl4Qeuw65aRg1s6hSn0FI25Hy9-3U_jxmrg,30
|
|
12
12
|
snplib/parentage/isag_disc.pl,sha256=XzjcsnO_kwPg4WaE2YMuZXBNHQ9ixi6pg5n2mfGOuJU,14219
|
13
13
|
snplib/parentage/isag_verif.pl,sha256=e_c4YGd5_JXGWqFQwmcxjp6hEkdcqpK_5y5MqJ8J9YY,8254
|
14
14
|
snplib/statistics/__init__.py,sha256=XJFU7mEwAJJ2M187jEkO8rFNYKoxF-g9KF_stS7eFFw,302
|
15
|
-
snplib/statistics/_callrate.py,sha256=
|
15
|
+
snplib/statistics/_callrate.py,sha256=yfHxnNVpcDfV3qxZVwrk2RWPgy9dTf7NHWczDUORwtY,1866
|
16
16
|
snplib/statistics/_freq.py,sha256=ZPZBZM3xq9EseOxuMzRVvzkjjFfaaA4ZvF7XI8ctON0,1623
|
17
17
|
snplib/statistics/_snphwe.py,sha256=KcoRGwovMCc53-GJ8VfYs_3ZEHObgt8B0EvrW5nFnmM,3353
|
18
|
-
snplib-1.0.
|
19
|
-
snplib-1.0.
|
20
|
-
snplib-1.0.
|
21
|
-
snplib-1.0.
|
22
|
-
snplib-1.0.
|
18
|
+
snplib-1.0.9.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
19
|
+
snplib-1.0.9.dist-info/METADATA,sha256=tgDJRi81nc66_LSZtTTkvPENK6r9OZ6WKf_MGPk7hcg,2163
|
20
|
+
snplib-1.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
21
|
+
snplib-1.0.9.dist-info/top_level.txt,sha256=CGCrLXuCSyXPCTwMFQjPxQR7b93FFFft56sAPPun_2g,7
|
22
|
+
snplib-1.0.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|