snplib 1.0.10__tar.gz → 1.2.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snplib-1.0.10 → snplib-1.2.10}/.gitignore +2 -0
- {snplib-1.0.10/src/snplib.egg-info → snplib-1.2.10}/PKG-INFO +7 -7
- {snplib-1.0.10 → snplib-1.2.10}/docs/conf.py +1 -1
- {snplib-1.0.10 → snplib-1.2.10}/docs/examples.rst +130 -11
- snplib-1.2.10/docs/requirements.txt +2 -0
- {snplib-1.0.10 → snplib-1.2.10}/pyproject.toml +7 -7
- snplib-1.2.10/requirements.txt +10 -0
- snplib-1.2.10/src/snplib/finalreport/_finalreport.py +304 -0
- {snplib-1.0.10 → snplib-1.2.10/src/snplib.egg-info}/PKG-INFO +7 -7
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib.egg-info/SOURCES.txt +4 -0
- snplib-1.2.10/src/snplib.egg-info/requires.txt +8 -0
- snplib-1.2.10/tests/finalreport/files/fr/file8.txt +28 -0
- snplib-1.2.10/tests/finalreport/files/fr/file8.xlsx +0 -0
- snplib-1.2.10/tests/finalreport/files/fr/file9.txt +28 -0
- snplib-1.2.10/tests/finalreport/files/fr/file9.xlsx +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/test_finalreport.py +138 -47
- snplib-1.0.10/docs/requirements.txt +0 -2
- snplib-1.0.10/requirements.txt +0 -12
- snplib-1.0.10/src/snplib/finalreport/_finalreport.py +0 -251
- snplib-1.0.10/src/snplib.egg-info/requires.txt +0 -8
- {snplib-1.0.10 → snplib-1.2.10}/.github/workflows/linux.yml +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/.github/workflows/macos.yml +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/.github/workflows/windows.yml +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/.readthedocs.yaml +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/LICENSE +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/README.md +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/Makefile +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/index.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/install.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/intro.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/logo.png +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/make.bat +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/modules.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/snplib.finalreport.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/snplib.format.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/snplib.parentage.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/snplib.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/snplib.statistics.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/docs/usage.rst +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/iconlib.png +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/setup.cfg +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/finalreport/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/format/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/format/__settings.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/format/_plink.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/format/_snp.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/parentage/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/parentage/_discov.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/parentage/_isagmark.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/parentage/_verif.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/parentage/isag_disc.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/parentage/isag_verif.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/statistics/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/statistics/_callrate.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/statistics/_freq.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib/statistics/_snphwe.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib.egg-info/dependency_links.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/src/snplib.egg-info/top_level.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file1.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file1.xlsx +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file2.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file2.xlsx +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file3.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file3.xlsx +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file4.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file5.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file5.xlsx +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file6.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file6.xlsx +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file7.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/finalreport/files/fr/file7.xlsx +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/fam/file.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/fam/file2.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/fam/file3.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/fam/file4.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/lgen/file.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/map/file_bovinesnp50.csv +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/ped/file.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/ped/file2.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/ped/file3.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fplink/ped/file4.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fsnp/file1.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/files/fsnp/file2.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/test_plink_fam.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/test_plink_lgen.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/test_plink_map.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/test_plink_ped.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/format/test_snp.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/parentage/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/parentage/data/parentage_test_disc.csv +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/parentage/data/parentage_test_verf.csv +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/parentage/test_discov.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/parentage/test_verif.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/__init__.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/data/cr/file_cra.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/data/cr/file_crm.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/data/freq/etalon.txt +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/data/freq/file.pl +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/test_callrate.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/test_freq_allele.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/test_freq_maf.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/test_hwe_t.py +0 -0
- {snplib-1.0.10 → snplib-1.2.10}/tests/statistics/test_snphwe.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: snplib
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.2.10
|
4
4
|
Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
|
5
5
|
Author-email: Igor <igor.loschinin@gmail.com>
|
6
6
|
License: GNU
|
@@ -10,14 +10,14 @@ Classifier: Operating System :: OS Independent
|
|
10
10
|
Requires-Python: >=3.10
|
11
11
|
Description-Content-Type: text/markdown
|
12
12
|
License-File: LICENSE
|
13
|
-
Requires-Dist: numpy>=
|
14
|
-
Requires-Dist: pandas>=2.
|
15
|
-
Requires-Dist: six>=1.
|
13
|
+
Requires-Dist: numpy>=2.2.3
|
14
|
+
Requires-Dist: pandas>=2.2.3
|
15
|
+
Requires-Dist: six>=1.17.0
|
16
16
|
Requires-Dist: swifter>=1.4.0
|
17
17
|
Requires-Dist: xlrd>=2.0.1
|
18
|
-
Requires-Dist: XlsxWriter>=3.
|
19
|
-
Requires-Dist: openpyxl>=3.1.
|
20
|
-
Requires-Dist: pydantic>=2.
|
18
|
+
Requires-Dist: XlsxWriter>=3.2.2
|
19
|
+
Requires-Dist: openpyxl>=3.1.5
|
20
|
+
Requires-Dist: pydantic>=2.10.6
|
21
21
|
|
22
22
|
# snptools
|
23
23
|
<p align="center">
|
@@ -73,6 +73,31 @@ Output::
|
|
73
73
|
|
74
74
|
...
|
75
75
|
|
76
|
+
To handle large files, use `usecols` and `dtype`. This reduces memory
|
77
|
+
consumption and speeds up processing.
|
78
|
+
|
79
|
+
.. note::
|
80
|
+
`usecols` is used when `allele` is **None**.
|
81
|
+
|
82
|
+
.. code-block:: python
|
83
|
+
|
84
|
+
alleles_ab = FinalReport(
|
85
|
+
usecols=['SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB'],
|
86
|
+
dtype={'SNP Name': 'category'}
|
87
|
+
)
|
88
|
+
alleles_ab.handle("path/to/finalreport.txt")
|
89
|
+
data_ab = alleles_ab.snp_data
|
90
|
+
|
91
|
+
Output::
|
92
|
+
|
93
|
+
SNP Name Sample ID Allele1 - AB Allele2 - AB
|
94
|
+
ARS-BFGL-BAC-10172 HO840M003135245650 B B
|
95
|
+
ARS-BFGL-BAC-1020 HO840M003135245650 B B
|
96
|
+
ARS-BFGL-BAC-10245 HO840M003135245650 B B
|
97
|
+
ARS-BFGL-BAC-10345 HO840M003135245650 A B
|
98
|
+
ARS-BFGL-BAC-10375 HO840M003135245650 A B
|
99
|
+
...
|
100
|
+
|
76
101
|
Preparation SNP files
|
77
102
|
---------------------
|
78
103
|
|
@@ -90,7 +115,19 @@ plink - GBLUP, ssGBLUP, GWAS.
|
|
90
115
|
blupf90 format
|
91
116
|
______________
|
92
117
|
The input data for obtaining the ``snp.txt`` file used for the genomic
|
93
|
-
blupf90 evaluation is the data file - processed file ``finalreport.txt
|
118
|
+
blupf90 evaluation is the data file - processed file ``finalreport.txt``.
|
119
|
+
The processed file can be seen in the item above - Finalreport.txt processing:
|
120
|
+
|
121
|
+
Content input *file.txt*::
|
122
|
+
|
123
|
+
SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score X Y
|
124
|
+
ARS-BFGL-BAC-10172 HO840M003135245650 B B 0.9420 0.069 0.801
|
125
|
+
ARS-BFGL-BAC-1020 HO840M003135245650 B B 0.9489 0.033 0.700
|
126
|
+
ARS-BFGL-BAC-10245 HO840M003135245650 B B 0.7277 0.152 1.504
|
127
|
+
ARS-BFGL-BAC-10345 HO840M003135245650 A B 0.9411 0.598 0.572
|
128
|
+
ARS-BFGL-BAC-10375 HO840M003135245650 A B 0.9348 0.430 0.494
|
129
|
+
|
130
|
+
...
|
94
131
|
|
95
132
|
**uga**
|
96
133
|
|
@@ -99,7 +136,7 @@ blupf90 evaluation is the data file - processed file ``finalreport.txt``
|
|
99
136
|
import pandas as pd
|
100
137
|
from snplib.format import Snp
|
101
138
|
|
102
|
-
data_finalreport = pd.read_csv("file.txt", sep="\t")
|
139
|
+
data_finalreport = pd.read_csv("path_to_file/file.txt", sep="\t")
|
103
140
|
|
104
141
|
obj_snp = Snp(fmt="uga")
|
105
142
|
obj_snp.process(data_finalreport)
|
@@ -111,7 +148,7 @@ Data after snp processing in ``uga`` (blupf90) format - obj_snp.data::
|
|
111
148
|
0 14814 02011015010000500
|
112
149
|
1 14815 01110152120222512
|
113
150
|
|
114
|
-
Default result
|
151
|
+
Default result - this is what the data looks like if ``fmt=None``::
|
115
152
|
|
116
153
|
SNP_NAME SAMPLE_ID SNP
|
117
154
|
0 ABCA12 14814 0
|
@@ -130,16 +167,104 @@ ____________
|
|
130
167
|
|
131
168
|
This page describes specialized PLINK input and output file formats which are
|
132
169
|
identifiable by file extension. https://www.cog-genomics.org/plink/1.9/formats
|
133
|
-
|
170
|
+
Common formats for performing GWAS analysis - ``ped``, ``map``, ``fam``, ``lgen``....
|
134
171
|
|
135
172
|
**map** - https://www.cog-genomics.org/plink/1.9/formats#map
|
136
173
|
|
174
|
+
To get the ``.map`` file, first you need to download the *manifest file* for the chip
|
175
|
+
you are using.
|
176
|
+
|
177
|
+
.. note::
|
178
|
+
*file_bovinesnp50.csv* - The file downloaded from the Illumina website with full
|
179
|
+
information about the chip https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html
|
180
|
+
|
181
|
+
Since the make_map function accepts **pd.DataFrame**, the *manifest file* processing is performed
|
182
|
+
independently.
|
183
|
+
|
184
|
+
Input data for make_map::
|
185
|
+
|
186
|
+
IlmnID ... BeadSetID
|
187
|
+
0 BovineHD0100037694-128_T_F_2278925834 ... 1241
|
188
|
+
1 BovineHD0100037699_dup-128_T_F_2327674593 ... 1241
|
189
|
+
2 BovineHD0100037703_dup-128_B_R_2327674602 ... 1241
|
190
|
+
3 BovineHD0100037704_dup-128_T_F_2327674603 ... 1241
|
191
|
+
4 BovineHD0100037710_dup-128_T_F_2327674613 ... 1241
|
192
|
+
5 BovineHD0100037712_dup-128_B_R_2327674618 ... 1241
|
193
|
+
6 BovineHD0100037716-128_T_F_2255347065 ... 1241
|
194
|
+
7 BovineHD0100037719-128_T_F_2278926219 ... 1241
|
195
|
+
8 BovineHD0100037720-128_B_R_2255342455 ... 1241
|
196
|
+
9 BovineHD0100037722_dup-128_B_R_2327674634 ... 1241
|
197
|
+
|
198
|
+
|
199
|
+
.. note::
|
200
|
+
The original file, for example, **BovineSNP50_v3_A1.csv** looks like this::
|
201
|
+
|
202
|
+
Illumina, Inc.,,,,,,,,,,,,,,,,,
|
203
|
+
[Heading],,,,,,,,,,,,,,,,,,
|
204
|
+
Descriptor File Name,BovineSNP50_v3_A1.bpm,,,,,,,,,,,,,,,,,
|
205
|
+
Assay Format,Infinium HTS,,,,,,,,,,,,,,,,,
|
206
|
+
Date Manufactured,1/14/2016,,,,,,,,,,,,,,,,,
|
207
|
+
Loci Count ,53218,,,,,,,,,,,,,,,,,
|
208
|
+
[Assay],,,,,,,,,,,,,,,,,,
|
209
|
+
IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source,SourceVersion,SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID
|
210
|
+
ABCA12_r2-1_T_F_2277749139,ABCA12,TOP,[A/G],0059616496,CTTGTCTTCTTTTGGAATGTTACAGGTATGGTATGATCCAGAAGGCTATC,,,0,2,103548215,diploid,Bos taurus,UMD3.1,1,TOP,ACTCTGGTGGATGGTTCATAATCTGCTAAGATGAATAAGTTACTGGGGAAACTGGTGCATTTATTTTAAATATAAATTATATAGTCTGTAAGATATAAAGACTGCCTAATTTATTTGAACACCATACTGATCTTGTCTTCTTTTGGAATGTTACAGGTATGGTATGATCCAGAAGGCTATC[A/G]CTCCCTTCCAGCTTACCTCAACAGCCTGAATAATTTCCTCCTGCGAGTTAACATGTCAAAATATGATGCTGCCCGACATGGTAAAGTTATTTACATAGGAGCTCCTTGTATTGAAACTCTTGCTACTCTCCATGTGAAAATATACATTAGACCCCATTTTCCTCCCTGTGGCAGCTAT,ACTCTGGTGGATGGTTCATAATCTGCTAAGATGAATAAGTTACTGGGGAAACTGGTGCATTTATTTTAAATATAAATTATATAGTCTGTAAGATATAAAGACTGCCTAATTTATTTGAACACCATACTGATCTTGTCTTCTTTTGGAATGTTACAGGTATGGTATGATCCAGAAGGCTATC[A/G]CTCCCTTCCAGCTTACCTCAACAGCCTGAATAATTTCCTCCTGCGAGTTAACATGTCAAAATATGATGCTGCCCGACATGGTAAAGTTATTTACATAGGAGCTCCTTGTATTGAAACTCTTGCTACTCTCCATGTGAAAATATACATTAGACCCCATTTTCCTCCCTGTGGCAGCTAT,1241
|
211
|
+
APAF1_dup-1_B_F_2327661418,APAF1,BOT,[T/C],0041654401,ATATTGTGCAACTGGGCCTCTGTGAACTGGAAACTTCAGAGGTTTATCGG,,,0,5,63150400,diploid,Bos taurus,UMD3.1,1,BOT,CCATTTCCTAATATTGTGCAACTGGGCCTCTGTGAACTGGAAACTTCAGAGGTTTATCGG[T/C]AAGCTAAGCTGCAGGCCAAGCAGGAGGTCGATAACGGAATGCTTTACCTGGAGTGGGTGT,ACACCCACTCCAGGTAAAGCATTCCGTTATCGACCTCCTGCTTGGCCTGCAGCTTAGCTT[A/G]CCGATAAACCTCTGAAGTTTCCAGTTCACAGAGGCCCAGTTGCACAATATTAGGAAATGG,1241
|
212
|
+
ARS-BFGL-BAC-10172_dup-0_T_F_2328966397,ARS-BFGL-BAC-10172,TOP,[A/G],0072620471,GGTCCCCAAAGTATGTGGTAGCACTTACTTATGTAAGTCATCACTCAAGT,,,3,14,6371334,diploid,Bos taurus,UM3,0,TOP,CTCAGAAGTTGGTCCCCAAAGTATGTGGTAGCACTTACTTATGTAAGTCATCACTCAAGT[A/G]ATCCAGAATATTCTTTTAGTAATATTTTTGTTAATATTGAAATTTTTAAAACAATTGAAA,CTCAGAAGTTGGTCCCCAAAGTATGTGGTAGCACTTACTTATGTAAGTCATCACTCAAGT[A/G]ATCCAGAATATTCTTTTAGTAATATTTTTGTTAATATTGAAATTTTTAAAACAATTGAAA,1241
|
213
|
+
.
|
214
|
+
.
|
215
|
+
.
|
216
|
+
UA-IFASA-9812_dup-0_B_F_2329051536,UA-IFASA-9812,BOT,[T/C],0031677304,ACCTCCATAGCTGATAGGAATGGTCTCAACTTGCAGCCCCATTATACTAA,,,3,29,48012818,diploid,Bos taurus,UM3,0,BOT,GTAAAAACAAACCTCCATAGCTGATAGGAATGGTCTCAACTTGCAGCCCCATTATACTAA[T/C]GATGATCTGAAGTTTCTCAAGCACGCAGAGAAACGTAAGAGAAACGTTCCAGCAAAGGGA,TCCCTTTGCTGGAACGTTTCTCTTACGTTTCTCTGCGTGCTTGAGAAACTTCAGATCATC[A/G]TTAGTATAATGGGGCTGCAAGTTGAGACCATTCCTATCAGCTATGGAGGTTTGTTTTTAC,1241
|
217
|
+
UA-IFASA-9813_dup-0_B_F_2329051538,UA-IFASA-9813,BOT,[T/C],0011661313,ACCTTTGCACTCGCTAACGGTTCAGCATTAATCAGACTTCCTCAGGAATT,,,3,19,32508700,diploid,Bos taurus,UM3,0,BOT,AATAAAACCAACCTTTGCACTCGCTAACGGTTCAGCATTAATCAGACTTCCTCAGGAATT[T/C]AGGGGTCAATTCCCCCATGTCTAAAATTGAACCTCAACGTCCTTTCTGTTTTCAAAACTC,GAGTTTTGAAAACAGAAAGGACGTTGAGGTTCAATTTTAGACATGGGGGAATTGACCCCT[A/G]AATTCCTGAGGAAGTCTGATTAATGCTGAACCGTTAGCGAGTGCAAAGGTTGGTTTTATT,1241
|
218
|
+
UMPS_dup-1_T_R_2327737250,UMPS,TOP,[A/G],0073777348,TAACTGAACTCCTGGAGTCAAGTGAAGAAATTCTGGTTTCATGCTTACTC,,,0,1,69756880,diploid,Bos taurus,UMD3.1,1,BOT,TCATCTGTTGATTACATTCCATTCAGGTGCAAATGGCTGAAGAACATTCTGAATTTGTGATTGGTTTTATTTCTGGCTCC[T/C]GAGTAAGCATGAAACCAGAATTTCTTCACTTGACTCCAGGAGTTCAGTTAGAAGCAGGAGGTAAGCCTATTGATTGGTAA,TTACCAATCAATAGGCTTACCTCCTGCTTCTAACTGAACTCCTGGAGTCAAGTGAAGAAATTCTGGTTTCATGCTTACTC[A/G]GGAGCCAGAAATAAAACCAATCACAAATTCAGAATGTTCTTCAGCCATTTGCACCTGAATGGAATGTAATCAACAGATGA,1241
|
219
|
+
[Controls],,,,,,,,,,,,,,,,,,
|
220
|
+
0027630314,Staining,Red,DNP (High),,,,,,,,,,,,,,,
|
221
|
+
0029619375,Staining,Purple,DNP (Bgnd),,,,,,,,,,,,,,,
|
222
|
+
0041666334,Staining,Green,Biotin (High),,,,,,,,,,,,,,,
|
223
|
+
0034648333,Staining,Blue,Biotin (Bgnd),,,,,,,,,,,,,,,
|
224
|
+
0017616306,Extension,Red,Extension (A),,,,,,,,,,,,,,,
|
225
|
+
0014607337,Extension,Purple,Extension (T),,,,,,,,,,,,,,,
|
226
|
+
|
227
|
+
Therefore, for direct reading via **pd.read_csv()** it is necessary to
|
228
|
+
preprocess the file - delete extra lines::
|
229
|
+
|
230
|
+
Illumina, Inc.,,,,,,,,,,,,,,,,,
|
231
|
+
[Heading],,,,,,,,,,,,,,,,,,
|
232
|
+
Descriptor File Name,BovineSNP50_v3_A1.bpm,,,,,,,,,,,,,,,,,
|
233
|
+
Assay Format,Infinium HTS,,,,,,,,,,,,,,,,,
|
234
|
+
Date Manufactured,1/14/2016,,,,,,,,,,,,,,,,,
|
235
|
+
Loci Count ,53218,,,,,,,,,,,,,,,,,
|
236
|
+
[Assay],,,,,,,,,,,,,,,,,,
|
237
|
+
|
238
|
+
and
|
239
|
+
|
240
|
+
[Controls],,,,,,,,,,,,,,,,,,
|
241
|
+
0027630314,Staining,Red,DNP (High),,,,,,,,,,,,,,,
|
242
|
+
0029619375,Staining,Purple,DNP (Bgnd),,,,,,,,,,,,,,,
|
243
|
+
0041666334,Staining,Green,Biotin (High),,,,,,,,,,,,,,,
|
244
|
+
0034648333,Staining,Blue,Biotin (Bgnd),,,,,,,,,,,,,,,
|
245
|
+
0017616306,Extension,Red,Extension (A),,,,,,,,,,,,,,,
|
246
|
+
0014607337,Extension,Purple,Extension (T),,,,,,,,,,,,,,,
|
247
|
+
|
248
|
+
The file should end up looking like this::
|
249
|
+
|
250
|
+
IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source,SourceVersion,SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID
|
251
|
+
ABCA12_r2-1_T_F_2277749139,ABCA12,TOP,[A/G],0059616496,CTTGTCTTCTTTTGGAATGTTACAGGTATGGTATGATCCAGAAGGCTATC,,,0,2,103548215,diploid,Bos taurus,UMD3.1,1,TOP,ACTCTGGTGGATGGTTCATAATCTGCTAAGATGAATAAGTTACTGGGGAAACTGGTGCATTTATTTTAAATATAAATTATATAGTCTGTAAGATATAAAGACTGCCTAATTTATTTGAACACCATACTGATCTTGTCTTCTTTTGGAATGTTACAGGTATGGTATGATCCAGAAGGCTATC[A/G]CTCCCTTCCAGCTTACCTCAACAGCCTGAATAATTTCCTCCTGCGAGTTAACATGTCAAAATATGATGCTGCCCGACATGGTAAAGTTATTTACATAGGAGCTCCTTGTATTGAAACTCTTGCTACTCTCCATGTGAAAATATACATTAGACCCCATTTTCCTCCCTGTGGCAGCTAT,ACTCTGGTGGATGGTTCATAATCTGCTAAGATGAATAAGTTACTGGGGAAACTGGTGCATTTATTTTAAATATAAATTATATAGTCTGTAAGATATAAAGACTGCCTAATTTATTTGAACACCATACTGATCTTGTCTTCTTTTGGAATGTTACAGGTATGGTATGATCCAGAAGGCTATC[A/G]CTCCCTTCCAGCTTACCTCAACAGCCTGAATAATTTCCTCCTGCGAGTTAACATGTCAAAATATGATGCTGCCCGACATGGTAAAGTTATTTACATAGGAGCTCCTTGTATTGAAACTCTTGCTACTCTCCATGTGAAAATATACATTAGACCCCATTTTCCTCCCTGTGGCAGCTAT,1241
|
252
|
+
APAF1_dup-1_B_F_2327661418,APAF1,BOT,[T/C],0041654401,ATATTGTGCAACTGGGCCTCTGTGAACTGGAAACTTCAGAGGTTTATCGG,,,0,5,63150400,diploid,Bos taurus,UMD3.1,1,BOT,CCATTTCCTAATATTGTGCAACTGGGCCTCTGTGAACTGGAAACTTCAGAGGTTTATCGG[T/C]AAGCTAAGCTGCAGGCCAAGCAGGAGGTCGATAACGGAATGCTTTACCTGGAGTGGGTGT,ACACCCACTCCAGGTAAAGCATTCCGTTATCGACCTCCTGCTTGGCCTGCAGCTTAGCTT[A/G]CCGATAAACCTCTGAAGTTTCCAGTTCACAGAGGCCCAGTTGCACAATATTAGGAAATGG,1241
|
253
|
+
ARS-BFGL-BAC-10172_dup-0_T_F_2328966397,ARS-BFGL-BAC-10172,TOP,[A/G],0072620471,GGTCCCCAAAGTATGTGGTAGCACTTACTTATGTAAGTCATCACTCAAGT,,,3,14,6371334,diploid,Bos taurus,UM3,0,TOP,CTCAGAAGTTGGTCCCCAAAGTATGTGGTAGCACTTACTTATGTAAGTCATCACTCAAGT[A/G]ATCCAGAATATTCTTTTAGTAATATTTTTGTTAATATTGAAATTTTTAAAACAATTGAAA,CTCAGAAGTTGGTCCCCAAAGTATGTGGTAGCACTTACTTATGTAAGTCATCACTCAAGT[A/G]ATCCAGAATATTCTTTTAGTAATATTTTTGTTAATATTGAAATTTTTAAAACAATTGAAA,1241
|
254
|
+
.
|
255
|
+
.
|
256
|
+
.
|
257
|
+
UA-IFASA-9812_dup-0_B_F_2329051536,UA-IFASA-9812,BOT,[T/C],0031677304,ACCTCCATAGCTGATAGGAATGGTCTCAACTTGCAGCCCCATTATACTAA,,,3,29,48012818,diploid,Bos taurus,UM3,0,BOT,GTAAAAACAAACCTCCATAGCTGATAGGAATGGTCTCAACTTGCAGCCCCATTATACTAA[T/C]GATGATCTGAAGTTTCTCAAGCACGCAGAGAAACGTAAGAGAAACGTTCCAGCAAAGGGA,TCCCTTTGCTGGAACGTTTCTCTTACGTTTCTCTGCGTGCTTGAGAAACTTCAGATCATC[A/G]TTAGTATAATGGGGCTGCAAGTTGAGACCATTCCTATCAGCTATGGAGGTTTGTTTTTAC,1241
|
258
|
+
UA-IFASA-9813_dup-0_B_F_2329051538,UA-IFASA-9813,BOT,[T/C],0011661313,ACCTTTGCACTCGCTAACGGTTCAGCATTAATCAGACTTCCTCAGGAATT,,,3,19,32508700,diploid,Bos taurus,UM3,0,BOT,AATAAAACCAACCTTTGCACTCGCTAACGGTTCAGCATTAATCAGACTTCCTCAGGAATT[T/C]AGGGGTCAATTCCCCCATGTCTAAAATTGAACCTCAACGTCCTTTCTGTTTTCAAAACTC,GAGTTTTGAAAACAGAAAGGACGTTGAGGTTCAATTTTAGACATGGGGGAATTGACCCCT[A/G]AATTCCTGAGGAAGTCTGATTAATGCTGAACCGTTAGCGAGTGCAAAGGTTGGTTTTATT,1241
|
259
|
+
UMPS_dup-1_T_R_2327737250,UMPS,TOP,[A/G],0073777348,TAACTGAACTCCTGGAGTCAAGTGAAGAAATTCTGGTTTCATGCTTACTC,,,0,1,69756880,diploid,Bos taurus,UMD3.1,1,BOT,TCATCTGTTGATTACATTCCATTCAGGTGCAAATGGCTGAAGAACATTCTGAATTTGTGATTGGTTTTATTTCTGGCTCC[T/C]GAGTAAGCATGAAACCAGAATTTCTTCACTTGACTCCAGGAGTTCAGTTAGAAGCAGGAGGTAAGCCTATTGATTGGTAA,TTACCAATCAATAGGCTTACCTCCTGCTTCTAACTGAACTCCTGGAGTCAAGTGAAGAAATTCTGGTTTCATGCTTACTC[A/G]GGAGCCAGAAATAAAACCAATCACAAATTCAGAATGTTCTTCAGCCATTTGCACCTGAATGGAATGTAATCAACAGATGA,1241
|
260
|
+
|
261
|
+
|
137
262
|
.. code-block:: python
|
138
263
|
|
139
264
|
import pandas as pd
|
140
265
|
from snplib.format import make_map
|
141
266
|
|
142
|
-
input_data = pd.read_csv(
|
267
|
+
input_data = pd.read_csv("./file_bovinesnp50.csv")
|
143
268
|
data_map = make_map(input_data)
|
144
269
|
|
145
270
|
Output data view::
|
@@ -150,12 +275,6 @@ Output data view::
|
|
150
275
|
0 BovineHD0100037703 0 0
|
151
276
|
0 BovineHD0100037704 0 0
|
152
277
|
|
153
|
-
.. note::
|
154
|
-
file_bovinesnp50.csv - The file that is taken on the Illumina website with full
|
155
|
-
information about the chip
|
156
|
-
https://support.illumina.com/downloads/bovinesnp50-v3-0-product-files.html
|
157
|
-
|
158
|
-
|
159
278
|
**ped** - https://www.cog-genomics.org/plink/1.9/formats#ped
|
160
279
|
|
161
280
|
.. code-block:: python
|
@@ -17,7 +17,7 @@ snplib = ["*.pl"]
|
|
17
17
|
|
18
18
|
[project]
|
19
19
|
name = "snplib"
|
20
|
-
version = "1.
|
20
|
+
version = "1.2.10"
|
21
21
|
description = "Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing"
|
22
22
|
authors = [
|
23
23
|
{name = "Igor", email = "igor.loschinin@gmail.com"}
|
@@ -30,14 +30,14 @@ classifiers = [
|
|
30
30
|
"Operating System :: OS Independent",
|
31
31
|
]
|
32
32
|
dependencies = [
|
33
|
-
"numpy>=
|
34
|
-
"pandas>=2.
|
35
|
-
"six>=1.
|
33
|
+
"numpy>=2.2.3",
|
34
|
+
"pandas>=2.2.3",
|
35
|
+
"six>=1.17.0",
|
36
36
|
"swifter>=1.4.0",
|
37
37
|
"xlrd>=2.0.1",
|
38
|
-
"XlsxWriter>=3.
|
39
|
-
"openpyxl>=3.1.
|
40
|
-
"pydantic>=2.
|
38
|
+
"XlsxWriter>=3.2.2",
|
39
|
+
"openpyxl>=3.1.5",
|
40
|
+
"pydantic>=2.10.6",
|
41
41
|
]
|
42
42
|
|
43
43
|
[project.urls]
|
@@ -0,0 +1,304 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# coding: utf-8
|
3
|
+
__author__ = "Igor Loschinin (igor.loschinin@gmail.com)"
|
4
|
+
__all__ = ("FinalReport",)
|
5
|
+
|
6
|
+
import re
|
7
|
+
from functools import reduce
|
8
|
+
from pathlib import Path
|
9
|
+
|
10
|
+
import pandas as pd
|
11
|
+
from numpy import nan
|
12
|
+
|
13
|
+
|
14
|
+
class FinalReport(object):
|
15
|
+
""" File that contains SNP information. File processing is triggered by the
|
16
|
+
handle method. If values in 'SID' or 'UNIQ_KEY' were missing in the xlsx
|
17
|
+
conversion file, the processed data will contain NAN values.
|
18
|
+
|
19
|
+
:param allele: A variant form of a single nucleotide polymorphism (SNP), a
|
20
|
+
specific polymorphic site or a whole gene detectable at a locus. Type:
|
21
|
+
'AB', 'Forward', 'Top', 'Plus', 'Design'.
|
22
|
+
:param sep: Delimiter to use. Default value: "\\t".
|
23
|
+
:param usecols: Selection of fields for reading. Accelerates processing
|
24
|
+
and reduces memory.
|
25
|
+
:param dtype: Data type(s) to apply to either the whole dataset or
|
26
|
+
individual columns. E.g., {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
|
27
|
+
|
28
|
+
Example:
|
29
|
+
[Header]
|
30
|
+
GSGT Version 2.0.4
|
31
|
+
Processing Date 10/14/2021 4:02 PM
|
32
|
+
Content BovineSNP50_v3_A1.bpm
|
33
|
+
Num SNPs 53218
|
34
|
+
Total SNPs 53218
|
35
|
+
Num Samples 3
|
36
|
+
Total Samples 3
|
37
|
+
[Data]
|
38
|
+
SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
|
39
|
+
ABCA12 1 A A 0.4048 0.8164
|
40
|
+
APAF1 1 B B 0.9067 0.9155
|
41
|
+
...
|
42
|
+
"""
|
43
|
+
|
44
|
+
__PATTERN_HEADER = re.compile(r'(^\[Header])')
|
45
|
+
__PATTERN_DATA = re.compile(r'(^\[Data])')
|
46
|
+
|
47
|
+
__slots__ = (
|
48
|
+
"_delimiter",
|
49
|
+
"__allele",
|
50
|
+
"__usecols",
|
51
|
+
"__dtype",
|
52
|
+
"__snp_data",
|
53
|
+
"__header",
|
54
|
+
"_map_rn",
|
55
|
+
)
|
56
|
+
|
57
|
+
def __init__(
|
58
|
+
self,
|
59
|
+
allele: str | list | None = None,
|
60
|
+
usecols: list[str] | None = None,
|
61
|
+
dtype: dict | None = None,
|
62
|
+
sep: str = "\t"
|
63
|
+
) -> None:
|
64
|
+
self._delimiter = sep
|
65
|
+
self.__allele = allele
|
66
|
+
self.__usecols = usecols
|
67
|
+
self.__dtype = dtype
|
68
|
+
|
69
|
+
# self._full_data = None
|
70
|
+
self.__snp_data: pd.DataFrame | None = None
|
71
|
+
self.__header = {}
|
72
|
+
self._map_rn = None
|
73
|
+
|
74
|
+
@property
|
75
|
+
def header(self) -> dict:
|
76
|
+
return self.__header
|
77
|
+
|
78
|
+
@property
|
79
|
+
def snp_data(self) -> pd.DataFrame | None:
|
80
|
+
return self.__snp_data
|
81
|
+
|
82
|
+
def handle(
|
83
|
+
self, file_rep: Path | str, conv_file: Path | str = None
|
84
|
+
) -> bool:
|
85
|
+
""" Processes the FinalReport.txt file. Highlights meta information
|
86
|
+
and data.
|
87
|
+
|
88
|
+
:param file_rep: The file FinalReport.txt or another name.
|
89
|
+
:param conv_file: The file that contains IDs of registration numbers
|
90
|
+
of animals.
|
91
|
+
:return: Returns true if file processing was successful, false if
|
92
|
+
there were errors.
|
93
|
+
"""
|
94
|
+
|
95
|
+
try:
|
96
|
+
|
97
|
+
if self.__allele is not None and self.__usecols is not None:
|
98
|
+
raise Exception("Error. Usecols is used for allele is none.")
|
99
|
+
|
100
|
+
if isinstance(file_rep, str):
|
101
|
+
file_rep = Path(file_rep)
|
102
|
+
|
103
|
+
if not file_rep.is_file() and not file_rep.exists():
|
104
|
+
return False
|
105
|
+
|
106
|
+
# Processing conversion file
|
107
|
+
if conv_file is not None:
|
108
|
+
if isinstance(conv_file, str):
|
109
|
+
conv_file = Path(conv_file)
|
110
|
+
|
111
|
+
if not conv_file.is_file() and not conv_file.exists():
|
112
|
+
return False
|
113
|
+
|
114
|
+
self.__convert_s_id(conv_file)
|
115
|
+
|
116
|
+
# # Processing report file
|
117
|
+
self.__handler_header(file_rep)
|
118
|
+
self.__handler_data(file_rep)
|
119
|
+
|
120
|
+
if not self.__snp_data.empty and self._map_rn is not None:
|
121
|
+
self.__snp_data['Sample ID'] = \
|
122
|
+
self.__snp_data['Sample ID'].map(
|
123
|
+
dict(zip(self._map_rn.SID, self._map_rn.UNIQ_KEY))
|
124
|
+
)
|
125
|
+
|
126
|
+
except Exception as e:
|
127
|
+
raise e
|
128
|
+
|
129
|
+
return True
|
130
|
+
|
131
|
+
def __handler_header(self, file_rep: Path) -> None:
|
132
|
+
""" Processes data from a file, selects meta-information.
|
133
|
+
|
134
|
+
:param file_rep: path, pointer to the file to be read.
|
135
|
+
"""
|
136
|
+
|
137
|
+
with open(file_rep, 'r') as file:
|
138
|
+
|
139
|
+
for line in file:
|
140
|
+
if self.__class__.__PATTERN_DATA.findall(line.strip()):
|
141
|
+
return
|
142
|
+
|
143
|
+
if self.__class__.__PATTERN_HEADER.findall(line.strip()) or\
|
144
|
+
len(line.strip()) == 0:
|
145
|
+
continue
|
146
|
+
|
147
|
+
key = line.strip().split("\t")[0]
|
148
|
+
value = line.strip().split("\t")[1]
|
149
|
+
|
150
|
+
self.__header[key] = value
|
151
|
+
|
152
|
+
def __handler_data(self, file_rep: Path) -> None:
|
153
|
+
""" Processes data and forms an array for further processing.
|
154
|
+
|
155
|
+
:param file_rep: path, pointer to the file to be read.
|
156
|
+
"""
|
157
|
+
|
158
|
+
with open(file_rep, 'r') as file:
|
159
|
+
|
160
|
+
# Search for the data start index and skip
|
161
|
+
for line in file:
|
162
|
+
if self.__class__.__PATTERN_DATA.findall(line.strip()):
|
163
|
+
break
|
164
|
+
|
165
|
+
# line column
|
166
|
+
orig_name_col = file.readline().strip().split(self._delimiter)
|
167
|
+
|
168
|
+
if self.__allele is None and self.__usecols is None:
|
169
|
+
self.__snp_data = pd.read_csv(
|
170
|
+
file,
|
171
|
+
sep=self._delimiter,
|
172
|
+
header=None,
|
173
|
+
names=orig_name_col,
|
174
|
+
dtype=self.__dtype,
|
175
|
+
low_memory=True,
|
176
|
+
na_filter=True
|
177
|
+
)
|
178
|
+
|
179
|
+
return
|
180
|
+
|
181
|
+
sub_n_col = self.__processing_columns(orig_name_col)
|
182
|
+
self.__snp_data = pd.read_csv(
|
183
|
+
file,
|
184
|
+
sep=self._delimiter,
|
185
|
+
header=None,
|
186
|
+
names=orig_name_col,
|
187
|
+
usecols=sub_n_col,
|
188
|
+
dtype=self.__dtype,
|
189
|
+
low_memory=True,
|
190
|
+
na_filter=True
|
191
|
+
)
|
192
|
+
|
193
|
+
return
|
194
|
+
|
195
|
+
def __processing_columns(self, lst_col: list[str]) -> list[str] | None:
|
196
|
+
""" Processing the line with all the names of the fields and the
|
197
|
+
sample of them.
|
198
|
+
|
199
|
+
:param lst_col: List of all fields.
|
200
|
+
:return: Returns a tuple with a list of names of selected fields.
|
201
|
+
"""
|
202
|
+
|
203
|
+
if self.__usecols is not None:
|
204
|
+
check_n_col = [
|
205
|
+
item for item in self.__usecols if item in lst_col
|
206
|
+
]
|
207
|
+
|
208
|
+
# Check on empty list
|
209
|
+
if check_n_col:
|
210
|
+
return self.__usecols
|
211
|
+
|
212
|
+
raise Exception(
|
213
|
+
f"Error. The USECOLS list contains not true fields."
|
214
|
+
)
|
215
|
+
|
216
|
+
# processing alleles
|
217
|
+
sample_n_col = self.__sample_by_allele(lst_col)
|
218
|
+
if sample_n_col is None:
|
219
|
+
raise Exception(
|
220
|
+
f"Error. Allele {self.__allele} not in data."
|
221
|
+
)
|
222
|
+
|
223
|
+
return sample_n_col
|
224
|
+
|
225
|
+
def __sample_by_allele(self, names: list[str]) -> list[str] | None:
|
226
|
+
""" Method that generates a list of field names choosing which alleles
|
227
|
+
to keep
|
228
|
+
|
229
|
+
:param names: List of field names in the report file.
|
230
|
+
:return: Returns a filtered list of fields by alleles.
|
231
|
+
"""
|
232
|
+
|
233
|
+
allele_templ = r'(^Allele\d\s[:-]\s{}\b)'
|
234
|
+
|
235
|
+
match self.__allele:
|
236
|
+
case None:
|
237
|
+
return names
|
238
|
+
|
239
|
+
case str():
|
240
|
+
allele_pattern = re.compile(
|
241
|
+
allele_templ.format(self.__allele)
|
242
|
+
)
|
243
|
+
|
244
|
+
case list() | tuple() | set():
|
245
|
+
allele_pattern = re.compile(
|
246
|
+
allele_templ.format("|".join(self.__allele))
|
247
|
+
)
|
248
|
+
case _:
|
249
|
+
return None
|
250
|
+
|
251
|
+
lst_allele = reduce(
|
252
|
+
lambda i, j: i + j,
|
253
|
+
[allele_pattern.findall(item) for item in names]
|
254
|
+
)
|
255
|
+
|
256
|
+
if len(lst_allele) == 0:
|
257
|
+
return None
|
258
|
+
|
259
|
+
exclude_alleles = [
|
260
|
+
item for item in names
|
261
|
+
if item.startswith("Allele") and item not in lst_allele
|
262
|
+
]
|
263
|
+
|
264
|
+
return list(filter(
|
265
|
+
lambda x: True if x not in exclude_alleles else False, names
|
266
|
+
))
|
267
|
+
|
268
|
+
def __convert_s_id(self, path_file: Path) -> None:
|
269
|
+
"""Converts sample id which is in FinalReport to animal registration
|
270
|
+
number.
|
271
|
+
|
272
|
+
:param path_file: xlsx file with animal numbers label
|
273
|
+
"""
|
274
|
+
|
275
|
+
self._map_rn = pd.read_excel(
|
276
|
+
path_file,
|
277
|
+
header=None,
|
278
|
+
names=['SID', 'UNIQ_KEY', 'SEX'],
|
279
|
+
dtype={'SID': str},
|
280
|
+
index_col=False
|
281
|
+
)
|
282
|
+
|
283
|
+
if self._map_rn.empty:
|
284
|
+
self._map_rn = None
|
285
|
+
return
|
286
|
+
|
287
|
+
self._map_rn.SID = self._map_rn.SID.str.strip()
|
288
|
+
self._map_rn.UNIQ_KEY = self._map_rn.UNIQ_KEY.str.strip()
|
289
|
+
|
290
|
+
if self._check_on_ru_symbols(self._map_rn.UNIQ_KEY):
|
291
|
+
raise Exception("Error. Unique keys contain Cyrillic alphabet.")
|
292
|
+
|
293
|
+
@staticmethod
|
294
|
+
def _check_on_ru_symbols(seq: pd.Series) -> bool | None:
|
295
|
+
""" Checial verification of the Cyrillic
|
296
|
+
|
297
|
+
:param seq: Squeezed for verification.
|
298
|
+
:return: Truth if there are no symbols of Cyril and there is a lie if
|
299
|
+
there is.
|
300
|
+
"""
|
301
|
+
|
302
|
+
return seq.apply(
|
303
|
+
lambda x: bool(re.search('[а-яА-Я]', x)) if x is not nan else x
|
304
|
+
).any()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: snplib
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.2.10
|
4
4
|
Summary: Snptools is a tool for Single Nucleotide Polymorphism (SNP) data processing
|
5
5
|
Author-email: Igor <igor.loschinin@gmail.com>
|
6
6
|
License: GNU
|
@@ -10,14 +10,14 @@ Classifier: Operating System :: OS Independent
|
|
10
10
|
Requires-Python: >=3.10
|
11
11
|
Description-Content-Type: text/markdown
|
12
12
|
License-File: LICENSE
|
13
|
-
Requires-Dist: numpy>=
|
14
|
-
Requires-Dist: pandas>=2.
|
15
|
-
Requires-Dist: six>=1.
|
13
|
+
Requires-Dist: numpy>=2.2.3
|
14
|
+
Requires-Dist: pandas>=2.2.3
|
15
|
+
Requires-Dist: six>=1.17.0
|
16
16
|
Requires-Dist: swifter>=1.4.0
|
17
17
|
Requires-Dist: xlrd>=2.0.1
|
18
|
-
Requires-Dist: XlsxWriter>=3.
|
19
|
-
Requires-Dist: openpyxl>=3.1.
|
20
|
-
Requires-Dist: pydantic>=2.
|
18
|
+
Requires-Dist: XlsxWriter>=3.2.2
|
19
|
+
Requires-Dist: openpyxl>=3.1.5
|
20
|
+
Requires-Dist: pydantic>=2.10.6
|
21
21
|
|
22
22
|
# snptools
|
23
23
|
<p align="center">
|
@@ -63,6 +63,10 @@ tests/finalreport/files/fr/file6.txt
|
|
63
63
|
tests/finalreport/files/fr/file6.xlsx
|
64
64
|
tests/finalreport/files/fr/file7.txt
|
65
65
|
tests/finalreport/files/fr/file7.xlsx
|
66
|
+
tests/finalreport/files/fr/file8.txt
|
67
|
+
tests/finalreport/files/fr/file8.xlsx
|
68
|
+
tests/finalreport/files/fr/file9.txt
|
69
|
+
tests/finalreport/files/fr/file9.xlsx
|
66
70
|
tests/format/__init__.py
|
67
71
|
tests/format/test_plink_fam.py
|
68
72
|
tests/format/test_plink_lgen.py
|
@@ -0,0 +1,28 @@
|
|
1
|
+
[Header]
|
2
|
+
GSGT Version 2.0.4
|
3
|
+
Processing Date 10/14/2021 4:02 PM
|
4
|
+
Content BovineSNP50_v3_A1.bpm
|
5
|
+
Num SNPs 53218
|
6
|
+
Total SNPs 53218
|
7
|
+
Num Samples 3
|
8
|
+
Total Samples 3
|
9
|
+
[Data]
|
10
|
+
SNP Name Sample ID Allele1 - AB Allele2 - AB GC Score GT Score
|
11
|
+
ABCA12 1 A A 0.4048 0.8164
|
12
|
+
APAF1 1 B B 0.9067 0.9155
|
13
|
+
ARS-BFGL-BAC-10172 1 B B 0.9140 0.8767
|
14
|
+
ARS-BFGL-BAC-1020 1 B B 0.9288 0.8919
|
15
|
+
ARS-BFGL-BAC-10245 1 B B 0.7227 0.7447
|
16
|
+
ARS-BFGL-BAC-10345 1 A B 0.9468 0.9127
|
17
|
+
ABCA12 2 A A 0.4048 0.8164
|
18
|
+
APAF1 2 B B 0.9067 0.9155
|
19
|
+
ARS-BFGL-BAC-10172 2 A B 0.9140 0.8767
|
20
|
+
ARS-BFGL-BAC-1020 2 A B 0.9288 0.8919
|
21
|
+
ARS-BFGL-BAC-10245 2 A A 0.7227 0.7447
|
22
|
+
ARS-BFGL-BAC-10345 2 B B 0.9468 0.9127
|
23
|
+
ABCA12 3 A A 0.4048 0.8164
|
24
|
+
APAF1 3 B B 0.9067 0.9155
|
25
|
+
ARS-BFGL-BAC-10172 3 A B 0.9140 0.8767
|
26
|
+
ARS-BFGL-BAC-1020 3 A B 0.9288 0.8919
|
27
|
+
ARS-BFGL-BAC-10245 3 A A 0.7227 0.7447
|
28
|
+
ARS-BFGL-BAC-10345 3 A B 0.9468 0.9127
|
Binary file
|