polygenic-pgx 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polygenic/__init__.py +0 -0
- polygenic/__main__.py +12 -0
- polygenic/data/__init__.py +0 -0
- polygenic/data/csv_accessor.py +143 -0
- polygenic/data/data_accessor.py +109 -0
- polygenic/data/gwas.py +638 -0
- polygenic/data/mobigen_utils.py +70 -0
- polygenic/data/polars_frame.py +128 -0
- polygenic/data/snp_data.py +23 -0
- polygenic/data/vcf_accessor.py +220 -0
- polygenic/data/vcf_record.py +213 -0
- polygenic/error/__init__.py +0 -0
- polygenic/error/polygenic_exception.py +3 -0
- polygenic/model/__init__.py +0 -0
- polygenic/model/model.py +799 -0
- polygenic/model/utils.py +140 -0
- polygenic/pgstk.py +212 -0
- polygenic/resources/chromsizes/hg38.chrom.sizes +26 -0
- polygenic/rsidx/__init__.py +38 -0
- polygenic/rsidx/__main__.py +31 -0
- polygenic/rsidx/_version.py +520 -0
- polygenic/rsidx/cli.py +83 -0
- polygenic/rsidx/index.py +69 -0
- polygenic/rsidx/search.py +63 -0
- polygenic/tools/__init__.py +6 -0
- polygenic/tools/data/__init__.py +0 -0
- polygenic/tools/data/chromsizes.py +111 -0
- polygenic/tools/data/colors.py +49 -0
- polygenic/tools/gwasfilecreate.py +22 -0
- polygenic/tools/modelbiobankuk.py +196 -0
- polygenic/tools/modelgwasfile.py +14 -0
- polygenic/tools/modelpgscat.py +164 -0
- polygenic/tools/pgscompute.py +262 -0
- polygenic/tools/plotmanhattan.py +101 -0
- polygenic/tools/tsvtovcf.py +24 -0
- polygenic/tools/utils.py +562 -0
- polygenic/tools/vcfimpute.py +263 -0
- polygenic/tools/vcfindex.py +5 -0
- polygenic/tools/vcfstatbaf.py +98 -0
- polygenic/tools/vcfstatzygosity.py +75 -0
- polygenic/version.py +3 -0
- polygenic_pgx-2.5.0.dist-info/METADATA +43 -0
- polygenic_pgx-2.5.0.dist-info/RECORD +47 -0
- polygenic_pgx-2.5.0.dist-info/WHEEL +5 -0
- polygenic_pgx-2.5.0.dist-info/entry_points.txt +4 -0
- polygenic_pgx-2.5.0.dist-info/licenses/LICENSE +1 -0
- polygenic_pgx-2.5.0.dist-info/top_level.txt +1 -0
polygenic/__init__.py
ADDED
|
File without changes
|
polygenic/__main__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""
|
|
2
|
+
high level support for csv files
|
|
3
|
+
"""
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
import numpy as np
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from polygenic.error.polygenic_exception import PolygenicException
|
|
12
|
+
|
|
13
|
+
# Module-level logger. Its name is the literal prefix 'polygenic.data.' followed by
# this module's __name__.
# NOTE(review): if this module is imported under its dotted package path, __name__
# presumably already starts with 'polygenic.data.', which would duplicate the prefix —
# confirm whether logging.getLogger(__name__) was intended.
logger = logging.getLogger('polygenic.data.' + __name__)
|
|
14
|
+
|
|
15
|
+
class CsvAccessor(object):
    """
    class for reading csv files

    The file is read as tab-delimited text into a pandas DataFrame when the
    accessor is constructed. When ``column_mappings`` is supplied, the columns
    are additionally renamed to the internal names listed in
    ``_COLUMN_SYNONYMS`` (see ``standardize_column_names``).
    """

    # internal column name -> (key looked up in the user-supplied options,
    #                          built-in synonyms tried after the user's choice).
    # Insertion order matters: when two internal names resolve to the same
    # source column, the later entry wins the rename (e.g. 'effect' over 'alt').
    _COLUMN_SYNONYMS = {
        'rsid': ('rsid_column_name', ['rsid', 'SNP']),
        'chromosome': ('chromosome_column_name', ['chromosome', 'chrom', 'chr']),
        'gnomad': ('gnomadid_column_name', ['gnomadid', 'gnomad']),
        'position': ('position_column_name', ['position', 'pos', 'bp']),
        'ref': ('ref_allele_column_name', ['ref', 'reference', 'ref_allele', 'A1']),
        'alt': ('alt_allele_column_name',
                ['alt', 'alt_allele', 'other_allele', 'effect', 'effect_allele', 'A2']),
        'effect': ('effect_allele_column_name', ['effect', 'effect_allele', 'alt', 'A2']),
        'pvalue': ('pvalue_column_name', ['pvalue', 'p']),
        'beta': ('beta_column_name', ['beta', 'beta_coefficient']),
        # The odds-ratio column has its own option key; it used to be read
        # from 'af_column_name', which made 'or' alias the AF column.
        'or': ('or_column_name', ['or']),
        'af': ('af_column_name', ['af']),
        'info': ('info_column_name', ['info']),
    }

    def __init__(self, csv_path: str, column_mappings: dict = None):
        """
        open ``csv_path`` and load it completely

        :param csv_path: path of the tab-delimited file to read
        :param column_mappings: optional options dict passed on to
            ``standardize_column_names``; when None the columns are left as read
        :raises PolygenicException: when ``csv_path`` does not exist
        """
        super().__init__()
        self.__path = csv_path
        self.__delimiter = '\t'
        self.__column_name_mapping = {}
        if not os.path.exists(self.__path):
            raise PolygenicException(f"Can not access {self.__path}")
        self.__data = self.read_data()
        if column_mappings is not None:
            self.standardize_column_names(column_mappings)

    def __find_name_of_column_by_list_of_synonyms(self, names: list, equals_instead_of_contains: bool = True):
        """
        return the first column of the data that matches any of ``names``

        Matching is case-insensitive. Columns are scanned in file order and,
        for each column, the candidates in ``names`` order; ``None`` candidates
        are skipped. With ``equals_instead_of_contains`` False a candidate
        matches when it is a substring of the column name.
        Returns None when nothing matches.
        """
        candidates = [name.lower() for name in names if name is not None]
        for column_name in self.__data.columns:
            lowered = column_name.lower()
            for candidate in candidates:
                if equals_instead_of_contains:
                    matched = candidate == lowered
                else:
                    matched = candidate in lowered
                if matched:
                    return column_name
        return None

    def __map_column_names(self, column_names: dict):
        """
        map the column names to the internal names

        For every internal name the user's choice (``column_names[<option>]``)
        is tried first, followed by the built-in synonyms.

        :raises PolygenicException: when no chromosome or no position column
            could be identified
        """
        for internal_name, (option_key, synonyms) in self._COLUMN_SYNONYMS.items():
            candidates = [column_names.get(option_key)] + synonyms
            self.__column_name_mapping[internal_name] = \
                self.__find_name_of_column_by_list_of_synonyms(candidates)
        # if there is no chromosome and position throw error
        if self.__column_name_mapping.get('chromosome') is None or self.__column_name_mapping.get('position') is None:
            raise PolygenicException("chromosome and position column names must be provided")

    def __mirror_columns(self, first: str, second: str):
        """
        make sure both ``first`` and ``second`` exist when one of them does

        Used when two internal names were resolved to the same source column:
        only one of them survives the rename, so the other is copied from it.
        """
        if first not in self.__data.columns:
            self.__data[first] = self.__data[second]
        if second not in self.__data.columns:
            self.__data[second] = self.__data[first]

    def get_column_names(self):
        """
        return the column names
        """
        return self.__data.columns

    def standardize_column_names(self, column_names: dict):
        """
        standardize the column names

        Source columns are renamed (in place) to the internal names, columns
        named by the keys of ``column_names`` that are still absent are added
        filled with None, and the data is finally restricted to exactly the
        keys of ``column_names``, in that order.

        :param column_names: options dict; values under the ``*_column_name``
            keys of ``_COLUMN_SYNONYMS`` override the built-in synonyms
        :raises PolygenicException: when chromosome or position is not found
        """
        self.__map_column_names(column_names)
        renaming_dict = {
            source: internal
            for internal, source in self.__column_name_mapping.items()
            if source is not None
        }
        self.__data.rename(columns=renaming_dict, inplace=True)

        effect_source = self.__column_name_mapping.get("effect")
        # Two unresolved names are both None and therefore "equal"; there is
        # nothing to copy in that case (it used to raise KeyError: 'effect').
        if effect_source is not None:
            ### if effect and reference are the same, then we should add additional reference column
            if self.__column_name_mapping.get("ref") == effect_source:
                self.__mirror_columns('ref', 'effect')
            ### the same for alt
            if self.__column_name_mapping.get("alt") == effect_source:
                self.__mirror_columns('alt', 'effect')

        ### if there are columns missing fill with None
        requested = list(column_names)
        for key in requested:
            if key not in self.__data.columns:
                self.__data[key] = None
        self.__data = self.__data[requested]

    def get_data(self):
        """
        return the dataframe
        """
        return self.__data

    def read_data(self):
        """
        read the csv file and return a dataframe

        The file is read in chunks of 10**5 rows while a tqdm progress bar is
        written to stdout. The bar's total is only an estimate derived from
        the file size and the serialised size of the first 500 rows.
        """
        sample = pd.read_csv(filepath_or_buffer=self.__path, sep=self.__delimiter, nrows=500)
        sample_size = len(sample.to_csv(index=False))
        # Seeding with the empty sample keeps the header (and a valid concat
        # input) even when the chunked reader yields nothing.
        frames = [sample[:0]]
        estimated_total = 500 * int(os.path.getsize(self.__path) / sample_size * 500 * 2.5 / 10**5) + 1
        with tqdm(total=estimated_total, file=sys.stdout, leave=False) as pbar:
            chunks = pd.read_csv(self.__path, sep=self.__delimiter, chunksize=10**5, low_memory=False)
            for i, chunk in enumerate(chunks):
                frames.append(chunk)
                pbar.set_description('Reading csv chunks (estimated): %d' % ((1 + i) * 500))
                pbar.update(500)
        return pd.concat(frames)

    def get_symbol_for_genomic_position(self, chrom, pos):
        """
        return the symbol for a genomic position

        Rows whose "chromosome" equals ``str(chrom)`` are ranked by the
        absolute distance between their "start" and ``pos``; the "symbol" of
        the closest row is returned, or None when the chromosome has no rows.
        """
        on_chromosome = self.__data.loc[self.__data["chromosome"] == str(chrom)]
        if len(on_chromosome.index) == 0:
            return None
        # NOTE(review): the previous implementation also derived the distance
        # to "end" and the minimum of both distances, but never used them for
        # ranking. Ranking is kept on the distance to "start"; confirm whether
        # the nearest boundary (or containment) was intended.
        ranked = on_chromosome.assign(
            pos_start=abs(on_chromosome["start"] - np.int64(pos))
        ).sort_values(by=['pos_start'])
        return ranked['symbol'].iloc[0]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from polygenic.data.vcf_accessor import VcfAccessor
|
|
3
|
+
from polygenic.data.vcf_record import VcfRecord
|
|
4
|
+
|
|
5
|
+
# Module-level logger. Its name is the literal prefix 'description_language.' followed
# by this module's __name__.
# NOTE(review): the prefix does not match this package's name — presumably a leftover
# from another project; confirm whether logging.getLogger(__name__) was intended.
logger = logging.getLogger('description_language.' + __name__)
|
|
6
|
+
|
|
7
|
+
class DataAccessor(object):
    """Per-sample view over a genotype VCF with optional fallbacks.

    Genotypes are looked up by rsid in ``genotypes``. When the sample has no
    call, the lookup falls back to an allele-frequency accessor (if given) and
    then, when the ``ref_fallback`` parameter is truthy, to the reference
    allele. Results of ``get_genotype_by_rsid`` are cached per rsid.
    """

    def __init__(self,
                 genotypes: "VcfAccessor",
                 allele_frequencies: "VcfAccessor" = None,
                 sample_name: str = None,
                 model_name: str = "",
                 af_field_name: str = "AF_nfe",
                 parameters: dict = None):
        """
        :param genotypes: accessor for the sample genotypes; may be None only
            when ``sample_name`` is given explicitly
        :param allele_frequencies: optional accessor used when the sample has
            no genotype call
        :param sample_name: sample to read; defaults to the first name
            returned by ``genotypes.get_sample_names()``
        :param model_name: accepted for interface compatibility; not used here
        :param af_field_name: field passed to ``get_genotype_by_af`` on
            allele-frequency records
        :param parameters: free-form options; only ``ref_fallback`` is read
            by this class
        """
        self.__genotypes = genotypes
        self.__allele_frequencies = allele_frequencies
        if sample_name is None:
            sample_name = genotypes.get_sample_names()[0]
        self.__sample_name = sample_name
        self.__af_field_name = af_field_name
        # A fresh dict per instance: a `{}` default in the signature would be
        # shared by every accessor created without explicit parameters.
        self.__parameters = {} if parameters is None else parameters
        self.__cache = {}

    def get_parameters(self) -> dict:
        """Return the parameters dict this accessor was created with."""
        return self.__parameters

    def get_copy_number(self, region: str) -> dict:
        """Resolve the structural / copy-number event for a gene region (CNV calling).

        `region` is "chrom:start-end". Returns the FIRST structural record the sample
        actually carries (GT contains an alt), exposing total copy number, span (for
        whole-vs-partial), and phase (which haplotype carries it — resolved upstream).
        Returns source "missing" when there is no structural call for the sample, when
        no genotype accessor is configured, or when `region` cannot be parsed.

        polygenic does NOT compute allelic ratios or apply thresholds here — allele-specific
        copy number must already be encoded as phase by the upstream caller."""
        result = {
            "copy_number": None, "svtype": None, "start": None, "end": None,
            "phased": None, "gt": None, "source": "missing"
        }
        if self.__genotypes is None:
            return result
        try:
            chromosome, span = region.split(":")
            region_start, region_end = (int(value) for value in span.split("-"))
        except (ValueError, AttributeError):
            # malformed or non-string region: report it as "missing"
            return result
        records = self.__genotypes.get_structural_records_by_region(chromosome, region_start, region_end)
        for record in records:
            gt = record.get_fmt_field(self.__sample_name, "GT")
            if gt is None or "1" not in gt:  # sample does not carry this structural event
                continue
            result.update({
                "copy_number": record.get_copy_number(self.__sample_name),
                "svtype": record.get_svtype(),
                "start": int(record.get_pos()),
                "end": record.get_end(),
                "phased": record.is_phased(self.__sample_name),
                "gt": gt,
                "source": "structural",
            })
            return result
        return result

    def get_genotype_by_rsid(self, rsid) -> dict:
        """Return the genotype description for ``rsid`` as a dict.

        Keys: "rsid", "genotype" (two alleles), "phased", "source" and "ref".
        "source" is one of "genotyping", "imputing", "ldproxy" (call present in
        the sample), "af" (derived from allele frequencies), "reference"
        (reference fallback) or "missing". The same dict object is returned
        for repeated lookups of the same rsid.
        """
        if rsid in self.__cache:
            return self.__cache[rsid]
        genotype = {"rsid": rsid}
        record = None
        if self.__genotypes is not None:
            record = self.__genotypes.get_record_by_rsid(rsid)
        if record is not None:
            genotype["genotype"] = record.get_genotype(self.__sample_name)
            genotype["phased"] = record.is_phased(self.__sample_name)
            if genotype["genotype"][0] is not None:
                # the sample has a call: classify where it came from
                if record.is_ldproxy(self.__sample_name):
                    genotype["source"] = "ldproxy"
                elif record.is_imputed():
                    genotype["source"] = "imputing"
                else:
                    genotype["source"] = "genotyping"
                genotype["ref"] = record.get_ref()
                self.__cache[rsid] = genotype
                return genotype
            if self.__allele_frequencies is not None:
                af_record = self.__allele_frequencies.get_record_by_rsid(rsid)
                if af_record is not None:
                    genotype["genotype"] = af_record.get_genotype_by_af(self.__af_field_name)
                    genotype["phased"] = False
                    genotype["source"] = "af"
                    genotype["ref"] = af_record.get_ref()
                    self.__cache[rsid] = genotype
                    return genotype
            # NOTE(review): the reference fallback is tried whenever the sample
            # call is missing and no allele-frequency record was used, whether
            # or not an allele-frequency accessor is configured — confirm.
            if record.get_ref() is not None and self.__parameters.get('ref_fallback', False):
                genotype["genotype"] = [record.get_ref(), record.get_ref()]
                genotype["phased"] = False
                genotype["source"] = "reference"
                genotype["ref"] = record.get_ref()
                self.__cache[rsid] = genotype
                return genotype
        # no record, or no usable call and no fallback applied
        genotype["genotype"] = [None, None]
        genotype["phased"] = None
        genotype["source"] = "missing"
        genotype["ref"] = None
        self.__cache[rsid] = genotype
        return genotype
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
|