polygenic-pgx 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. polygenic/__init__.py +0 -0
  2. polygenic/__main__.py +12 -0
  3. polygenic/data/__init__.py +0 -0
  4. polygenic/data/csv_accessor.py +143 -0
  5. polygenic/data/data_accessor.py +109 -0
  6. polygenic/data/gwas.py +638 -0
  7. polygenic/data/mobigen_utils.py +70 -0
  8. polygenic/data/polars_frame.py +128 -0
  9. polygenic/data/snp_data.py +23 -0
  10. polygenic/data/vcf_accessor.py +220 -0
  11. polygenic/data/vcf_record.py +213 -0
  12. polygenic/error/__init__.py +0 -0
  13. polygenic/error/polygenic_exception.py +3 -0
  14. polygenic/model/__init__.py +0 -0
  15. polygenic/model/model.py +799 -0
  16. polygenic/model/utils.py +140 -0
  17. polygenic/pgstk.py +212 -0
  18. polygenic/resources/chromsizes/hg38.chrom.sizes +26 -0
  19. polygenic/rsidx/__init__.py +38 -0
  20. polygenic/rsidx/__main__.py +31 -0
  21. polygenic/rsidx/_version.py +520 -0
  22. polygenic/rsidx/cli.py +83 -0
  23. polygenic/rsidx/index.py +69 -0
  24. polygenic/rsidx/search.py +63 -0
  25. polygenic/tools/__init__.py +6 -0
  26. polygenic/tools/data/__init__.py +0 -0
  27. polygenic/tools/data/chromsizes.py +111 -0
  28. polygenic/tools/data/colors.py +49 -0
  29. polygenic/tools/gwasfilecreate.py +22 -0
  30. polygenic/tools/modelbiobankuk.py +196 -0
  31. polygenic/tools/modelgwasfile.py +14 -0
  32. polygenic/tools/modelpgscat.py +164 -0
  33. polygenic/tools/pgscompute.py +262 -0
  34. polygenic/tools/plotmanhattan.py +101 -0
  35. polygenic/tools/tsvtovcf.py +24 -0
  36. polygenic/tools/utils.py +562 -0
  37. polygenic/tools/vcfimpute.py +263 -0
  38. polygenic/tools/vcfindex.py +5 -0
  39. polygenic/tools/vcfstatbaf.py +98 -0
  40. polygenic/tools/vcfstatzygosity.py +75 -0
  41. polygenic/version.py +3 -0
  42. polygenic_pgx-2.5.0.dist-info/METADATA +43 -0
  43. polygenic_pgx-2.5.0.dist-info/RECORD +47 -0
  44. polygenic_pgx-2.5.0.dist-info/WHEEL +5 -0
  45. polygenic_pgx-2.5.0.dist-info/entry_points.txt +4 -0
  46. polygenic_pgx-2.5.0.dist-info/licenses/LICENSE +1 -0
  47. polygenic_pgx-2.5.0.dist-info/top_level.txt +1 -0
polygenic/__init__.py ADDED
File without changes
polygenic/__main__.py ADDED
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

"""
Entrypoint module

Allows the toolkit to be launched with ``python -m polygenic``.
"""

import sys

from polygenic.pgstk import main

if __name__ == "__main__":
    # Everything after the program name is forwarded to the toolkit untouched.
    cli_arguments = sys.argv[1:]
    main(cli_arguments)
File without changes
@@ -0,0 +1,143 @@
1
+ """
2
+ high level support for csv files
3
+ """
4
+ import logging
5
+ import os
6
+ import sys
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+
10
+ import pandas as pd
11
+ from polygenic.error.polygenic_exception import PolygenicException
12
+
13
+ logger = logging.getLogger('polygenic.data.' + __name__)
14
+
15
class CsvAccessor(object):
    """
    High level access to a tab-separated text table.

    The whole file is loaded into a pandas DataFrame when the object is
    constructed. When ``column_mappings`` is given, the columns are also
    renamed to the internal names (``rsid``, ``chromosome``, ``position``,
    ``ref``, ``alt``, ``effect``, ...) by ``standardize_column_names``.
    """

    # One row per internal column:
    # (internal name, key of the user supplied column name, built-in synonyms).
    # The order matters: when two internal names resolve to the same source
    # column, the later row wins during renaming.
    _COLUMN_SYNONYMS = (
        ('rsid', 'rsid_column_name', ('rsid', 'SNP')),
        ('chromosome', 'chromosome_column_name', ('chromosome', 'chrom', 'chr')),
        ('gnomad', 'gnomadid_column_name', ('gnomadid', 'gnomad')),
        ('position', 'position_column_name', ('position', 'pos', 'bp')),
        ('ref', 'ref_allele_column_name', ('ref', 'reference', 'ref_allele', 'A1')),
        ('alt', 'alt_allele_column_name', ('alt', 'alt_allele', 'other_allele', 'effect', 'effect_allele', 'A2')),
        ('effect', 'effect_allele_column_name', ('effect', 'effect_allele', 'alt', 'A2')),
        ('pvalue', 'pvalue_column_name', ('pvalue', 'p')),
        ('beta', 'beta_column_name', ('beta', 'beta_coefficient')),
        # The odds-ratio column has its own key; it used to read 'af_column_name'.
        ('or', 'or_column_name', ('or',)),
        ('af', 'af_column_name', ('af',)),
        ('info', 'info_column_name', ('info',)),
    )

    def __init__(self, csv_path: str, column_mappings: dict = None):
        """
        Load ``csv_path`` and optionally standardize its column names.

        Raises PolygenicException when the path does not exist.
        """
        super().__init__()
        self.__path = csv_path
        self.__delimiter = '\t'
        self.__column_name_mapping = {}
        if not os.path.exists(self.__path):
            raise PolygenicException(f"Can not access {self.__path}")
        self.__data = self.read_data()
        if column_mappings is not None:
            self.standardize_column_names(column_mappings)

    def __find_name_of_column_by_list_of_synonyms(self, names: list, equals_instead_of_contains: bool = True):
        """
        Return the first column of the table matching any of ``names``.

        Matching is case-insensitive. Columns are scanned in table order, so
        the leftmost matching column wins regardless of the order of
        ``names``. ``None`` entries in ``names`` are ignored. Returns None
        when nothing matches.
        """
        candidates = [name.lower() for name in names if name is not None]
        for column_name in self.__data.columns:
            lowered = str(column_name).lower()
            for candidate in candidates:
                if equals_instead_of_contains:
                    matched = candidate == lowered
                else:
                    matched = candidate in lowered
                if matched:
                    return column_name
        return None

    def __map_column_names(self, column_names: dict):
        """
        map the column names to the internal names

        For every internal name the user supplied column name (if any) and the
        built-in synonyms are looked up in the table; the result (or None) is
        stored in the internal mapping.

        Raises PolygenicException when no chromosome or position column is found.
        """
        for internal_name, user_key, synonyms in self._COLUMN_SYNONYMS:
            wanted = [column_names.get(user_key), *synonyms]
            self.__column_name_mapping[internal_name] = \
                self.__find_name_of_column_by_list_of_synonyms(wanted)
        # if there is no chromosome and position throw error
        if self.__column_name_mapping.get('chromosome') is None or self.__column_name_mapping.get('position') is None:
            raise PolygenicException("chromosome and position column names must be provided")

    def get_column_names(self):
        """
        return the column names
        """
        return self.__data.columns

    def standardize_column_names(self, column_names: dict):
        """
        standardize the column names

        Renames the detected source columns to the internal names, mirrors
        ref/alt and effect when they come from the same source column, adds
        every key of ``column_names`` that is still missing as an empty
        column, and finally restricts the table to the keys of
        ``column_names`` (in that order).
        """
        self.__map_column_names(column_names)
        renaming_dict = {
            source: internal
            for internal, source in self.__column_name_mapping.items()
            if source is not None
        }
        self.__data.rename(columns=renaming_dict, inplace=True)

        effect_source = self.__column_name_mapping.get("effect")
        # Only one of two internal names survives the rename when both point to
        # the same source column, so the other one is restored as a copy.
        # The `is not None` guard prevents a KeyError for tables that carry
        # neither of the two columns.
        for twin in ("ref", "alt"):
            if effect_source is None or self.__column_name_mapping.get(twin) != effect_source:
                continue
            if twin not in self.__data.columns:
                self.__data[twin] = self.__data['effect']
            if 'effect' not in self.__data.columns:
                self.__data['effect'] = self.__data[twin]

        ### if there are columns missing fill with None
        for key in column_names:
            if key not in self.__data.columns:
                self.__data[key] = None
        self.__data = self.__data[list(column_names)]

    def get_data(self):
        """
        return the dataframe
        """
        return self.__data

    def read_data(self):
        """
        read the csv file and return a dataframe

        The first 500 rows are read once to estimate the size of a row, which
        is only used to size the progress bar. The file is then read in chunks
        of 10**5 rows and concatenated.
        """
        temp = pd.read_csv(filepath_or_buffer=self.__path, sep=self.__delimiter, nrows=500)
        n = len(temp.to_csv(index=False))
        # The empty head keeps the column layout even when no chunk is produced.
        chunks = [temp[:0]]
        t = 500 * int(os.path.getsize(self.__path) / n * 500 * 2.5 / 10**5) + 1
        with tqdm(total=t, file=sys.stdout, leave=False) as pbar:
            reader = pd.read_csv(self.__path, sep=self.__delimiter, chunksize=10**5, low_memory=False)
            for i, chunk in enumerate(reader):
                chunks.append(chunk)
                pbar.set_description('Reading csv chunks (estimated): %d' % ((1 + i) * 500))
                pbar.update(500)
        return pd.concat(chunks)

    def get_symbol_for_genomic_position(self, chrom, pos):
        """
        return the symbol for a genomic position

        Uses the ``chromosome``, ``start``, ``end`` and ``symbol`` columns of
        the table. Among the rows on ``chrom`` the one whose ``start`` or
        ``end`` lies closest to ``pos`` is selected. Returns None when the
        chromosome has no rows.
        """
        data = self.__data
        data = data.loc[data["chromosome"] == str(chrom)]
        if len(data.index) == 0:
            return None
        target = np.int64(pos)
        data = data.assign(pos_start=abs(data["start"] - target),
                           pos_end=abs(data["end"] - target))
        data = data.assign(distance=data[["pos_start", "pos_end"]].min(axis=1))
        # Rank by the smaller of the two distances (previously only the distance
        # to ``start`` was used although the minimum had been computed).
        return data.sort_values(by=['distance'])['symbol'].head(1).iloc[0]
@@ -0,0 +1,109 @@
1
+ import logging
2
+ from polygenic.data.vcf_accessor import VcfAccessor
3
+ from polygenic.data.vcf_record import VcfRecord
4
+
5
+ logger = logging.getLogger('description_language.' + __name__)
6
+
7
class DataAccessor(object):
    """
    Per-sample view on genotype data with optional fallbacks.

    Wraps a genotype VcfAccessor (and optionally an allele-frequency
    VcfAccessor) and resolves genotypes / structural events for a single
    sample. Genotype lookups are cached per rsid.
    """

    def __init__(self,
                 genotypes: "VcfAccessor",
                 allele_frequencies: "VcfAccessor" = None,
                 sample_name: str = None,
                 model_name: str = "",
                 af_field_name: str = "AF_nfe",
                 parameters: dict = None):
        """
        genotypes: accessor for the sample VCF (may be None).
        allele_frequencies: accessor used as fallback for missing genotypes.
        sample_name: sample to read; defaults to the first sample of ``genotypes``.
        model_name: accepted for interface compatibility; not used here.
        af_field_name: field passed to the allele-frequency fallback.
        parameters: free-form options; ``ref_fallback`` is read by
            ``get_genotype_by_rsid``.
        """
        self.__genotypes = genotypes
        self.__allele_frequencies = allele_frequencies
        if sample_name is None and genotypes is not None:
            sample_name = genotypes.get_sample_names()[0]
        self.__sample_name = sample_name
        self.__af_field_name = af_field_name
        # A fresh dict per instance: a shared `{}` default would leak
        # parameters from one accessor into every other one.
        self.__parameters = {} if parameters is None else parameters
        self.__cache = {}

    def get_parameters(self) -> dict:
        """Return the parameters dictionary of this accessor."""
        return self.__parameters

    def get_copy_number(self, region: str) -> dict:
        """Resolve the structural / copy-number event for a gene region (CNV calling).

        `region` is "chrom:start-end". Returns the FIRST structural record the sample
        actually carries (GT contains an alt), exposing total copy number, span (for
        whole-vs-partial), and phase (which haplotype carries it — resolved upstream).
        Returns source "missing" when there is no structural call for the sample,
        when no genotype accessor is available, or when `region` cannot be parsed.

        polygenic does NOT compute allelic ratios or apply thresholds here — allele-specific
        copy number must already be encoded as phase by the upstream caller."""
        result = {
            "copy_number": None, "svtype": None, "start": None, "end": None,
            "phased": None, "gt": None, "source": "missing"
        }
        if self.__genotypes is None:
            return result
        try:
            chromosome, span = region.split(":")
            region_start, region_end = (int(value) for value in span.split("-"))
        except (ValueError, AttributeError):
            # Malformed or non-string region: report "missing" instead of failing.
            return result
        records = self.__genotypes.get_structural_records_by_region(chromosome, region_start, region_end)
        for record in records:
            gt = record.get_fmt_field(self.__sample_name, "GT")
            if gt is None or "1" not in gt:  # sample does not carry this structural event
                continue
            result.update({
                "copy_number": record.get_copy_number(self.__sample_name),
                "svtype": record.get_svtype(),
                "start": int(record.get_pos()),
                "end": record.get_end(),
                "phased": record.is_phased(self.__sample_name),
                "gt": gt,
                "source": "structural",
            })
            return result
        return result

    def __remember(self, rsid, genotype: dict) -> dict:
        """Store ``genotype`` in the per-rsid cache and hand it back."""
        self.__cache[rsid] = genotype
        return genotype

    def get_genotype_by_rsid(self, rsid) -> dict:
        """
        Return a genotype description for ``rsid``.

        The result is a dict with the keys ``rsid``, ``genotype``, ``phased``,
        ``source`` and ``ref``. ``source`` is one of:

        - ``genotyping`` / ``imputing`` / ``ldproxy``: taken from the sample record,
        - ``af``: the sample record has no call and the allele-frequency
          accessor provided a genotype,
        - ``reference``: the sample record has no call and the
          ``ref_fallback`` parameter is enabled,
        - ``missing``: nothing could be resolved.

        The fallbacks are only tried when the sample VCF contains a record for
        the rsid whose genotype is missing. Results are cached per rsid.
        """
        if rsid in self.__cache:
            return self.__cache[rsid]

        genotype = {"rsid": rsid}
        record = None
        if self.__genotypes is not None:
            record = self.__genotypes.get_record_by_rsid(rsid)

        if record is not None:
            genotype["genotype"] = record.get_genotype(self.__sample_name)
            genotype["phased"] = record.is_phased(self.__sample_name)

            if genotype["genotype"][0] is not None:
                # The sample carries a call; only its provenance has to be named.
                if record.is_ldproxy(self.__sample_name):
                    genotype["source"] = "ldproxy"
                elif record.is_imputed():
                    genotype["source"] = "imputing"
                else:
                    genotype["source"] = "genotyping"
                genotype["ref"] = record.get_ref()
                return self.__remember(rsid, genotype)

            if self.__allele_frequencies is not None:
                af_record = self.__allele_frequencies.get_record_by_rsid(rsid)
                if af_record is not None:
                    genotype["genotype"] = af_record.get_genotype_by_af(self.__af_field_name)
                    genotype["phased"] = False
                    genotype["source"] = "af"
                    genotype["ref"] = af_record.get_ref()
                    return self.__remember(rsid, genotype)

            if record.get_ref() is not None and self.__parameters.get('ref_fallback', False):
                genotype["genotype"] = [record.get_ref(), record.get_ref()]
                genotype["phased"] = False
                genotype["source"] = "reference"
                genotype["ref"] = record.get_ref()
                return self.__remember(rsid, genotype)

        genotype["genotype"] = [None, None]
        genotype["phased"] = None
        genotype["source"] = "missing"
        genotype["ref"] = None
        return self.__remember(rsid, genotype)
107
+
108
+
109
+