geney 1.2.40__py2.py3-none-any.whl → 1.2.41__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/graphic_utils.py CHANGED
@@ -1,9 +1,8 @@
1
-
2
1
  import matplotlib.pyplot as plt
3
2
  from matplotlib.patches import Rectangle
4
3
  import seaborn as sns
5
4
  from collections import namedtuple
6
- from geney.utils import find_files_by_gene_name, reverse_complement, unload_pickle, contains, unload_json, dump_json #, is_monotonic
5
+ from geney.utils import unload_pickle, contains, unload_json, dump_json
7
6
 
8
7
 
9
8
  ### Graphical Stuff
geney/oncosplice.py CHANGED
@@ -331,19 +331,19 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
331
331
  report['primary_transcript'] = reference_transcript.primary_transcript
332
332
  report['transcript_id'] = reference_transcript.transcript_id
333
333
  # report['mut_id'] = mut.mut_id
334
- report['cons_available'] = int(reference_transcript.cons_available)
334
+ # report['cons_available'] = int(reference_transcript.cons_available)
335
335
  # report['protein_coding'] = reference_transcript.transcript_biotype
336
336
 
337
337
  # report['reference_mrna'] = reference_transcript.transcript_seq
338
- report['reference_cds_start'] = reference_transcript.TIS
338
+ # report['reference_cds_start'] = reference_transcript.TIS
339
339
  # report['reference_pre_mrna'] = reference_transcript.pre_mrna
340
340
  # report[
341
341
  # 'reference_orf'] = reference_transcript.orf # pre_mrna[reference_transcript.transcript_indices.index(reference_transcript.TIS):reference_transcript.transcript_indices.index(reference_transcript.TTS)]
342
342
  report['reference_protein'] = reference_transcript.protein
343
- report['reference_protein_length'] = len(reference_transcript.protein)
343
+ # report['reference_protein_length'] = len(reference_transcript.protein)
344
344
 
345
345
  # report['variant_mrna'] = variant_transcript.transcript_seq
346
- report['variant_cds_start'] = variant_transcript.TIS
346
+ # report['variant_cds_start'] = variant_transcript.TIS
347
347
  # report[
348
348
  # 'variant_pre_mrna'] = variant_transcript.pre_mrna # pre_mrna[variant_transcript.transcript_indices.index(variant_transcript.TIS):variant_transcript.transcript_indices.index(variant_transcript.TTS)]
349
349
  # report['variant_orf'] = variant_transcript.orf
@@ -363,6 +363,8 @@ def OncospliceAnnotator(reference_transcript, variant_transcript, mut):
363
363
 
364
364
  def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, organism='hg38', engine='spliceai', domains=None):
365
365
  gene = Gene(mut_id.split(':')[0], organism=organism)
366
+ reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
367
+
366
368
  mutations = [get_mutation(m, rev=gene.rev) for m in mut_id.split('|')]
367
369
 
368
370
  results = []
@@ -408,7 +410,7 @@ def oncosplice(mut_id, splicing_threshold=0.5, protein_coding=True, primary_tran
408
410
  report['isoform_prevalence'] = new_boundaries['path_weight']
409
411
  report['full_missplicing'] = missplicing.aberrant_splicing
410
412
  report['missplicing'] = max(missplicing)
411
- # report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
413
+ report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
412
414
  results.append(report)
413
415
 
414
416
  report = pd.DataFrame(results)
@@ -445,6 +447,8 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
445
447
  index=['domain_identifier', 'score'])
446
448
 
447
449
  gene = Gene(mut_id.split(':')[0], organism=organism)
450
+ reference_gene_proteins = {tid: transcript.generate_pre_mrna().generate_mature_mrna().generate_protein() for tid, transcript in gene.run_transcripts(protein_coding=True)}
451
+
448
452
  mutations = [get_mutation(mut_id, rev=gene.rev) for mut_id in mut_id.split('|')]
449
453
 
450
454
  results = []
@@ -501,7 +505,7 @@ async def oncosplice_prototype(mut_id, splicing_threshold=0.5, protein_coding=Tr
501
505
  report['full_missplicing'] = missplicing.aberrant_splicing
502
506
  report['missplicing'] = max(missplicing)
503
507
  report['domains_affected'] = domains_affected
504
- # report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
508
+ report['reference_resemblance'] = reference_gene_proteins.get(transcript.protein, None)
505
509
  results.append(pd.Series(report))
506
510
 
507
511
  report = pd.concat(results, axis=1).T
geney/tcga_utils.py CHANGED
@@ -2,6 +2,7 @@
2
2
  import pandas as pd
3
3
  import random
4
4
  from pathlib import Path
5
+ from tqdm import tqdm
5
6
 
6
7
  class TCGACase:
7
8
  def __init__(self, df):
@@ -98,38 +99,61 @@ class TCGACase:
98
99
  class TCGAGene:
99
100
  def __init__(self, gene, cancer_path=Path('/tamir2/cancer_proj/gdc_db/data/filtered_feb_2021/AllGenes/'),
100
101
  valid_cases=None, extra_cols=[], exclude_filters=None, include_filter=None):
101
- df = pd.read_csv(cancer_path / gene / 'GeneMutTble.txt',
102
- usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
103
- 'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
104
- 'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
105
- 'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type'] + extra_cols,
106
- low_memory=False).sort_values('Start_Position', ascending=True)
107
-
108
- if df.empty:
109
- self.df = df
102
+ file_path = cancer_path / gene / 'GeneMutTble.txt'
103
+ if not file_path.exists():
104
+ self.df = pd.DataFrame()
110
105
 
111
106
  else:
112
- df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
113
- df = df.astype({'Start_Position': int})
114
-
115
- if include_filter is not None:
116
- df = df[df.FILTER == include_filter]
107
+ df = pd.read_csv(file_path,
108
+ usecols=['Variant_Type', 'FILTER', 'vcf_tumor_gt', 'vcf_normal_gt',
109
+ 'COSMIC', 't_depth', 't_ref_count', 't_alt_count', 'Proj_name',
110
+ 'HGVSc', 'Chromosome', 'Start_Position', 'Reference_Allele',
111
+ 'Tumor_Seq_Allele2', 'case_id', 'Gene_name', 'Variant_Type',
112
+ 'Variant_Classification'] + extra_cols,
113
+ low_memory=False).sort_values('Start_Position', ascending=True)
117
114
 
118
- elif exclude_filters is not None:
119
- for exclude_filter in exclude_filters:
120
- df = df[~df.FILTER.str.contains(exclude_filter)]
115
+ df['attention'] = True
121
116
 
122
- if valid_cases is not None:
123
- df = df[df.case_id.isin(valid_cases)]
117
+ if df.empty:
118
+ self.df = df
124
119
 
125
- df['mut_id'] = df.apply(lambda
126
- row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
127
- axis=1)
128
-
129
- df['ratio'] = df.t_alt_count + df.t_ref_count
130
- df = df[df.ratio > 0]
131
- df['ratio'] = df.t_alt_count / df.ratio
132
- self.df = df
120
+ else:
121
+ df = df[df.Variant_Type.isin(['SNP', 'INS', 'DEL'])]
122
+ df = df.astype({'Start_Position': int})
123
+
124
+ if include_filter is not None:
125
+ # df = df[df.FILTER == include_filter]
126
+ df.loc[~df['FILTER'].str.contains(include_filter), 'attention'] = False
127
+
128
+ elif exclude_filters is not None:
129
+ for exclude_filter in exclude_filters:
130
+ # df = df[~df.FILTER.str.contains(exclude_filter)]
131
+ df.loc[df['FILTER'].str.contains(exclude_filter), 'attention'] = False
132
+
133
+ if valid_cases is not None:
134
+ # df = df[df.case_id.isin(valid_cases)]
135
+ df.loc[~df.case_id.isin(valid_cases), 'attention'] = False
136
+
137
+ df['mut_id'] = df.apply(lambda
138
+ row: f"{row.Gene_name}:{row.Chromosome.replace('chr', '')}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
139
+ axis=1)
140
+ df['mut_id_yoram'] = df.apply(lambda
141
+ row: f"{row.Gene_name}:{row.Chromosome}:{row.Variant_Classification}:{row.Start_Position}:{row.Reference_Allele}:{row.Tumor_Seq_Allele2}",
142
+ axis=1)
143
+ silent_mut_classes = ["3'Flank", "3'UTR", "Silent", "Splice_Site", "Splice_Region", "Intron", "5'Flank",
144
+ "3'Flank"]
145
+ df['silent'] = df.apply(lambda row: row.Variant_Classification in silent_mut_classes, axis=1)
146
+ df['ratio'] = df.t_alt_count + df.t_ref_count
147
+ df = df[df.ratio > 0]
148
+ df['ratio'] = df.t_alt_count / df.ratio
149
+ self.df = df
150
+
151
def __repr__(self):
    """Return the repr of the rows currently exposed through ``data``."""
    return repr(self.data)

@property
def data(self):
    """Rows of ``self.df`` whose ``attention`` flag is set.

    ``__init__`` stores a column-less ``pd.DataFrame()`` when the gene's
    mutation table file does not exist. That frame has no ``attention``
    column, so it is returned unchanged instead of raising
    ``AttributeError``.

    Returns:
        pd.DataFrame: the flagged subset, or ``self.df`` itself when no
        ``attention`` column is present.
    """
    frame = self.df
    if 'attention' not in frame.columns:
        return frame
    return frame[frame.attention]
133
157
 
134
158
  def affected_cases(self, mut_id=None, read_ratio=0, filters=[]):
135
159
  if mut_id is None:
@@ -164,18 +188,27 @@ class TCGAGene:
164
188
  def total_prevalence(self, mut_id):
165
189
  pass
166
190
 
167
- def project_prevalence(self, mut_id):
168
- pass
191
def project_prevalence(self, mut_id, df_p_proc):
    """Summarise, per (mut_id, Transcript_ID) group, a normalised value count as text.

    For every group of ``self.data`` the case ids of that group are used to
    index ``df_p_proc``; the resulting value counts are divided by
    ``project_counts``, NaN entries are dropped, and the remainder is
    rendered with ``series_to_pretty_string``.

    Args:
        mut_id: not read anywhere in the body.
        df_p_proc: object indexed with the group's ``case_id`` values;
            presumably a case_id -> project mapping -- TODO confirm.

    Returns:
        pd.Series: one formatted string per group key.
    """
    mut_prevalence = {}
    # NOTE(review): 'Transcript_ID' is not among the columns __init__ loads by
    # default, so this groupby presumably relies on it being passed through
    # `extra_cols` -- confirm.
    for i, g in tqdm(self.data.groupby(['mut_id', 'Transcript_ID'])):
        # NOTE(review): `project_counts` is not bound inside this method. A bare
        # name does not resolve to the sibling method of the same name, so this
        # needs a module-level `project_counts` to exist -- confirm, otherwise
        # this line raises NameError.
        mut_prevalence[i] = series_to_pretty_string((df_p_proc[g.case_id].value_counts() / project_counts).dropna())
    return pd.Series(mut_prevalence)
169
196
 
170
197
  def project_counts(self, mut_id):
171
198
  pass
172
199
 
200
def filter_silent_muts(self):
    """Clear the ``attention`` flag on every row marked ``silent``.

    ``__init__`` only creates the ``silent`` column when the mutation table
    was found and non-empty. For the other two outcomes (missing file, empty
    table) there is nothing to filter, so the call is a no-op instead of
    raising ``AttributeError``.

    Returns:
        The instance itself, so calls can be chained.
    """
    if 'silent' in self.df.columns:
        self.df.loc[self.df.silent, 'attention'] = False
    return self
173
203
 
174
204
 
175
-
176
-
177
-
178
-
205
def series_to_pretty_string(series):
    """Render ``series`` as newline-separated ``index: value`` lines.

    Values that are ``float`` instances are written in scientific notation
    with three digits after the decimal point; every other value uses its
    default formatting. An empty input yields an empty string.
    """
    def _render(entry):
        # Only floats get the fixed scientific layout.
        if isinstance(entry, float):
            return f"{entry:.3e}"
        return f"{entry}"

    lines = []
    for label, entry in series.items():
        lines.append(f"{label}: {_render(entry)}")
    return "\n".join(lines)
179
212
 
180
213
 
181
214
  # CLINICAL_DATA_FILE = Path('/tamir2/nicolaslynn/data/TCGA/cancer_reports/new_df_p_proc.pkl')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.2.40
3
+ Version: 1.2.41
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -2,25 +2,25 @@ geney/Fasta_segment.py,sha256=0zCdzPUbDeM9Rz642woH5Q94pwI46O0fE3H8w0XWebc,11255
2
2
  geney/__init__.py,sha256=eBdDl42N6UhcYeZDjOnv199Z88fI5_8Y6xW8447OKXM,755
3
3
  geney/config_setup.py,sha256=klm_k7Ca_703DpeGBcGoDqz1XwHQhNXENPKjj_xfSQw,608
4
4
  geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
5
- geney/graphic_utils.py,sha256=tjm6IDQ1BdfSeuPYzjlqAUHFQoDYH9jXTzJjKFS4Hh4,11078
5
+ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
6
6
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
7
7
  geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
8
8
  geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
9
- geney/oncosplice.py,sha256=J_nFs_xBSJtgMqeHv628QodRL0B2d-Zi1Ke7Pk7S4R4,22595
9
+ geney/oncosplice.py,sha256=1K8p-sytnMUKTYwO_z_YJLelLosKj8TZpM0i5lHcMFI,22941
10
10
  geney/pangolin_utils.py,sha256=lLmnjJdJjqwWS85-1jlPLIjD2z14sWjzU87hS-8xxpQ,2873
11
11
  geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
12
12
  geney/seqmat_utils.py,sha256=YV5DFLbfjXLIswPGvqK1-eEfwn9TUby0b2kewdGAKws,18372
13
13
  geney/spliceai_utils.py,sha256=gIGPC8u3J15A7EQrk2Elho5PbF9MmUUNopGGH-eEV8s,1873
14
14
  geney/splicing_utils.py,sha256=lGBNknnAdKhcJ3MqPQ5c9oz_NKcL2lcFAr78StjKa6o,16151
15
15
  geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
16
- geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
16
+ geney/tcga_utils.py,sha256=wM52QZ1M_54CrXZ_uj05R14ycZh23gTZUI8b0ZMtPd0,17615
17
17
  geney/tis_utils.py,sha256=vA2ci4gNfwwQZlCjPpO5ehvL2NRVeM7lHI_VyfT-_10,8049
18
18
  geney/utils.py,sha256=EsKvBM-Nz2a3_4ZAhF4Dxd4PwT7_6YYKpxEN4LLgg10,2174
19
19
  geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
21
21
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
22
22
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
23
- geney-1.2.40.dist-info/METADATA,sha256=ja7ULYnyNPbYYj-wloXQzHDH86TL2mg4LfgEmZaMcbE,948
24
- geney-1.2.40.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
25
- geney-1.2.40.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
26
- geney-1.2.40.dist-info/RECORD,,
23
+ geney-1.2.41.dist-info/METADATA,sha256=e7eHu8HlNdNuNXLWxK17ok3lAetzKTJ7ie-8MRct1T8,948
24
+ geney-1.2.41.dist-info/WHEEL,sha256=fS9sRbCBHs7VFcwJLnLXN1MZRR0_TVTxvXKzOnaSFs8,110
25
+ geney-1.2.41.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
26
+ geney-1.2.41.dist-info/RECORD,,
File without changes