geney 1.2.4__py2.py3-none-any.whl → 1.2.6__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/oncosplice.py +147 -55
- geney/power_utils.py +3 -3
- geney/tcga_utils.py +2 -2
- {geney-1.2.4.dist-info → geney-1.2.6.dist-info}/METADATA +1 -1
- {geney-1.2.4.dist-info → geney-1.2.6.dist-info}/RECORD +7 -7
- {geney-1.2.4.dist-info → geney-1.2.6.dist-info}/WHEEL +0 -0
- {geney-1.2.4.dist-info → geney-1.2.6.dist-info}/top_level.txt +0 -0
geney/oncosplice.py
CHANGED
|
@@ -35,7 +35,7 @@ import torch
|
|
|
35
35
|
from pkg_resources import resource_filename
|
|
36
36
|
from pangolin.model import *
|
|
37
37
|
|
|
38
|
-
pang_model_nums = [0, 2, 4, 6]
|
|
38
|
+
pang_model_nums = [0, 1, 2, 3, 4, 5, 6]
|
|
39
39
|
pang_models = []
|
|
40
40
|
for i in pang_model_nums:
|
|
41
41
|
for j in range(1, 6):
|
|
@@ -545,7 +545,7 @@ class Transcript:
|
|
|
545
545
|
def generate_mature_mrna_pos(self, reset=True):
|
|
546
546
|
if reset:
|
|
547
547
|
pre_seq_pos, pre_indices_pos = self.generate_pre_mrna_pos()
|
|
548
|
-
self.pre_mrna,
|
|
548
|
+
self.pre_mrna, self.pre_indices = self.__pos2sense(pre_seq_pos, pre_indices_pos)
|
|
549
549
|
else:
|
|
550
550
|
pre_seq_pos, pre_indices_pos = self.__sense2pos(self.pre_mrna, self.pre_indices)
|
|
551
551
|
|
|
@@ -725,7 +725,17 @@ def pang_one_hot_encode(seq):
|
|
|
725
725
|
seq = np.asarray(list(map(int, list(seq))))
|
|
726
726
|
return IN_MAP[seq.astype('int8')]
|
|
727
727
|
|
|
728
|
+
|
|
729
|
+
def get_pos_seq_indices(t):
|
|
730
|
+
seq, indices = t.pre_mrna, t.pre_indices
|
|
731
|
+
if t.rev:
|
|
732
|
+
return reverse_complement(seq), indices[::-1]
|
|
733
|
+
else:
|
|
734
|
+
return seq, indices
|
|
735
|
+
|
|
736
|
+
|
|
728
737
|
def pangolin_predict_probs(true_seq, models):
|
|
738
|
+
# print(f"Running pangolin on: {true_seq}")
|
|
729
739
|
model_nums = [0, 2, 4, 6]
|
|
730
740
|
INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
|
|
731
741
|
|
|
@@ -752,44 +762,38 @@ def pangolin_predict_probs(true_seq, models):
|
|
|
752
762
|
splicing_pred = np.array(scores).max(axis=0)
|
|
753
763
|
donor_probs = [splicing_pred[i] * donor_dinucleotide[i] for i in range(len(true_seq))]
|
|
754
764
|
acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
|
|
755
|
-
return donor_probs, acceptor_probs
|
|
765
|
+
return donor_probs[5000:-5000], acceptor_probs[5000:-5000]
|
|
756
766
|
|
|
757
|
-
|
|
767
|
+
|
|
768
|
+
def find_transcript_missplicing(mutations, ref_transcript, var_transcript, context=7500, threshold=0.5,
|
|
769
|
+
engine='spliceai'):
|
|
758
770
|
positions = mutations.positions
|
|
759
771
|
end_positions = [m.start + len(m.ref) for m in mutations.variants]
|
|
760
772
|
positions.extend(end_positions)
|
|
773
|
+
center = int(np.mean(positions) // 1)
|
|
774
|
+
|
|
775
|
+
seq_start_pos, seq_end_pos = center - context, center + context
|
|
776
|
+
transcript_start, transcript_end, rev = ref_transcript.transcript_lower, ref_transcript.transcript_upper, ref_transcript.rev
|
|
777
|
+
|
|
778
|
+
# Generate reference sequence data
|
|
779
|
+
ref_seq, ref_indices = get_pos_seq_indices(ref_transcript)
|
|
780
|
+
center_index = ref_indices.index(center)
|
|
781
|
+
start_cutoff = ref_indices.index(seq_start_pos) if seq_start_pos in ref_indices else 0
|
|
782
|
+
end_cutoff = ref_indices.index(seq_end_pos) if seq_end_pos in ref_indices else len(ref_indices)
|
|
783
|
+
start_pad, end_pad = max(0, context - (center_index - start_cutoff)), max(0, context - (end_cutoff - center_index))
|
|
784
|
+
ref_seq = 'N' * start_pad + ref_seq[start_cutoff:end_cutoff] + 'N' * end_pad
|
|
785
|
+
ref_indices = [-1] * start_pad + ref_indices[start_cutoff:end_cutoff] + [-1] * end_pad
|
|
786
|
+
|
|
787
|
+
# Generate mutation sequence data
|
|
788
|
+
mut_seq, mut_indices = get_pos_seq_indices(var_transcript)
|
|
789
|
+
start_cutoff = mut_indices.index(seq_start_pos) if seq_start_pos in mut_indices else 0
|
|
790
|
+
end_cutoff = mut_indices.index(seq_end_pos) if seq_end_pos in mut_indices else len(mut_indices)
|
|
791
|
+
start_pad, end_pad = max(0, context - (center_index - start_cutoff)), max(0, context - (end_cutoff - center_index))
|
|
792
|
+
mut_seq = 'N' * start_pad + mut_seq[start_cutoff:end_cutoff] + 'N' * end_pad
|
|
793
|
+
mut_indices = [-1] * start_pad + mut_indices[start_cutoff:end_cutoff] + [-1] * end_pad
|
|
761
794
|
|
|
762
|
-
seq_start_pos = min(positions) - sai_mrg_context - min_coverage
|
|
763
|
-
seq_end_pos = max(positions) + sai_mrg_context + min_coverage
|
|
764
|
-
|
|
765
|
-
fasta_obj = Fasta_segment()
|
|
766
|
-
ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
|
|
767
|
-
config_setup[transcript_data.organism]['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
|
|
768
|
-
seq_start_pos,
|
|
769
|
-
seq_end_pos)
|
|
770
|
-
|
|
771
|
-
transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
|
|
772
|
-
|
|
773
|
-
# visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
|
|
774
|
-
# visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
|
|
775
|
-
|
|
776
|
-
start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
|
|
777
|
-
end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
|
|
778
|
-
end_pad = len(ref_indices) - end_cutoff
|
|
779
|
-
ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
|
|
780
|
-
ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
|
|
781
|
-
mut_seq, mut_indices = ref_seq, ref_indices
|
|
782
|
-
|
|
783
|
-
for mut in mutations:
|
|
784
|
-
mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
|
|
785
|
-
|
|
786
|
-
ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
|
|
787
|
-
mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
|
|
788
795
|
copy_mut_indices = mut_indices.copy()
|
|
789
796
|
|
|
790
|
-
visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
|
|
791
|
-
visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
|
|
792
|
-
|
|
793
797
|
if rev:
|
|
794
798
|
ref_seq = reverse_complement(ref_seq)
|
|
795
799
|
mut_seq = reverse_complement(mut_seq)
|
|
@@ -801,11 +805,20 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
|
|
|
801
805
|
mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
|
|
802
806
|
ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
|
|
803
807
|
mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
|
|
808
|
+
ref_indices, mut_indices = ref_indices[5000:-5000], mut_indices[5000:-5000]
|
|
809
|
+
|
|
804
810
|
|
|
805
811
|
elif engine == 'pangolin':
|
|
806
|
-
ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq,
|
|
807
|
-
mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq,
|
|
812
|
+
ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
|
|
813
|
+
mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, models=pang_models)
|
|
814
|
+
ref_indices, mut_indices = ref_indices[5000:-5000], mut_indices[5000:-5000]
|
|
808
815
|
|
|
816
|
+
else:
|
|
817
|
+
raise ValueError(f"{engine} not implemented")
|
|
818
|
+
|
|
819
|
+
visible_donors = np.intersect1d(ref_transcript.donors, ref_indices)
|
|
820
|
+
visible_acceptors = np.intersect1d(ref_transcript.acceptors, ref_indices)
|
|
821
|
+
# print(ref_indices.index(visible_donors[0]), ref_seq_donor_probs[ref_indices.index(visible_donors[0])], mut_seq_donor_probs[mut_indices.index(visible_donors[0])])
|
|
809
822
|
|
|
810
823
|
assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
|
|
811
824
|
assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
|
|
@@ -813,7 +826,7 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
|
|
|
813
826
|
iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
|
|
814
827
|
{p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
|
|
815
828
|
visible_acceptors,
|
|
816
|
-
threshold=
|
|
829
|
+
threshold=threshold)
|
|
817
830
|
|
|
818
831
|
assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
|
|
819
832
|
assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
|
|
@@ -821,13 +834,15 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
|
|
|
821
834
|
idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
|
|
822
835
|
{p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
|
|
823
836
|
visible_donors,
|
|
824
|
-
threshold=
|
|
837
|
+
threshold=threshold)
|
|
825
838
|
|
|
826
839
|
ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
|
|
827
840
|
ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
|
|
828
841
|
|
|
829
|
-
lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
|
|
830
|
-
|
|
842
|
+
lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
|
|
843
|
+
visible_acceptors if p not in mut_indices and p not in dap}
|
|
844
|
+
lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors
|
|
845
|
+
if p not in mut_indices and p not in ddp}
|
|
831
846
|
dap.update(lost_acceptors)
|
|
832
847
|
ddp.update(lost_donors)
|
|
833
848
|
|
|
@@ -836,6 +851,90 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
|
|
|
836
851
|
return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
|
|
837
852
|
|
|
838
853
|
|
|
854
|
+
# def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
|
|
855
|
+
# positions = mutations.positions
|
|
856
|
+
# end_positions = [m.start + len(m.ref) for m in mutations.variants]
|
|
857
|
+
# positions.extend(end_positions)
|
|
858
|
+
#
|
|
859
|
+
# seq_start_pos = min(positions) - sai_mrg_context - min_coverage
|
|
860
|
+
# seq_end_pos = max(positions) + sai_mrg_context + min_coverage
|
|
861
|
+
#
|
|
862
|
+
# fasta_obj = Fasta_segment()
|
|
863
|
+
# ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
|
|
864
|
+
# config_setup[transcript_data.organism]['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
|
|
865
|
+
# seq_start_pos,
|
|
866
|
+
# seq_end_pos)
|
|
867
|
+
#
|
|
868
|
+
# transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
|
|
869
|
+
#
|
|
870
|
+
# # visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
|
|
871
|
+
# # visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
|
|
872
|
+
#
|
|
873
|
+
# start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
|
|
874
|
+
# end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
|
|
875
|
+
# end_pad = len(ref_indices) - end_cutoff
|
|
876
|
+
# ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
|
|
877
|
+
# ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
|
|
878
|
+
# mut_seq, mut_indices = ref_seq, ref_indices
|
|
879
|
+
#
|
|
880
|
+
# for mut in mutations:
|
|
881
|
+
# mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
|
|
882
|
+
#
|
|
883
|
+
# ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
|
|
884
|
+
# mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
|
|
885
|
+
# copy_mut_indices = mut_indices.copy()
|
|
886
|
+
#
|
|
887
|
+
# visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
|
|
888
|
+
# visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
|
|
889
|
+
#
|
|
890
|
+
# if rev:
|
|
891
|
+
# ref_seq = reverse_complement(ref_seq)
|
|
892
|
+
# mut_seq = reverse_complement(mut_seq)
|
|
893
|
+
# ref_indices = ref_indices[::-1]
|
|
894
|
+
# mut_indices = mut_indices[::-1]
|
|
895
|
+
#
|
|
896
|
+
# if engine == 'spliceai':
|
|
897
|
+
# ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
|
|
898
|
+
# mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
|
|
899
|
+
# ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
|
|
900
|
+
# mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
|
|
901
|
+
#
|
|
902
|
+
# elif engine == 'pangolin':
|
|
903
|
+
# ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models=pang_models)
|
|
904
|
+
# mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models=pang_models)
|
|
905
|
+
#
|
|
906
|
+
# else:
|
|
907
|
+
# raise ValueError(f"{engine} not implemented")
|
|
908
|
+
#
|
|
909
|
+
# assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
|
|
910
|
+
# assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
|
|
911
|
+
#
|
|
912
|
+
# iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
|
|
913
|
+
# {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
|
|
914
|
+
# visible_acceptors,
|
|
915
|
+
# threshold=sai_threshold)
|
|
916
|
+
#
|
|
917
|
+
# assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
|
|
918
|
+
# assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
|
|
919
|
+
#
|
|
920
|
+
# idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
|
|
921
|
+
# {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
|
|
922
|
+
# visible_donors,
|
|
923
|
+
# threshold=sai_threshold)
|
|
924
|
+
#
|
|
925
|
+
# ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
|
|
926
|
+
# ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
|
|
927
|
+
#
|
|
928
|
+
# lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in visible_acceptors if p not in mut_indices and p not in dap}
|
|
929
|
+
# lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors if p not in mut_indices and p not in ddp}
|
|
930
|
+
# dap.update(lost_acceptors)
|
|
931
|
+
# ddp.update(lost_donors)
|
|
932
|
+
#
|
|
933
|
+
# missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
|
|
934
|
+
# missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
|
|
935
|
+
# return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
|
|
936
|
+
|
|
937
|
+
|
|
839
938
|
# def run_spliceai(mutations, gene_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
|
|
840
939
|
# positions = mutations.positions
|
|
841
940
|
# seq_start_pos = min(positions) - sai_mrg_context - min_coverage
|
|
@@ -1400,18 +1499,15 @@ def moving_average_conv(vector, window_size, factor=1):
|
|
|
1400
1499
|
|
|
1401
1500
|
return np.convolve(vector, np.ones(window_size), mode='same') / window_size
|
|
1402
1501
|
|
|
1403
|
-
def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False,
|
|
1502
|
+
def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcript=False, window_length=13, save_spliceai_results=False, force_spliceai=False, organism='hg38'):
|
|
1404
1503
|
mutation = Variations(mut_id)
|
|
1405
|
-
try:
|
|
1406
|
-
|
|
1407
|
-
except FileNotFoundError:
|
|
1408
|
-
|
|
1504
|
+
# try:
|
|
1505
|
+
reference_gene = Gene(mutation.gene, organism=organism)
|
|
1506
|
+
# except FileNotFoundError:
|
|
1507
|
+
# return pd.DataFrame()
|
|
1409
1508
|
|
|
1410
|
-
|
|
1411
|
-
mutated_gene = Gene(mutation.gene, mut_id)
|
|
1412
|
-
# if not per_transcript_missplicing:
|
|
1413
|
-
# missplicing_obj = PredictSpliceAI(mutation, reference_gene, threshold=sai_threshold, force=True, save_results=False)
|
|
1414
|
-
# missplicing = missplicing_obj.missplicing
|
|
1509
|
+
reference_gene_proteins = {g.protein: g.transcript_id for g in reference_gene.run_transcripts()}
|
|
1510
|
+
mutated_gene = Gene(mutation.gene, mut_id, organism=organism)
|
|
1415
1511
|
|
|
1416
1512
|
results = []
|
|
1417
1513
|
for variant in mutated_gene.run_transcripts(protein_coding=protein_coding, primary_transcript=primary_transcript):
|
|
@@ -1420,10 +1516,9 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
|
|
|
1420
1516
|
continue
|
|
1421
1517
|
|
|
1422
1518
|
cons_vector = transform_conservation_vector(reference.cons_vector, window=window_length)
|
|
1423
|
-
# if per_transcript_missplicing:
|
|
1424
1519
|
missplicing_obj = PredictSpliceAI(mutation, reference, threshold=sai_threshold, force=force_spliceai, save_results=save_spliceai_results)
|
|
1425
1520
|
missplicing = missplicing_obj.apply_sai_threshold_primary(threshold=sai_threshold)
|
|
1426
|
-
|
|
1521
|
+
|
|
1427
1522
|
for i, new_boundaries in enumerate(develop_aberrant_splicing(variant, missplicing)):
|
|
1428
1523
|
variant_isoform = deepcopy(variant)
|
|
1429
1524
|
variant_isoform.reset_acceptors(acceptors=new_boundaries['acceptors']).reset_donors(donors=new_boundaries['donors']).organize().generate_protein()
|
|
@@ -1432,9 +1527,6 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
|
|
|
1432
1527
|
modified_positions = find_modified_positions(len(reference.protein), deleted, inserted)
|
|
1433
1528
|
temp_cons = np.convolve(cons_vector * modified_positions, np.ones(window_length)) / window_length
|
|
1434
1529
|
affected_cons_scores = max(temp_cons)
|
|
1435
|
-
# temp_cons = np.convolve(cons_vector, np.ones(window_length))
|
|
1436
|
-
# print(temp_cons)
|
|
1437
|
-
# print(cons_vector)
|
|
1438
1530
|
percentile = (
|
|
1439
1531
|
sorted(cons_vector).index(next(x for x in sorted(cons_vector) if x >= affected_cons_scores)) / len(
|
|
1440
1532
|
cons_vector))
|
|
@@ -1449,7 +1541,7 @@ def oncosplice(mut_id, sai_threshold=0.5, protein_coding=True, primary_transcrip
|
|
|
1449
1541
|
report['isoform_prevalence'] = new_boundaries['path_weight']
|
|
1450
1542
|
report['full_missplicing'] = missplicing
|
|
1451
1543
|
report['missplicing'] = max(missplicing_obj)
|
|
1452
|
-
report['reference_resemblance'] =
|
|
1544
|
+
report['reference_resemblance'] = reference_gene_proteins.get(variant_isoform.protein, None)
|
|
1453
1545
|
results.append(report)
|
|
1454
1546
|
|
|
1455
1547
|
report = pd.DataFrame(results)
|
geney/power_utils.py
CHANGED
|
@@ -38,7 +38,7 @@ def write_executors(folder_path, script='geney.power_utils', input_file='/tamir2
|
|
|
38
38
|
|
|
39
39
|
def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
|
|
40
40
|
walltime="24:00:00", dashboard_address=":23154",
|
|
41
|
-
log_directory="dask-logs", slurm=False):
|
|
41
|
+
log_directory="dask-logs", slurm=False, organism='hg38'):
|
|
42
42
|
"""
|
|
43
43
|
Launch a Dask cluster using PBS.
|
|
44
44
|
|
|
@@ -63,7 +63,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
|
|
|
63
63
|
walltime='7200',
|
|
64
64
|
scheduler_options={"dashboard_address": dashboard_address},
|
|
65
65
|
log_directory=log_directory,
|
|
66
|
-
job_script_prologue=[f"cd {config_setup['BASE']}"]
|
|
66
|
+
job_script_prologue=[f"cd {config_setup[organism]['BASE']}"]
|
|
67
67
|
)
|
|
68
68
|
|
|
69
69
|
else:
|
|
@@ -75,7 +75,7 @@ def launch_dask_cluster(memory_size="3GB", num_workers=10, queue="tamirQ",
|
|
|
75
75
|
walltime=walltime,
|
|
76
76
|
scheduler_options={"dashboard_address": dashboard_address},
|
|
77
77
|
log_directory=log_directory,
|
|
78
|
-
job_script_prologue=[f"cd {config_setup['BASE']}"]
|
|
78
|
+
job_script_prologue=[f"cd {config_setup[organism]['BASE']}"]
|
|
79
79
|
)
|
|
80
80
|
|
|
81
81
|
dask_cluster.scale(num_workers)
|
geney/tcga_utils.py
CHANGED
|
@@ -363,8 +363,8 @@ class TCGAGene:
|
|
|
363
363
|
# return cases
|
|
364
364
|
#
|
|
365
365
|
#
|
|
366
|
-
|
|
367
|
-
|
|
366
|
+
def create_mut_id(row):
|
|
367
|
+
return f"{row.Gene_name}:{row['Chromosome']}:{row['Start_Position']}:{row['Reference_Allele']}:{row['Tumor_Seq_Allele2']}"
|
|
368
368
|
#
|
|
369
369
|
#
|
|
370
370
|
# def is_in_exon(mut_id, tid):
|
|
@@ -9,15 +9,15 @@ geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
|
9
9
|
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
10
10
|
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
11
11
|
geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
|
|
12
|
-
geney/oncosplice.py,sha256=
|
|
12
|
+
geney/oncosplice.py,sha256=aaOxri0bXLPfB3Mu8tkzk4KEVlzzocImzj3MI-0uU_0,76949
|
|
13
13
|
geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
|
|
14
14
|
geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
|
|
15
15
|
geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
|
|
16
|
-
geney/power_utils.py,sha256=
|
|
16
|
+
geney/power_utils.py,sha256=nppfT1-bOC1dnvfRs55LipjoWDlRrOqWiuCMH0v1auU,7303
|
|
17
17
|
geney/survival.py,sha256=gNKZGcwxDZ00ixVBHf3ZdjbY_AHQOCU9kKpBC_dokbM,5572
|
|
18
18
|
geney/survival_utils.py,sha256=2CAkC2LsspicHIdrqsiPnjgvpr5KHDUfLFFqnRbPJqs,5762
|
|
19
19
|
geney/tcga_annotations.py,sha256=DjRl6Pk5VAOL1yhbt8SXD6FZhYbcYNu3FtXYMeveGB0,15016
|
|
20
|
-
geney/tcga_utils.py,sha256=
|
|
20
|
+
geney/tcga_utils.py,sha256=vXSMf1OxoF_AdE_rMguy_BoYaart_E1t4FFMx2DS1Ak,15585
|
|
21
21
|
geney/utils.py,sha256=xJi7fk3g7DkR2rKOb8WePLQNM1ib83rcHecwRdwd5lA,2036
|
|
22
22
|
geney/analyzers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
23
|
geney/analyzers/benchmark_clinvar.py,sha256=ZAxvZ-Ue5T6au5mGbk8clfvbAYl13NIY7U92KzL0lXI,5531
|
|
@@ -45,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
|
|
|
45
45
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
46
46
|
geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
47
|
geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
48
|
-
geney-1.2.
|
|
49
|
-
geney-1.2.
|
|
50
|
-
geney-1.2.
|
|
51
|
-
geney-1.2.
|
|
48
|
+
geney-1.2.6.dist-info/METADATA,sha256=hcHNz2oNxRLzNpimwrHeM2yKSnd1z5KHJ_Rl86LD3QE,1198
|
|
49
|
+
geney-1.2.6.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
|
|
50
|
+
geney-1.2.6.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
51
|
+
geney-1.2.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|