geney 1.2.5__py2.py3-none-any.whl → 1.2.6__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/oncosplice.py
CHANGED
|
@@ -545,7 +545,7 @@ class Transcript:
|
|
|
545
545
|
def generate_mature_mrna_pos(self, reset=True):
|
|
546
546
|
if reset:
|
|
547
547
|
pre_seq_pos, pre_indices_pos = self.generate_pre_mrna_pos()
|
|
548
|
-
self.pre_mrna,
|
|
548
|
+
self.pre_mrna, self.pre_indices = self.__pos2sense(pre_seq_pos, pre_indices_pos)
|
|
549
549
|
else:
|
|
550
550
|
pre_seq_pos, pre_indices_pos = self.__sense2pos(self.pre_mrna, self.pre_indices)
|
|
551
551
|
|
|
@@ -725,7 +725,17 @@ def pang_one_hot_encode(seq):
|
|
|
725
725
|
seq = np.asarray(list(map(int, list(seq))))
|
|
726
726
|
return IN_MAP[seq.astype('int8')]
|
|
727
727
|
|
|
728
|
+
|
|
729
|
+
def get_pos_seq_indices(t):
|
|
730
|
+
seq, indices = t.pre_mrna, t.pre_indices
|
|
731
|
+
if t.rev:
|
|
732
|
+
return reverse_complement(seq), indices[::-1]
|
|
733
|
+
else:
|
|
734
|
+
return seq, indices
|
|
735
|
+
|
|
736
|
+
|
|
728
737
|
def pangolin_predict_probs(true_seq, models):
|
|
738
|
+
# print(f"Running pangolin on: {true_seq}")
|
|
729
739
|
model_nums = [0, 2, 4, 6]
|
|
730
740
|
INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
|
|
731
741
|
|
|
@@ -754,42 +764,36 @@ def pangolin_predict_probs(true_seq, models):
|
|
|
754
764
|
acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
|
|
755
765
|
return donor_probs[5000:-5000], acceptor_probs[5000:-5000]
|
|
756
766
|
|
|
757
|
-
|
|
767
|
+
|
|
768
|
+
def find_transcript_missplicing(mutations, ref_transcript, var_transcript, context=7500, threshold=0.5,
|
|
769
|
+
engine='spliceai'):
|
|
758
770
|
positions = mutations.positions
|
|
759
771
|
end_positions = [m.start + len(m.ref) for m in mutations.variants]
|
|
760
772
|
positions.extend(end_positions)
|
|
773
|
+
center = int(np.mean(positions) // 1)
|
|
774
|
+
|
|
775
|
+
seq_start_pos, seq_end_pos = center - context, center + context
|
|
776
|
+
transcript_start, transcript_end, rev = ref_transcript.transcript_lower, ref_transcript.transcript_upper, ref_transcript.rev
|
|
777
|
+
|
|
778
|
+
# Generate reference sequence data
|
|
779
|
+
ref_seq, ref_indices = get_pos_seq_indices(ref_transcript)
|
|
780
|
+
center_index = ref_indices.index(center)
|
|
781
|
+
start_cutoff = ref_indices.index(seq_start_pos) if seq_start_pos in ref_indices else 0
|
|
782
|
+
end_cutoff = ref_indices.index(seq_end_pos) if seq_end_pos in ref_indices else len(ref_indices)
|
|
783
|
+
start_pad, end_pad = max(0, context - (center_index - start_cutoff)), max(0, context - (end_cutoff - center_index))
|
|
784
|
+
ref_seq = 'N' * start_pad + ref_seq[start_cutoff:end_cutoff] + 'N' * end_pad
|
|
785
|
+
ref_indices = [-1] * start_pad + ref_indices[start_cutoff:end_cutoff] + [-1] * end_pad
|
|
786
|
+
|
|
787
|
+
# Generate mutation sequence data
|
|
788
|
+
mut_seq, mut_indices = get_pos_seq_indices(var_transcript)
|
|
789
|
+
start_cutoff = mut_indices.index(seq_start_pos) if seq_start_pos in mut_indices else 0
|
|
790
|
+
end_cutoff = mut_indices.index(seq_end_pos) if seq_end_pos in mut_indices else len(mut_indices)
|
|
791
|
+
start_pad, end_pad = max(0, context - (center_index - start_cutoff)), max(0, context - (end_cutoff - center_index))
|
|
792
|
+
mut_seq = 'N' * start_pad + mut_seq[start_cutoff:end_cutoff] + 'N' * end_pad
|
|
793
|
+
mut_indices = [-1] * start_pad + mut_indices[start_cutoff:end_cutoff] + [-1] * end_pad
|
|
761
794
|
|
|
762
|
-
seq_start_pos = min(positions) - sai_mrg_context - min_coverage
|
|
763
|
-
seq_end_pos = max(positions) + sai_mrg_context + min_coverage
|
|
764
|
-
|
|
765
|
-
fasta_obj = Fasta_segment()
|
|
766
|
-
ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
|
|
767
|
-
config_setup[transcript_data.organism]['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
|
|
768
|
-
seq_start_pos,
|
|
769
|
-
seq_end_pos)
|
|
770
|
-
|
|
771
|
-
transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
|
|
772
|
-
|
|
773
|
-
# visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
|
|
774
|
-
# visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
|
|
775
|
-
|
|
776
|
-
start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
|
|
777
|
-
end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
|
|
778
|
-
end_pad = len(ref_indices) - end_cutoff
|
|
779
|
-
ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
|
|
780
|
-
ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
|
|
781
|
-
mut_seq, mut_indices = ref_seq, ref_indices
|
|
782
|
-
|
|
783
|
-
for mut in mutations:
|
|
784
|
-
mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
|
|
785
|
-
|
|
786
|
-
ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
|
|
787
|
-
mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
|
|
788
795
|
copy_mut_indices = mut_indices.copy()
|
|
789
796
|
|
|
790
|
-
visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
|
|
791
|
-
visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
|
|
792
|
-
|
|
793
797
|
if rev:
|
|
794
798
|
ref_seq = reverse_complement(ref_seq)
|
|
795
799
|
mut_seq = reverse_complement(mut_seq)
|
|
@@ -801,21 +805,28 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
|
|
|
801
805
|
mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
|
|
802
806
|
ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
|
|
803
807
|
mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
|
|
808
|
+
ref_indices, mut_indices = ref_indices[5000:-5000], mut_indices[5000:-5000]
|
|
809
|
+
|
|
804
810
|
|
|
805
811
|
elif engine == 'pangolin':
|
|
806
|
-
ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq,
|
|
807
|
-
mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq,
|
|
812
|
+
ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
|
|
813
|
+
mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, models=pang_models)
|
|
814
|
+
ref_indices, mut_indices = ref_indices[5000:-5000], mut_indices[5000:-5000]
|
|
808
815
|
|
|
809
816
|
else:
|
|
810
817
|
raise ValueError(f"{engine} not implemented")
|
|
811
818
|
|
|
819
|
+
visible_donors = np.intersect1d(ref_transcript.donors, ref_indices)
|
|
820
|
+
visible_acceptors = np.intersect1d(ref_transcript.acceptors, ref_indices)
|
|
821
|
+
# print(ref_indices.index(visible_donors[0]), ref_seq_donor_probs[ref_indices.index(visible_donors[0])], mut_seq_donor_probs[mut_indices.index(visible_donors[0])])
|
|
822
|
+
|
|
812
823
|
assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
|
|
813
824
|
assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
|
|
814
825
|
|
|
815
826
|
iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
|
|
816
827
|
{p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
|
|
817
828
|
visible_acceptors,
|
|
818
|
-
threshold=
|
|
829
|
+
threshold=threshold)
|
|
819
830
|
|
|
820
831
|
assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
|
|
821
832
|
assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
|
|
@@ -823,13 +834,15 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
|
|
|
823
834
|
idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
|
|
824
835
|
{p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
|
|
825
836
|
visible_donors,
|
|
826
|
-
threshold=
|
|
837
|
+
threshold=threshold)
|
|
827
838
|
|
|
828
839
|
ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
|
|
829
840
|
ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
|
|
830
841
|
|
|
831
|
-
lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
|
|
832
|
-
|
|
842
|
+
lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
|
|
843
|
+
visible_acceptors if p not in mut_indices and p not in dap}
|
|
844
|
+
lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors
|
|
845
|
+
if p not in mut_indices and p not in ddp}
|
|
833
846
|
dap.update(lost_acceptors)
|
|
834
847
|
ddp.update(lost_donors)
|
|
835
848
|
|
|
@@ -838,6 +851,90 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
|
|
|
838
851
|
return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
|
|
839
852
|
|
|
840
853
|
|
|
854
|
+
# def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
|
|
855
|
+
# positions = mutations.positions
|
|
856
|
+
# end_positions = [m.start + len(m.ref) for m in mutations.variants]
|
|
857
|
+
# positions.extend(end_positions)
|
|
858
|
+
#
|
|
859
|
+
# seq_start_pos = min(positions) - sai_mrg_context - min_coverage
|
|
860
|
+
# seq_end_pos = max(positions) + sai_mrg_context + min_coverage
|
|
861
|
+
#
|
|
862
|
+
# fasta_obj = Fasta_segment()
|
|
863
|
+
# ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
|
|
864
|
+
# config_setup[transcript_data.organism]['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
|
|
865
|
+
# seq_start_pos,
|
|
866
|
+
# seq_end_pos)
|
|
867
|
+
#
|
|
868
|
+
# transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
|
|
869
|
+
#
|
|
870
|
+
# # visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
|
|
871
|
+
# # visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
|
|
872
|
+
#
|
|
873
|
+
# start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
|
|
874
|
+
# end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
|
|
875
|
+
# end_pad = len(ref_indices) - end_cutoff
|
|
876
|
+
# ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
|
|
877
|
+
# ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
|
|
878
|
+
# mut_seq, mut_indices = ref_seq, ref_indices
|
|
879
|
+
#
|
|
880
|
+
# for mut in mutations:
|
|
881
|
+
# mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
|
|
882
|
+
#
|
|
883
|
+
# ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
|
|
884
|
+
# mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
|
|
885
|
+
# copy_mut_indices = mut_indices.copy()
|
|
886
|
+
#
|
|
887
|
+
# visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
|
|
888
|
+
# visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
|
|
889
|
+
#
|
|
890
|
+
# if rev:
|
|
891
|
+
# ref_seq = reverse_complement(ref_seq)
|
|
892
|
+
# mut_seq = reverse_complement(mut_seq)
|
|
893
|
+
# ref_indices = ref_indices[::-1]
|
|
894
|
+
# mut_indices = mut_indices[::-1]
|
|
895
|
+
#
|
|
896
|
+
# if engine == 'spliceai':
|
|
897
|
+
# ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
|
|
898
|
+
# mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
|
|
899
|
+
# ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
|
|
900
|
+
# mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
|
|
901
|
+
#
|
|
902
|
+
# elif engine == 'pangolin':
|
|
903
|
+
# ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models=pang_models)
|
|
904
|
+
# mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models=pang_models)
|
|
905
|
+
#
|
|
906
|
+
# else:
|
|
907
|
+
# raise ValueError(f"{engine} not implemented")
|
|
908
|
+
#
|
|
909
|
+
# assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
|
|
910
|
+
# assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
|
|
911
|
+
#
|
|
912
|
+
# iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
|
|
913
|
+
# {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
|
|
914
|
+
# visible_acceptors,
|
|
915
|
+
# threshold=sai_threshold)
|
|
916
|
+
#
|
|
917
|
+
# assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
|
|
918
|
+
# assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
|
|
919
|
+
#
|
|
920
|
+
# idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
|
|
921
|
+
# {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
|
|
922
|
+
# visible_donors,
|
|
923
|
+
# threshold=sai_threshold)
|
|
924
|
+
#
|
|
925
|
+
# ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
|
|
926
|
+
# ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
|
|
927
|
+
#
|
|
928
|
+
# lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in visible_acceptors if p not in mut_indices and p not in dap}
|
|
929
|
+
# lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors if p not in mut_indices and p not in ddp}
|
|
930
|
+
# dap.update(lost_acceptors)
|
|
931
|
+
# ddp.update(lost_donors)
|
|
932
|
+
#
|
|
933
|
+
# missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
|
|
934
|
+
# missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
|
|
935
|
+
# return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
|
|
936
|
+
|
|
937
|
+
|
|
841
938
|
# def run_spliceai(mutations, gene_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
|
|
842
939
|
# positions = mutations.positions
|
|
843
940
|
# seq_start_pos = min(positions) - sai_mrg_context - min_coverage
|
|
@@ -9,7 +9,7 @@ geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
|
9
9
|
geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
|
|
10
10
|
geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
|
|
11
11
|
geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
|
|
12
|
-
geney/oncosplice.py,sha256=
|
|
12
|
+
geney/oncosplice.py,sha256=aaOxri0bXLPfB3Mu8tkzk4KEVlzzocImzj3MI-0uU_0,76949
|
|
13
13
|
geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
|
|
14
14
|
geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
|
|
15
15
|
geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
|
|
@@ -45,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
|
|
|
45
45
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
46
46
|
geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
47
|
geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
48
|
-
geney-1.2.
|
|
49
|
-
geney-1.2.
|
|
50
|
-
geney-1.2.
|
|
51
|
-
geney-1.2.
|
|
48
|
+
geney-1.2.6.dist-info/METADATA,sha256=hcHNz2oNxRLzNpimwrHeM2yKSnd1z5KHJ_Rl86LD3QE,1198
|
|
49
|
+
geney-1.2.6.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
|
|
50
|
+
geney-1.2.6.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
51
|
+
geney-1.2.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|