geney 1.2.5__py2.py3-none-any.whl → 1.2.6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/oncosplice.py CHANGED
@@ -545,7 +545,7 @@ class Transcript:
545
545
  def generate_mature_mrna_pos(self, reset=True):
546
546
  if reset:
547
547
  pre_seq_pos, pre_indices_pos = self.generate_pre_mrna_pos()
548
- self.pre_mrna, _ = self.__pos2sense(pre_seq_pos, pre_indices_pos)
548
+ self.pre_mrna, self.pre_indices = self.__pos2sense(pre_seq_pos, pre_indices_pos)
549
549
  else:
550
550
  pre_seq_pos, pre_indices_pos = self.__sense2pos(self.pre_mrna, self.pre_indices)
551
551
 
@@ -725,7 +725,17 @@ def pang_one_hot_encode(seq):
725
725
  seq = np.asarray(list(map(int, list(seq))))
726
726
  return IN_MAP[seq.astype('int8')]
727
727
 
728
+
729
+ def get_pos_seq_indices(t):
730
+ seq, indices = t.pre_mrna, t.pre_indices
731
+ if t.rev:
732
+ return reverse_complement(seq), indices[::-1]
733
+ else:
734
+ return seq, indices
735
+
736
+
728
737
  def pangolin_predict_probs(true_seq, models):
738
+ # print(f"Running pangolin on: {true_seq}")
729
739
  model_nums = [0, 2, 4, 6]
730
740
  INDEX_MAP = {0: 1, 1: 2, 2: 4, 3: 5, 4: 7, 5: 8, 6: 10, 7: 11}
731
741
 
@@ -754,42 +764,36 @@ def pangolin_predict_probs(true_seq, models):
754
764
  acceptor_probs = [splicing_pred[i] * acceptor_dinucleotide[i] for i in range(len(true_seq))]
755
765
  return donor_probs[5000:-5000], acceptor_probs[5000:-5000]
756
766
 
757
- def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
767
+
768
+ def find_transcript_missplicing(mutations, ref_transcript, var_transcript, context=7500, threshold=0.5,
769
+ engine='spliceai'):
758
770
  positions = mutations.positions
759
771
  end_positions = [m.start + len(m.ref) for m in mutations.variants]
760
772
  positions.extend(end_positions)
773
+ center = int(np.mean(positions) // 1)
774
+
775
+ seq_start_pos, seq_end_pos = center - context, center + context
776
+ transcript_start, transcript_end, rev = ref_transcript.transcript_lower, ref_transcript.transcript_upper, ref_transcript.rev
777
+
778
+ # Generate reference sequence data
779
+ ref_seq, ref_indices = get_pos_seq_indices(ref_transcript)
780
+ center_index = ref_indices.index(center)
781
+ start_cutoff = ref_indices.index(seq_start_pos) if seq_start_pos in ref_indices else 0
782
+ end_cutoff = ref_indices.index(seq_end_pos) if seq_end_pos in ref_indices else len(ref_indices)
783
+ start_pad, end_pad = max(0, context - (center_index - start_cutoff)), max(0, context - (end_cutoff - center_index))
784
+ ref_seq = 'N' * start_pad + ref_seq[start_cutoff:end_cutoff] + 'N' * end_pad
785
+ ref_indices = [-1] * start_pad + ref_indices[start_cutoff:end_cutoff] + [-1] * end_pad
786
+
787
+ # Generate mutation sequence data
788
+ mut_seq, mut_indices = get_pos_seq_indices(var_transcript)
789
+ start_cutoff = mut_indices.index(seq_start_pos) if seq_start_pos in mut_indices else 0
790
+ end_cutoff = mut_indices.index(seq_end_pos) if seq_end_pos in mut_indices else len(mut_indices)
791
+ start_pad, end_pad = max(0, context - (center_index - start_cutoff)), max(0, context - (end_cutoff - center_index))
792
+ mut_seq = 'N' * start_pad + mut_seq[start_cutoff:end_cutoff] + 'N' * end_pad
793
+ mut_indices = [-1] * start_pad + mut_indices[start_cutoff:end_cutoff] + [-1] * end_pad
761
794
 
762
- seq_start_pos = min(positions) - sai_mrg_context - min_coverage
763
- seq_end_pos = max(positions) + sai_mrg_context + min_coverage
764
-
765
- fasta_obj = Fasta_segment()
766
- ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
767
- config_setup[transcript_data.organism]['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
768
- seq_start_pos,
769
- seq_end_pos)
770
-
771
- transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
772
-
773
- # visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
774
- # visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
775
-
776
- start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
777
- end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
778
- end_pad = len(ref_indices) - end_cutoff
779
- ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
780
- ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
781
- mut_seq, mut_indices = ref_seq, ref_indices
782
-
783
- for mut in mutations:
784
- mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
785
-
786
- ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
787
- mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
788
795
  copy_mut_indices = mut_indices.copy()
789
796
 
790
- visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
791
- visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
792
-
793
797
  if rev:
794
798
  ref_seq = reverse_complement(ref_seq)
795
799
  mut_seq = reverse_complement(mut_seq)
@@ -801,21 +805,28 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
801
805
  mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
802
806
  ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
803
807
  mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
808
+ ref_indices, mut_indices = ref_indices[5000:-5000], mut_indices[5000:-5000]
809
+
804
810
 
805
811
  elif engine == 'pangolin':
806
- ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models=pang_models)
807
- mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models=pang_models)
812
+ ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, models=pang_models)
813
+ mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, models=pang_models)
814
+ ref_indices, mut_indices = ref_indices[5000:-5000], mut_indices[5000:-5000]
808
815
 
809
816
  else:
810
817
  raise ValueError(f"{engine} not implemented")
811
818
 
819
+ visible_donors = np.intersect1d(ref_transcript.donors, ref_indices)
820
+ visible_acceptors = np.intersect1d(ref_transcript.acceptors, ref_indices)
821
+ # print(ref_indices.index(visible_donors[0]), ref_seq_donor_probs[ref_indices.index(visible_donors[0])], mut_seq_donor_probs[mut_indices.index(visible_donors[0])])
822
+
812
823
  assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
813
824
  assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
814
825
 
815
826
  iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
816
827
  {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
817
828
  visible_acceptors,
818
- threshold=sai_threshold)
829
+ threshold=threshold)
819
830
 
820
831
  assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
821
832
  assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
@@ -823,13 +834,15 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
823
834
  idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
824
835
  {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
825
836
  visible_donors,
826
- threshold=sai_threshold)
837
+ threshold=threshold)
827
838
 
828
839
  ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
829
840
  ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
830
841
 
831
- lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in visible_acceptors if p not in mut_indices and p not in dap}
832
- lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors if p not in mut_indices and p not in ddp}
842
+ lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in
843
+ visible_acceptors if p not in mut_indices and p not in dap}
844
+ lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors
845
+ if p not in mut_indices and p not in ddp}
833
846
  dap.update(lost_acceptors)
834
847
  ddp.update(lost_donors)
835
848
 
@@ -838,6 +851,90 @@ def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, mi
838
851
  return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
839
852
 
840
853
 
854
+ # def run_spliceai_transcript(mutations, transcript_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5, engine='spliceai'):
855
+ # positions = mutations.positions
856
+ # end_positions = [m.start + len(m.ref) for m in mutations.variants]
857
+ # positions.extend(end_positions)
858
+ #
859
+ # seq_start_pos = min(positions) - sai_mrg_context - min_coverage
860
+ # seq_end_pos = max(positions) + sai_mrg_context + min_coverage
861
+ #
862
+ # fasta_obj = Fasta_segment()
863
+ # ref_seq, ref_indices = fasta_obj.read_segment_endpoints(
864
+ # config_setup[transcript_data.organism]['CHROM_SOURCE'] / f'chr{mutations.chrom}.fasta',
865
+ # seq_start_pos,
866
+ # seq_end_pos)
867
+ #
868
+ # transcript_start, transcript_end, rev = transcript_data.transcript_lower, transcript_data.transcript_upper, transcript_data.rev
869
+ #
870
+ # # visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
871
+ # # visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
872
+ #
873
+ # start_pad = ref_indices.index(transcript_start) if transcript_start in ref_indices else 0
874
+ # end_cutoff = ref_indices.index(transcript_end) if transcript_end in ref_indices else len(ref_indices)
875
+ # end_pad = len(ref_indices) - end_cutoff
876
+ # ref_seq = 'N' * start_pad + ref_seq[start_pad:end_cutoff] + 'N' * end_pad
877
+ # ref_indices = [-1] * start_pad + ref_indices[start_pad:end_cutoff] + [-1] * end_pad
878
+ # mut_seq, mut_indices = ref_seq, ref_indices
879
+ #
880
+ # for mut in mutations:
881
+ # mut_seq, mut_indices = generate_mut_variant(seq=mut_seq, indices=mut_indices, mut=mut)
882
+ #
883
+ # ref_indices = ref_indices[sai_mrg_context:-sai_mrg_context]
884
+ # mut_indices = mut_indices[sai_mrg_context:-sai_mrg_context]
885
+ # copy_mut_indices = mut_indices.copy()
886
+ #
887
+ # visible_donors = np.intersect1d(transcript_data.donors, ref_indices)
888
+ # visible_acceptors = np.intersect1d(transcript_data.acceptors, ref_indices)
889
+ #
890
+ # if rev:
891
+ # ref_seq = reverse_complement(ref_seq)
892
+ # mut_seq = reverse_complement(mut_seq)
893
+ # ref_indices = ref_indices[::-1]
894
+ # mut_indices = mut_indices[::-1]
895
+ #
896
+ # if engine == 'spliceai':
897
+ # ref_seq_probs_temp = sai_predict_probs(ref_seq, sai_models)
898
+ # mut_seq_probs_temp = sai_predict_probs(mut_seq, sai_models)
899
+ # ref_seq_acceptor_probs, ref_seq_donor_probs = ref_seq_probs_temp[0, :], ref_seq_probs_temp[1, :]
900
+ # mut_seq_acceptor_probs, mut_seq_donor_probs = mut_seq_probs_temp[0, :], mut_seq_probs_temp[1, :]
901
+ #
902
+ # elif engine == 'pangolin':
903
+ # ref_seq_donor_probs, ref_seq_acceptor_probs = pangolin_predict_probs(ref_seq, pangolin_models=pang_models)
904
+ # mut_seq_donor_probs, mut_seq_acceptor_probs = pangolin_predict_probs(mut_seq, pangolin_models=pang_models)
905
+ #
906
+ # else:
907
+ # raise ValueError(f"{engine} not implemented")
908
+ #
909
+ # assert len(ref_indices) == len(ref_seq_acceptor_probs), 'Reference pos not the same'
910
+ # assert len(mut_indices) == len(mut_seq_acceptor_probs), 'Mut pos not the same'
911
+ #
912
+ # iap, dap = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_acceptor_probs))},
913
+ # {p: v for p, v in list(zip(mut_indices, mut_seq_acceptor_probs))},
914
+ # visible_acceptors,
915
+ # threshold=sai_threshold)
916
+ #
917
+ # assert len(ref_indices) == len(ref_seq_donor_probs), 'Reference pos not the same'
918
+ # assert len(mut_indices) == len(mut_seq_donor_probs), 'Mut pos not the same'
919
+ #
920
+ # idp, ddp = find_ss_changes({p: v for p, v in list(zip(ref_indices, ref_seq_donor_probs))},
921
+ # {p: v for p, v in list(zip(mut_indices, mut_seq_donor_probs))},
922
+ # visible_donors,
923
+ # threshold=sai_threshold)
924
+ #
925
+ # ref_acceptors = {a: b for a, b in list(zip(ref_indices, ref_seq_acceptor_probs))}
926
+ # ref_donors = {a: b for a, b in list(zip(ref_indices, ref_seq_donor_probs))}
927
+ #
928
+ # lost_acceptors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_acceptors[p]), 3)} for p in visible_acceptors if p not in mut_indices and p not in dap}
929
+ # lost_donors = {int(p): {'absolute': np.float64(0), 'delta': round(float(-ref_donors[p]), 3)} for p in visible_donors if p not in mut_indices and p not in ddp}
930
+ # dap.update(lost_acceptors)
931
+ # ddp.update(lost_donors)
932
+ #
933
+ # missplicing = {'missed_acceptors': dap, 'missed_donors': ddp, 'discovered_acceptors': iap, 'discovered_donors': idp}
934
+ # missplicing = {outk: {float(k): v for k, v in outv.items()} for outk, outv in missplicing.items()}
935
+ # return {outk: {int(k) if k.is_integer() else k: v for k, v in outv.items()} for outk, outv in missplicing.items()}
936
+
937
+
841
938
  # def run_spliceai(mutations, gene_data, sai_mrg_context=5000, min_coverage=2500, sai_threshold=0.5):
842
939
  # positions = mutations.positions
843
940
  # seq_start_pos = min(positions) - sai_mrg_context - min_coverage
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.2.5
3
+ Version: 1.2.6
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -9,7 +9,7 @@ geney/gtex.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
9
9
  geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
10
10
  geney/immune_utils.py,sha256=ZRni5ttrhpYBnmNr0d0ZatIbNPYs4nmQuoUO00SpsS4,5271
11
11
  geney/netchop.py,sha256=AMiy9YsdTmX4B3k3Y5Yh7EmoGAojM1O3AzhPKOiB--g,3050
12
- geney/oncosplice.py,sha256=9oZs9W_bI6O5h3284WvatkerhSCaxMZWfs1xVc1lJO0,71524
12
+ geney/oncosplice.py,sha256=aaOxri0bXLPfB3Mu8tkzk4KEVlzzocImzj3MI-0uU_0,76949
13
13
  geney/oncosplice_mouse.py,sha256=LYLOukI9qI1IBkyl1qVRFR5d1NAw7Orlj8Zth-4xCW8,12962
14
14
  geney/oncosplice_pipeline.py,sha256=hpGqFHOdn8i8tvvs1-t3-G9Ko18zInwoDXBJbbrfbC4,68036
15
15
  geney/performance_utils.py,sha256=FQt7rA4r-Wuq3kceCxsSuMfj3wU1tMG8QnbL59aBohs,4700
@@ -45,7 +45,7 @@ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFW
45
45
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
46
46
  geney/translation_termination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
47
  geney/translation_termination/tts_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
- geney-1.2.5.dist-info/METADATA,sha256=9UptuZVJWZvVN6Y9KgPUxrC4gnijFVW4CtkkESxrY9E,1198
49
- geney-1.2.5.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
50
- geney-1.2.5.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
51
- geney-1.2.5.dist-info/RECORD,,
48
+ geney-1.2.6.dist-info/METADATA,sha256=hcHNz2oNxRLzNpimwrHeM2yKSnd1z5KHJ_Rl86LD3QE,1198
49
+ geney-1.2.6.dist-info/WHEEL,sha256=iYlv5fX357PQyRT2o6tw1bN-YcKFFHKqB_LwHO5wP-g,110
50
+ geney-1.2.6.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
51
+ geney-1.2.6.dist-info/RECORD,,
File without changes