geney 1.3.65__py2.py3-none-any.whl → 1.3.67__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/splicing_utils.py CHANGED
@@ -166,7 +166,7 @@ def run_splicing_engine(seq, engine='spliceai'):
166
166
  match engine:
167
167
  case 'spliceai':
168
168
  from .spliceai_utils import sai_predict_probs, sai_models
169
- donor_probs, acceptor_probs = sai_predict_probs(seq, models=sai_models)
169
+ acceptor_probs, donor_probs = sai_predict_probs(seq, models=sai_models)
170
170
 
171
171
  case 'pangolin':
172
172
  from .pangolin_utils import pangolin_predict_probs, pang_models
@@ -214,6 +214,7 @@ def find_transcript_splicing(transcript, engine: str = 'spliceai') -> Tuple[Dict
214
214
  # Create dictionaries and sort them by probability in descending order
215
215
  donor_probs = dict(sorted(((i, p) for i, p in zip(ref_indices, ref_seq_donor_probs)),
216
216
  key=lambda item: item[1], reverse=True))
217
+
217
218
  acceptor_probs = dict(sorted(((i, p) for i, p in zip(ref_indices, ref_seq_acceptor_probs)),
218
219
  key=lambda item: item[1], reverse=True))
219
220
 
@@ -332,8 +333,8 @@ def find_transcript_missplicing_seqs(ref_seq, var_seq, donors, acceptors, thresh
332
333
  if ref_seq.seq == var_seq.seq:
333
334
  return Missplicing({'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}})
334
335
 
335
- ref_seq_acceptor_probs, ref_seq_donor_probs = run_splicing_engine(ref_seq.seq, engine)
336
- mut_seq_acceptor_probs, mut_seq_donor_probs = run_splicing_engine(var_seq.seq, engine)
336
+ ref_seq_donor_probs, ref_seq_acceptor_probs = run_splicing_engine(ref_seq.seq, engine)
337
+ mut_seq_donor_probs, mut_seq_acceptor_probs = run_splicing_engine(var_seq.seq, engine)
337
338
  ref_indices = ref_seq.indices[5000:-5000]
338
339
  mut_indices = var_seq.indices[5000:-5000]
339
340
  visible_donors = np.intersect1d(donors, ref_indices)
@@ -462,49 +463,199 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False, db=None):
462
463
  return pd.concat(results)
463
464
 
464
465
 
466
+ # def process_pairwise_epistasis_explicit(mid, engine='spliceai'):
467
+ # donor_probs, acceptor_probs = {}, {}
468
+ # lower_pos, upper_pos = int(mid.split(':')[2]), int(mid.split(':')[6])
469
+ # g = Gene.from_file(mid.split(':')[0]).transcript().generate_pre_mrna()
470
+ # print(g.rev)
471
+ # if g.rev:
472
+ # lower_pos, upper_pos, factor = upper_pos, lower_pos, -1
473
+ # else:
474
+ # factor = 1
475
+ #
476
+ # lb, ub = lower_pos - (factor * 7500), upper_pos + (factor * 7500)
477
+ #
478
+ # for m in ['wild_type'] + mid.split('|') + [mid]:
479
+ # transcript = g.clone().pre_mrna
480
+ # if m != 'wild_type':
481
+ # mutations = [MutSeqMat.from_mutid(cm) for cm in m.split('|')]
482
+ # if g.rev:
483
+ # mutations = [m.reverse_complement() for m in mutations]
484
+ # for mutation in mutations:
485
+ # if mutation in transcript:
486
+ # transcript.mutate(mutation, inplace=True)
487
+ #
488
+ # donors, acceptors = find_transcript_splicing(transcript[lb:ub], engine=engine)
489
+ # donor_probs[m] = donors
490
+ # acceptor_probs[m] = acceptors
491
+ #
492
+ # acceptors = pd.DataFrame.from_dict(acceptor_probs).T
493
+ # donors = pd.DataFrame.from_dict(donor_probs).T
494
+ #
495
+ # acceptors = acceptors.map(lambda x: 0 if x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
496
+ # acceptors = acceptors.loc[:, acceptors.nunique() > 1]
497
+ # donors = donors.map(lambda x: 0 if abs(x) < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
498
+ # donors = donors.loc[:, donors.nunique() > 1]
499
+ #
500
+ # donors.loc['residual'] = (donors.iloc[3] - donors.iloc[0]) - (
501
+ # (donors.iloc[1] - donors.iloc[0]) + (donors.iloc[2] - donors.iloc[0]))
502
+ # acceptors.loc['residual'] = (acceptors.iloc[3] - acceptors.iloc[0]) - (
503
+ # (acceptors.iloc[1] - acceptors.iloc[0]) + (acceptors.iloc[2] - acceptors.iloc[0]))
504
+ #
505
+ # donors = donors.loc[:, donors.loc['residual'].abs() > 0.1]
506
+ # acceptors = acceptors.loc[:, acceptors.loc['residual'].abs() > 0.1]
507
+ #
508
+ # return acceptors, donors
465
509
  def process_pairwise_epistasis_explicit(mid, engine='spliceai'):
510
+ """
511
+ Process pairwise epistasis for a given mutation identifier (mid).
512
+
513
+ This function:
514
+ 1. Parses the input 'mid' to extract positions and loads a gene/transcript.
515
+ 2. Adjusts bounds based on strand orientation (reverse or forward).
516
+ 3. Iterates over several mutation scenarios (wild type, individual mutations, and combined mutations),
517
+ cloning and mutating the transcript as needed.
518
+ 4. Computes splicing probabilities (donors and acceptors) for a transcript segment.
519
+ 5. Stores these probabilities in dictionaries and converts them to DataFrames.
520
+ 6. Applies rounding, thresholding (setting very small numbers to 0), and filters out columns with little variation.
521
+ 7. Adds new features:
522
+ - residual: difference between total change and the sum of two individual deviations.
523
+ - deviation1: change from baseline (row 0) to row 1.
524
+ - deviation2: change from baseline (row 0) to row 2.
525
+ - total_deviation: change from baseline (row 0) to row 3.
526
+ and filters columns with insignificant residual (absolute value <= 0.1).
527
+
528
+ The new features persist in the returned DataFrames.
529
+
530
+ Returns:
531
+ acceptors_df (pd.DataFrame): Processed acceptor probabilities with extra features.
532
+ donors_df (pd.DataFrame): Processed donor probabilities with extra features.
533
+ """
534
+ import pandas as pd
535
+
466
536
  donor_probs, acceptor_probs = {}, {}
467
- lower_pos, upper_pos = int(mid.split(':')[2]), int(mid.split(':')[6])
468
- g = Gene.from_file(mid.split(':')[0]).transcript().generate_pre_mrna()
469
- print(g.rev)
537
+
538
+ # Parse the mid string: assume the format is "file:...:lower_pos:...:upper_pos:..."
539
+ parts = mid.split(':')
540
+ lower_pos, upper_pos = int(parts[2]), int(parts[6])
541
+
542
+ # Load gene and its transcript (as pre-mRNA)
543
+ g = Gene.from_file(parts[0]).transcript().generate_pre_mrna()
544
+
545
+ # If gene is on the reverse strand, swap positions and set factor to -1.
470
546
  if g.rev:
471
- lower_pos, upper_pos, factor = upper_pos, lower_pos, -1
547
+ lower_pos, upper_pos = upper_pos, lower_pos
548
+ factor = -1
472
549
  else:
473
550
  factor = 1
474
551
 
552
+ # Define bounds with a 7500 bp padding on both sides.
475
553
  lb, ub = lower_pos - (factor * 7500), upper_pos + (factor * 7500)
476
-
477
- for m in ['wild_type'] + mid.split('|') + [mid]:
554
+ # Ensure lb and ub fall within the transcript indices.
555
+ if lb not in g.pre_mrna.indices:
556
+ lb = g.pre_mrna.indices.max() if g.rev else g.pre_mrna.indices.min()
557
+ if ub not in g.pre_mrna.indices:
558
+ ub = g.pre_mrna.indices.min() if g.rev else g.pre_mrna.indices.max()
559
+
560
+ # Process each mutation scenario:
561
+ # - 'wild_type' (no mutations)
562
+ # - individual mutations (split by '|')
563
+ # - a scenario with all mutations (mid itself)
564
+ scenarios = ['wild_type'] + mid.split('|') + [mid]
565
+ for m in scenarios:
566
+ # Clone the transcript for independent mutation processing.
478
567
  transcript = g.clone().pre_mrna
479
568
  if m != 'wild_type':
569
+ # Parse mutations from the scenario string.
480
570
  mutations = [MutSeqMat.from_mutid(cm) for cm in m.split('|')]
571
+ # If the gene is reversed, get the reverse complement of each mutation.
481
572
  if g.rev:
482
- mutations = [m.reverse_complement() for m in mutations]
573
+ mutations = [mutation.reverse_complement() for mutation in mutations]
574
+ # Apply each mutation (if present) to the transcript.
483
575
  for mutation in mutations:
484
576
  if mutation in transcript:
485
577
  transcript.mutate(mutation, inplace=True)
486
578
 
579
+ # Calculate splicing probabilities on the transcript slice defined by lb:ub.
487
580
  donors, acceptors = find_transcript_splicing(transcript[lb:ub], engine=engine)
488
581
  donor_probs[m] = donors
489
582
  acceptor_probs[m] = acceptors
490
583
 
491
- acceptors = pd.DataFrame.from_dict(acceptor_probs).T
492
- donors = pd.DataFrame.from_dict(donor_probs).T
493
-
494
- acceptors = acceptors.map(lambda x: 0 if x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
495
- acceptors = acceptors.loc[:, acceptors.nunique() > 1]
496
- donors = donors.map(lambda x: 0 if abs(x) < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
497
- donors = donors.loc[:, donors.nunique() > 1]
498
-
499
- donors.loc['residual'] = (donors.iloc[3] - donors.iloc[0]) - (
500
- (donors.iloc[1] - donors.iloc[0]) + (donors.iloc[2] - donors.iloc[0]))
501
- acceptors.loc['residual'] = (acceptors.iloc[3] - acceptors.iloc[0]) - (
502
- (acceptors.iloc[1] - acceptors.iloc[0]) + (acceptors.iloc[2] - acceptors.iloc[0]))
503
-
504
- donors = donors.loc[:, donors.loc['residual'].abs() > 0.1]
505
- acceptors = acceptors.loc[:, acceptors.loc['residual'].abs() > 0.1]
584
+ # Convert the results to DataFrames (each scenario as a row)
585
+ acceptors_df = pd.DataFrame.from_dict(acceptor_probs, orient='index')
586
+ donors_df = pd.DataFrame.from_dict(donor_probs, orient='index')
587
+
588
+ # Apply rounding and thresholding:
589
+ # - For acceptors: set values < 0.01 to 0, else round to 2 decimals.
590
+ # - For donors: use absolute value threshold.
591
+ acceptors_df = acceptors_df.map(
592
+ lambda x: 0 if isinstance(x, (int, float)) and x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x
593
+ ).round(2)
594
+ donors_df = donors_df.map(
595
+ lambda x: 0 if isinstance(x, (int, float)) and abs(x) < 0.01 else round(x, 2) if isinstance(x,
596
+ (int, float)) else x
597
+ ).round(2)
598
+
599
+ # Drop columns that do not vary (only one unique value).
600
+ acceptors_df = acceptors_df.loc[:, acceptors_df.nunique() > 1]
601
+ donors_df = donors_df.loc[:, donors_df.nunique() > 1]
602
+
603
+ # Further filter acceptors: keep only columns where the value in the second row is < 0.1.
604
+ # (Assumes that the second row (iloc[1]) represents a specific measure you wish to threshold.)
605
+ acceptors_df = acceptors_df.loc[:, acceptors_df.iloc[1] < 0.1]
606
+
607
+ # Helper function: add new features (residual and deviations) and filter based on residual.
608
+ def add_features_and_filter(df):
609
+ if df.shape[1] == 0:
610
+ return df # Nothing to process if no columns remain.
611
+ # Compute the residual:
612
+ # (row 3 - row 0) minus ( (row 1 - row 0) + (row 2 - row 0) )
613
+ df.loc['residual'] = (df.iloc[3] - df.iloc[0]) - ((df.iloc[1] - df.iloc[0]) + (df.iloc[2] - df.iloc[0]))
614
+ # Keep only columns where the absolute residual exceeds 0.1.
615
+ # df = df.loc[:, df.loc['residual'].abs() > 0.1]
616
+ # if df.shape[1] == 0:
617
+ # return df
618
+ # Compute deviations relative to the baseline (row 0)
619
+ df.loc['deviation1'] = df.iloc[1] - df.iloc[0]
620
+ df.loc['deviation2'] = df.iloc[2] - df.iloc[0]
621
+ df.loc['total_deviation'] = df.iloc[3] - df.iloc[0]
622
+ return df
623
+
624
+ # Apply the feature computation to both donors and acceptors.
625
+ donors_df = add_features_and_filter(donors_df)
626
+ acceptors_df = add_features_and_filter(acceptors_df)
627
+
628
+ # Return the processed dataframes with the new features persisting.
629
+ donors_df.loc['site_type', :] = 0
630
+ acceptors_df.loc['site_type', :] = 1
631
+ df = pd.concat([acceptors_df, donors_df], axis=1)
632
+
633
+ mask = df.apply(
634
+ lambda col: (
635
+ (abs(col['residual']) > 0.1) and
636
+ (abs(col['deviation1'] + col['deviation2']) < 0.1)
637
+ ),
638
+ axis=0
639
+ )
640
+ df.loc['synergistic'] = 0
641
+ df.loc['synergistic', mask] = 1
642
+
643
+ mask = df.apply(
644
+ lambda col: (
645
+ (abs(col['residual']) > 0.1) and
646
+ (abs(col['total_deviation']) <= 0.25)
647
+ ),
648
+ axis=0
649
+ )
506
650
 
507
- return acceptors, donors
651
+ df.loc['antagonistic'] = 0
652
+ df.loc['antagonistic', mask] = 1
653
+ df.loc['mut_id'] = mid
654
+ df.loc['engine'] = engine
655
+ df.loc['site'] = df.columns
656
+ df = df.rename({mid: 'epistasis', mid.split('|')[0]: 'cv1', mid.split('|')[1]: 'cv2'})
657
+ df = df.T
658
+ return df
508
659
 
509
660
 
510
661
  class Missplicing:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.3.65
3
+ Version: 1.3.67
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -16,7 +16,7 @@ geney/pangolin_utils.py,sha256=9jdBXlOcRaUdfi-UpUxHA0AkTMZkUF-Lt7HVZ1nEm3s,2973
16
16
  geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
17
17
  geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
18
18
  geney/spliceai_utils.py,sha256=tVY0T6F6l3fNoaktpn7Kq0oH5ZM0ThFYt9nPi_lfakw,3077
19
- geney/splicing_utils.py,sha256=W-N0ENZJv1PdnVlHuaN_2az2-7Zl6cHYe_CYR1G41U4,40766
19
+ geney/splicing_utils.py,sha256=afnTncU607dLLfMz4Z1pj06dkO03u6Wt43cNBu7pEjU,47647
20
20
  geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
21
21
  geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
22
22
  geney/tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
@@ -25,7 +25,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
25
25
  geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
26
26
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
27
27
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
28
- geney-1.3.65.dist-info/METADATA,sha256=L-doIh0XdJuxs4gg1Dhs5mLoa_1zI8_bboq4cnlfvfA,990
29
- geney-1.3.65.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
30
- geney-1.3.65.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
31
- geney-1.3.65.dist-info/RECORD,,
28
+ geney-1.3.67.dist-info/METADATA,sha256=Quhz5RoxRIVxv0VlKP9NhmIdy0NzcOi3viZ51WIBzm8,990
29
+ geney-1.3.67.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
30
+ geney-1.3.67.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
31
+ geney-1.3.67.dist-info/RECORD,,
File without changes