geney 1.3.66__py2.py3-none-any.whl → 1.3.67__py2.py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/splicing_utils.py CHANGED
@@ -463,49 +463,199 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False, db=None):
463
463
  return pd.concat(results)
464
464
 
465
465
 
466
+ # def process_pairwise_epistasis_explicit(mid, engine='spliceai'):
467
+ # donor_probs, acceptor_probs = {}, {}
468
+ # lower_pos, upper_pos = int(mid.split(':')[2]), int(mid.split(':')[6])
469
+ # g = Gene.from_file(mid.split(':')[0]).transcript().generate_pre_mrna()
470
+ # print(g.rev)
471
+ # if g.rev:
472
+ # lower_pos, upper_pos, factor = upper_pos, lower_pos, -1
473
+ # else:
474
+ # factor = 1
475
+ #
476
+ # lb, ub = lower_pos - (factor * 7500), upper_pos + (factor * 7500)
477
+ #
478
+ # for m in ['wild_type'] + mid.split('|') + [mid]:
479
+ # transcript = g.clone().pre_mrna
480
+ # if m != 'wild_type':
481
+ # mutations = [MutSeqMat.from_mutid(cm) for cm in m.split('|')]
482
+ # if g.rev:
483
+ # mutations = [m.reverse_complement() for m in mutations]
484
+ # for mutation in mutations:
485
+ # if mutation in transcript:
486
+ # transcript.mutate(mutation, inplace=True)
487
+ #
488
+ # donors, acceptors = find_transcript_splicing(transcript[lb:ub], engine=engine)
489
+ # donor_probs[m] = donors
490
+ # acceptor_probs[m] = acceptors
491
+ #
492
+ # acceptors = pd.DataFrame.from_dict(acceptor_probs).T
493
+ # donors = pd.DataFrame.from_dict(donor_probs).T
494
+ #
495
+ # acceptors = acceptors.map(lambda x: 0 if x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
496
+ # acceptors = acceptors.loc[:, acceptors.nunique() > 1]
497
+ # donors = donors.map(lambda x: 0 if abs(x) < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
498
+ # donors = donors.loc[:, donors.nunique() > 1]
499
+ #
500
+ # donors.loc['residual'] = (donors.iloc[3] - donors.iloc[0]) - (
501
+ # (donors.iloc[1] - donors.iloc[0]) + (donors.iloc[2] - donors.iloc[0]))
502
+ # acceptors.loc['residual'] = (acceptors.iloc[3] - acceptors.iloc[0]) - (
503
+ # (acceptors.iloc[1] - acceptors.iloc[0]) + (acceptors.iloc[2] - acceptors.iloc[0]))
504
+ #
505
+ # donors = donors.loc[:, donors.loc['residual'].abs() > 0.1]
506
+ # acceptors = acceptors.loc[:, acceptors.loc['residual'].abs() > 0.1]
507
+ #
508
+ # return acceptors, donors
466
509
  def process_pairwise_epistasis_explicit(mid, engine='spliceai'):
510
+ """
511
+ Process pairwise epistasis for a given mutation identifier (mid).
512
+
513
+ This function:
514
+ 1. Parses the input 'mid' to extract positions and loads a gene/transcript.
515
+ 2. Adjusts bounds based on strand orientation (reverse or forward).
516
+ 3. Iterates over several mutation scenarios (wild type, individual mutations, and combined mutations),
517
+ cloning and mutating the transcript as needed.
518
+ 4. Computes splicing probabilities (donors and acceptors) for a transcript segment.
519
+ 5. Stores these probabilities in dictionaries and converts them to DataFrames.
520
+ 6. Applies rounding, thresholding (setting very small numbers to 0), and filters out columns with little variation.
521
+ 7. Adds new features:
522
+ - residual: difference between total change and the sum of two individual deviations.
523
+ - deviation1: change from baseline (row 0) to row 1.
524
+ - deviation2: change from baseline (row 0) to row 2.
525
+ - total_deviation: change from baseline (row 0) to row 3.
526
+ and filters columns with insignificant residual (absolute value <= 0.1).
527
+
528
+ The new features persist in the returned DataFrames.
529
+
530
+ Returns:
531
+ acceptors_df (pd.DataFrame): Processed acceptor probabilities with extra features.
532
+ donors_df (pd.DataFrame): Processed donor probabilities with extra features.
533
+ """
534
+ import pandas as pd
535
+
467
536
  donor_probs, acceptor_probs = {}, {}
468
- lower_pos, upper_pos = int(mid.split(':')[2]), int(mid.split(':')[6])
469
- g = Gene.from_file(mid.split(':')[0]).transcript().generate_pre_mrna()
470
- print(g.rev)
537
+
538
+ # Parse the mid string: assume the format is "file:...:lower_pos:...:upper_pos:..."
539
+ parts = mid.split(':')
540
+ lower_pos, upper_pos = int(parts[2]), int(parts[6])
541
+
542
+ # Load gene and its transcript (as pre-mRNA)
543
+ g = Gene.from_file(parts[0]).transcript().generate_pre_mrna()
544
+
545
+ # If gene is on the reverse strand, swap positions and set factor to -1.
471
546
  if g.rev:
472
- lower_pos, upper_pos, factor = upper_pos, lower_pos, -1
547
+ lower_pos, upper_pos = upper_pos, lower_pos
548
+ factor = -1
473
549
  else:
474
550
  factor = 1
475
551
 
552
+ # Define bounds with a 7500 bp padding on both sides.
476
553
  lb, ub = lower_pos - (factor * 7500), upper_pos + (factor * 7500)
477
-
478
- for m in ['wild_type'] + mid.split('|') + [mid]:
554
+ # Ensure lb and ub fall within the transcript indices.
555
+ if lb not in g.pre_mrna.indices:
556
+ lb = g.pre_mrna.indices.max() if g.rev else g.pre_mrna.indices.min()
557
+ if ub not in g.pre_mrna.indices:
558
+ ub = g.pre_mrna.indices.min() if g.rev else g.pre_mrna.indices.max()
559
+
560
+ # Process each mutation scenario:
561
+ # - 'wild_type' (no mutations)
562
+ # - individual mutations (split by '|')
563
+ # - a scenario with all mutations (mid itself)
564
+ scenarios = ['wild_type'] + mid.split('|') + [mid]
565
+ for m in scenarios:
566
+ # Clone the transcript for independent mutation processing.
479
567
  transcript = g.clone().pre_mrna
480
568
  if m != 'wild_type':
569
+ # Parse mutations from the scenario string.
481
570
  mutations = [MutSeqMat.from_mutid(cm) for cm in m.split('|')]
571
+ # If the gene is reversed, get the reverse complement of each mutation.
482
572
  if g.rev:
483
- mutations = [m.reverse_complement() for m in mutations]
573
+ mutations = [mutation.reverse_complement() for mutation in mutations]
574
+ # Apply each mutation (if present) to the transcript.
484
575
  for mutation in mutations:
485
576
  if mutation in transcript:
486
577
  transcript.mutate(mutation, inplace=True)
487
578
 
579
+ # Calculate splicing probabilities on the transcript slice defined by lb:ub.
488
580
  donors, acceptors = find_transcript_splicing(transcript[lb:ub], engine=engine)
489
581
  donor_probs[m] = donors
490
582
  acceptor_probs[m] = acceptors
491
583
 
492
- acceptors = pd.DataFrame.from_dict(acceptor_probs).T
493
- donors = pd.DataFrame.from_dict(donor_probs).T
494
-
495
- acceptors = acceptors.map(lambda x: 0 if x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
496
- acceptors = acceptors.loc[:, acceptors.nunique() > 1]
497
- donors = donors.map(lambda x: 0 if abs(x) < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
498
- donors = donors.loc[:, donors.nunique() > 1]
499
-
500
- donors.loc['residual'] = (donors.iloc[3] - donors.iloc[0]) - (
501
- (donors.iloc[1] - donors.iloc[0]) + (donors.iloc[2] - donors.iloc[0]))
502
- acceptors.loc['residual'] = (acceptors.iloc[3] - acceptors.iloc[0]) - (
503
- (acceptors.iloc[1] - acceptors.iloc[0]) + (acceptors.iloc[2] - acceptors.iloc[0]))
504
-
505
- donors = donors.loc[:, donors.loc['residual'].abs() > 0.1]
506
- acceptors = acceptors.loc[:, acceptors.loc['residual'].abs() > 0.1]
584
+ # Convert the results to DataFrames (each scenario as a row)
585
+ acceptors_df = pd.DataFrame.from_dict(acceptor_probs, orient='index')
586
+ donors_df = pd.DataFrame.from_dict(donor_probs, orient='index')
587
+
588
+ # Apply rounding and thresholding:
589
+ # - For acceptors: set values < 0.01 to 0, else round to 2 decimals.
590
+ # - For donors: use absolute value threshold.
591
+ acceptors_df = acceptors_df.map(
592
+ lambda x: 0 if isinstance(x, (int, float)) and x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x
593
+ ).round(2)
594
+ donors_df = donors_df.map(
595
+ lambda x: 0 if isinstance(x, (int, float)) and abs(x) < 0.01 else round(x, 2) if isinstance(x,
596
+ (int, float)) else x
597
+ ).round(2)
598
+
599
+ # Drop columns that do not vary (only one unique value).
600
+ acceptors_df = acceptors_df.loc[:, acceptors_df.nunique() > 1]
601
+ donors_df = donors_df.loc[:, donors_df.nunique() > 1]
602
+
603
+ # Further filter acceptors: keep only columns where the value in the second row is < 0.1.
604
+ # (Assumes that the second row (iloc[1]) represents a specific measure you wish to threshold.)
605
+ acceptors_df = acceptors_df.loc[:, acceptors_df.iloc[1] < 0.1]
606
+
607
+ # Helper function: add new features (residual and deviations) and filter based on residual.
608
+ def add_features_and_filter(df):
609
+ if df.shape[1] == 0:
610
+ return df # Nothing to process if no columns remain.
611
+ # Compute the residual:
612
+ # (row 3 - row 0) minus ( (row 1 - row 0) + (row 2 - row 0) )
613
+ df.loc['residual'] = (df.iloc[3] - df.iloc[0]) - ((df.iloc[1] - df.iloc[0]) + (df.iloc[2] - df.iloc[0]))
614
+ # Keep only columns where the absolute residual exceeds 0.1.
615
+ # df = df.loc[:, df.loc['residual'].abs() > 0.1]
616
+ # if df.shape[1] == 0:
617
+ # return df
618
+ # Compute deviations relative to the baseline (row 0)
619
+ df.loc['deviation1'] = df.iloc[1] - df.iloc[0]
620
+ df.loc['deviation2'] = df.iloc[2] - df.iloc[0]
621
+ df.loc['total_deviation'] = df.iloc[3] - df.iloc[0]
622
+ return df
623
+
624
+ # Apply the feature computation to both donors and acceptors.
625
+ donors_df = add_features_and_filter(donors_df)
626
+ acceptors_df = add_features_and_filter(acceptors_df)
627
+
628
+ # Return the processed dataframes with the new features persisting.
629
+ donors_df.loc['site_type', :] = 0
630
+ acceptors_df.loc['site_type', :] = 1
631
+ df = pd.concat([acceptors_df, donors_df], axis=1)
632
+
633
+ mask = df.apply(
634
+ lambda col: (
635
+ (abs(col['residual']) > 0.1) and
636
+ (abs(col['deviation1'] + col['deviation2']) < 0.1)
637
+ ),
638
+ axis=0
639
+ )
640
+ df.loc['synergistic'] = 0
641
+ df.loc['synergistic', mask] = 1
642
+
643
+ mask = df.apply(
644
+ lambda col: (
645
+ (abs(col['residual']) > 0.1) and
646
+ (abs(col['total_deviation']) <= 0.25)
647
+ ),
648
+ axis=0
649
+ )
507
650
 
508
- return acceptors, donors
651
+ df.loc['antagonistic'] = 0
652
+ df.loc['antagonistic', mask] = 1
653
+ df.loc['mut_id'] = mid
654
+ df.loc['engine'] = engine
655
+ df.loc['site'] = df.columns
656
+ df = df.rename({mid: 'epistasis', mid.split('|')[0]: 'cv1', mid.split('|')[1]: 'cv2'})
657
+ df = df.T
658
+ return df
509
659
 
510
660
 
511
661
  class Missplicing:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.3.66
3
+ Version: 1.3.67
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -16,7 +16,7 @@ geney/pangolin_utils.py,sha256=9jdBXlOcRaUdfi-UpUxHA0AkTMZkUF-Lt7HVZ1nEm3s,2973
16
16
  geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
17
17
  geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
18
18
  geney/spliceai_utils.py,sha256=tVY0T6F6l3fNoaktpn7Kq0oH5ZM0ThFYt9nPi_lfakw,3077
19
- geney/splicing_utils.py,sha256=_nXLCK41GhcrkXHXAqkhNV2IcwFltSxrR-rm8fUIrfE,40767
19
+ geney/splicing_utils.py,sha256=afnTncU607dLLfMz4Z1pj06dkO03u6Wt43cNBu7pEjU,47647
20
20
  geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
21
21
  geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
22
22
  geney/tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
@@ -25,7 +25,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
25
25
  geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
26
26
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
27
27
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
28
- geney-1.3.66.dist-info/METADATA,sha256=bl8lWCBcJsbfBPJmkoY8xG0n6G7z7X1C-6jA1bSevCk,990
29
- geney-1.3.66.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
30
- geney-1.3.66.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
31
- geney-1.3.66.dist-info/RECORD,,
28
+ geney-1.3.67.dist-info/METADATA,sha256=Quhz5RoxRIVxv0VlKP9NhmIdy0NzcOi3viZ51WIBzm8,990
29
+ geney-1.3.67.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
30
+ geney-1.3.67.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
31
+ geney-1.3.67.dist-info/RECORD,,
File without changes