geney 1.3.66__py2.py3-none-any.whl → 1.3.68__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
geney/splicing_utils.py
CHANGED
|
@@ -463,49 +463,205 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False, db=None):
|
|
|
463
463
|
return pd.concat(results)
|
|
464
464
|
|
|
465
465
|
|
|
466
|
+
# def process_pairwise_epistasis_explicit(mid, engine='spliceai'):
|
|
467
|
+
# donor_probs, acceptor_probs = {}, {}
|
|
468
|
+
# lower_pos, upper_pos = int(mid.split(':')[2]), int(mid.split(':')[6])
|
|
469
|
+
# g = Gene.from_file(mid.split(':')[0]).transcript().generate_pre_mrna()
|
|
470
|
+
# print(g.rev)
|
|
471
|
+
# if g.rev:
|
|
472
|
+
# lower_pos, upper_pos, factor = upper_pos, lower_pos, -1
|
|
473
|
+
# else:
|
|
474
|
+
# factor = 1
|
|
475
|
+
#
|
|
476
|
+
# lb, ub = lower_pos - (factor * 7500), upper_pos + (factor * 7500)
|
|
477
|
+
#
|
|
478
|
+
# for m in ['wild_type'] + mid.split('|') + [mid]:
|
|
479
|
+
# transcript = g.clone().pre_mrna
|
|
480
|
+
# if m != 'wild_type':
|
|
481
|
+
# mutations = [MutSeqMat.from_mutid(cm) for cm in m.split('|')]
|
|
482
|
+
# if g.rev:
|
|
483
|
+
# mutations = [m.reverse_complement() for m in mutations]
|
|
484
|
+
# for mutation in mutations:
|
|
485
|
+
# if mutation in transcript:
|
|
486
|
+
# transcript.mutate(mutation, inplace=True)
|
|
487
|
+
#
|
|
488
|
+
# donors, acceptors = find_transcript_splicing(transcript[lb:ub], engine=engine)
|
|
489
|
+
# donor_probs[m] = donors
|
|
490
|
+
# acceptor_probs[m] = acceptors
|
|
491
|
+
#
|
|
492
|
+
# acceptors = pd.DataFrame.from_dict(acceptor_probs).T
|
|
493
|
+
# donors = pd.DataFrame.from_dict(donor_probs).T
|
|
494
|
+
#
|
|
495
|
+
# acceptors = acceptors.map(lambda x: 0 if x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
|
|
496
|
+
# acceptors = acceptors.loc[:, acceptors.nunique() > 1]
|
|
497
|
+
# donors = donors.map(lambda x: 0 if abs(x) < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
|
|
498
|
+
# donors = donors.loc[:, donors.nunique() > 1]
|
|
499
|
+
#
|
|
500
|
+
# donors.loc['residual'] = (donors.iloc[3] - donors.iloc[0]) - (
|
|
501
|
+
# (donors.iloc[1] - donors.iloc[0]) + (donors.iloc[2] - donors.iloc[0]))
|
|
502
|
+
# acceptors.loc['residual'] = (acceptors.iloc[3] - acceptors.iloc[0]) - (
|
|
503
|
+
# (acceptors.iloc[1] - acceptors.iloc[0]) + (acceptors.iloc[2] - acceptors.iloc[0]))
|
|
504
|
+
#
|
|
505
|
+
# donors = donors.loc[:, donors.loc['residual'].abs() > 0.1]
|
|
506
|
+
# acceptors = acceptors.loc[:, acceptors.loc['residual'].abs() > 0.1]
|
|
507
|
+
#
|
|
508
|
+
# return acceptors, donors
|
|
466
509
|
def process_pairwise_epistasis_explicit(mid, engine='spliceai'):
    """
    Score splice-site epistasis for a pair of mutations.

    Splice-site probabilities are computed for four scenarios, in this order:
    wild type, the first mutation alone, the second mutation alone, and both
    mutations together. Sites whose (rounded) probability differs between
    scenarios are kept and annotated with additive-model features.

    Args:
        mid (str): Pairwise mutation identifier made of two mutation ids
            joined by '|'. After splitting on ':', field 0 is handed to
            ``Gene.from_file`` and fields 2 and 6 are parsed as the integer
            positions of the first and second mutation. The row arithmetic
            below assumes exactly two distinct mutations.
            NOTE(review): presumably "GENE:chrom:pos:ref:alt|GENE:chrom:pos:ref:alt"
            -- confirm against callers.
        engine (str): Forwarded to ``find_transcript_splicing`` and recorded
            in the output.

    Returns:
        pd.DataFrame: A single frame with one row per retained site and the
        columns

        - 'wild_type', 'cv1', 'cv2', 'epistasis': probability in each scenario
          (first mutation, second mutation and the combined ``mid`` are
          renamed to 'cv1', 'cv2' and 'epistasis').
        - 'deviation1', 'deviation2', 'total_deviation': change from wild type
          for the first, second and combined scenario.
        - 'residual': total_deviation - (deviation1 + deviation2).
        - 'site_type': 1 for acceptor sites, 0 for donor sites.
        - 'synergistic': 1 when |residual| > 0.1 and
          |deviation1 + deviation2| < 0.1, else 0.
        - 'antagonistic': 1 when |residual| > 0.1 and
          |total_deviation| <= 0.25, else 0.
        - 'mut_id', 'engine', 'site': the inputs and the site label.

        No column is dropped based on the residual; it is only used for the
        two flags. If no donor or acceptor site varies between scenarios, the
        un-transposed frame with zero columns is returned instead.

    Raises:
        IndexError: If ``mid`` has fewer than seven ':'-separated fields.
        ValueError: If the position fields are not integers.
    """
    import pandas as pd

    window_pad = 7500        # positions of context kept on each side of the pair
    min_probability = 0.01   # values below this are zeroed before rounding
    residual_cutoff = 0.1    # |residual| above this marks an interaction

    fields = mid.split(':')
    single_mutations = mid.split('|')
    lower_pos, upper_pos = int(fields[2]), int(fields[6])

    # Load the gene's transcript with its pre-mRNA generated.
    g = Gene.from_file(fields[0]).transcript().generate_pre_mrna()

    # On the reverse strand the two positions are swapped and the padding is
    # applied in the opposite direction.
    if g.rev:
        lower_pos, upper_pos = upper_pos, lower_pos
        factor = -1
    else:
        factor = 1

    lb = lower_pos - (factor * window_pad)
    ub = upper_pos + (factor * window_pad)

    # A bound that is not one of the transcript's indices is replaced by the
    # matching end of the transcript.
    indices = g.pre_mrna.indices
    if lb not in indices:
        lb = indices.max() if g.rev else indices.min()
    if ub not in indices:
        ub = indices.min() if g.rev else indices.max()

    # Row order matters: later features address the scenarios by position.
    donor_probs, acceptor_probs = {}, {}
    for scenario in ['wild_type'] + single_mutations + [mid]:
        # Work on a fresh clone so scenarios never see each other's edits.
        transcript = g.clone().pre_mrna
        if scenario != 'wild_type':
            mutations = [MutSeqMat.from_mutid(cm) for cm in scenario.split('|')]
            if g.rev:
                mutations = [mutation.reverse_complement() for mutation in mutations]
            for mutation in mutations:
                # Mutations not contained in the transcript are skipped silently.
                if mutation in transcript:
                    transcript.mutate(mutation, inplace=True)

        donors, acceptors = find_transcript_splicing(transcript[lb:ub], engine=engine)
        donor_probs[scenario] = donors
        acceptor_probs[scenario] = acceptors

    def _threshold(value, by_magnitude):
        # Zero out numeric values below the minimum, round the rest to two
        # decimals, and pass non-numeric cells through unchanged.
        if not isinstance(value, (int, float)):
            return value
        probe = abs(value) if by_magnitude else value
        return 0 if probe < min_probability else round(value, 2)

    def _scenario_frame(probs, by_magnitude):
        # One row per scenario, one column per site; keep only sites whose
        # cleaned value differs between scenarios.
        frame = pd.DataFrame.from_dict(probs, orient='index')
        frame = frame.map(lambda value: _threshold(value, by_magnitude)).round(2)
        # .copy() so the rows added later are written to an independent frame
        # rather than to a selection of another one.
        return frame.loc[:, frame.nunique() > 1].copy()

    def _add_epistasis_features(frame, site_type):
        # Append the deviation/residual rows and the site-type marker.
        if frame.shape[1] == 0:
            return frame  # No varying site: nothing to annotate.
        wild_type, first, second, combined = (frame.iloc[i] for i in range(4))
        frame.loc['residual'] = (combined - wild_type) - ((first - wild_type) + (second - wild_type))
        frame.loc['deviation1'] = first - wild_type
        frame.loc['deviation2'] = second - wild_type
        frame.loc['total_deviation'] = combined - wild_type
        frame.loc['site_type', :] = site_type
        return frame

    # Acceptors are thresholded on the signed value, donors on the absolute
    # value (kept exactly as in the previous implementation).
    acceptors_df = _add_epistasis_features(_scenario_frame(acceptor_probs, False), 1)
    donors_df = _add_epistasis_features(_scenario_frame(donor_probs, True), 0)

    df = pd.concat([acceptors_df, donors_df], axis=1)
    if df.shape[1] == 0:
        return df

    # NaN residuals/deviations compare False, so such sites get neither flag.
    interacting = df.loc['residual'].abs() > residual_cutoff
    additive_change = (df.loc['deviation1'] + df.loc['deviation2']).abs()
    synergistic = interacting & (additive_change < 0.1)
    antagonistic = interacting & (df.loc['total_deviation'].abs() <= 0.25)

    df.loc['synergistic'] = 0
    df.loc['synergistic', synergistic] = 1
    df.loc['antagonistic'] = 0
    df.loc['antagonistic', antagonistic] = 1

    df.loc['mut_id'] = mid
    df.loc['engine'] = engine
    df.loc['site'] = df.columns

    df = df.rename({mid: 'epistasis', single_mutations[0]: 'cv1', single_mutations[1]: 'cv2'})
    # Transpose so each site becomes a row and each feature a column.
    return df.T
|
|
509
665
|
|
|
510
666
|
|
|
511
667
|
class Missplicing:
|
|
@@ -16,7 +16,7 @@ geney/pangolin_utils.py,sha256=9jdBXlOcRaUdfi-UpUxHA0AkTMZkUF-Lt7HVZ1nEm3s,2973
|
|
|
16
16
|
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
17
17
|
geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
|
|
18
18
|
geney/spliceai_utils.py,sha256=tVY0T6F6l3fNoaktpn7Kq0oH5ZM0ThFYt9nPi_lfakw,3077
|
|
19
|
-
geney/splicing_utils.py,sha256=
|
|
19
|
+
geney/splicing_utils.py,sha256=50Cmn12BEzvCQfDe-8u4lNVkqhNn2FXI_Q0Nw98MKBo,47699
|
|
20
20
|
geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
|
|
21
21
|
geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
|
|
22
22
|
geney/tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
|
|
@@ -25,7 +25,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
25
25
|
geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
|
|
26
26
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
27
27
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
28
|
-
geney-1.3.
|
|
29
|
-
geney-1.3.
|
|
30
|
-
geney-1.3.
|
|
31
|
-
geney-1.3.
|
|
28
|
+
geney-1.3.68.dist-info/METADATA,sha256=kREzXHGHU6MjvTQCSgq9o3csVnm28mktBzx8WeUYtX8,990
|
|
29
|
+
geney-1.3.68.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
30
|
+
geney-1.3.68.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
31
|
+
geney-1.3.68.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|