geney 1.3.65__py2.py3-none-any.whl → 1.3.67__py2.py3-none-any.whl
This diff shows the changes between two publicly available versions of this package as they were released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of geney might be problematic. Click here for more details.
geney/splicing_utils.py
CHANGED
|
@@ -166,7 +166,7 @@ def run_splicing_engine(seq, engine='spliceai'):
|
|
|
166
166
|
match engine:
|
|
167
167
|
case 'spliceai':
|
|
168
168
|
from .spliceai_utils import sai_predict_probs, sai_models
|
|
169
|
-
|
|
169
|
+
acceptor_probs, donor_probs = sai_predict_probs(seq, models=sai_models)
|
|
170
170
|
|
|
171
171
|
case 'pangolin':
|
|
172
172
|
from .pangolin_utils import pangolin_predict_probs, pang_models
|
|
@@ -214,6 +214,7 @@ def find_transcript_splicing(transcript, engine: str = 'spliceai') -> Tuple[Dict
|
|
|
214
214
|
# Create dictionaries and sort them by probability in descending order
|
|
215
215
|
donor_probs = dict(sorted(((i, p) for i, p in zip(ref_indices, ref_seq_donor_probs)),
|
|
216
216
|
key=lambda item: item[1], reverse=True))
|
|
217
|
+
|
|
217
218
|
acceptor_probs = dict(sorted(((i, p) for i, p in zip(ref_indices, ref_seq_acceptor_probs)),
|
|
218
219
|
key=lambda item: item[1], reverse=True))
|
|
219
220
|
|
|
@@ -332,8 +333,8 @@ def find_transcript_missplicing_seqs(ref_seq, var_seq, donors, acceptors, thresh
|
|
|
332
333
|
if ref_seq.seq == var_seq.seq:
|
|
333
334
|
return Missplicing({'missed_acceptors': {}, 'missed_donors': {}, 'discovered_acceptors': {}, 'discovered_donors': {}})
|
|
334
335
|
|
|
335
|
-
|
|
336
|
-
|
|
336
|
+
ref_seq_donor_probs, ref_seq_acceptor_probs = run_splicing_engine(ref_seq.seq, engine)
|
|
337
|
+
mut_seq_donor_probs, mut_seq_acceptor_probs = run_splicing_engine(var_seq.seq, engine)
|
|
337
338
|
ref_indices = ref_seq.indices[5000:-5000]
|
|
338
339
|
mut_indices = var_seq.indices[5000:-5000]
|
|
339
340
|
visible_donors = np.intersect1d(donors, ref_indices)
|
|
@@ -462,49 +463,199 @@ def process_pairwise_epistasis(mids, engine='pangolin', fprint=False, db=None):
|
|
|
462
463
|
return pd.concat(results)
|
|
463
464
|
|
|
464
465
|
|
|
466
|
+
# def process_pairwise_epistasis_explicit(mid, engine='spliceai'):
|
|
467
|
+
# donor_probs, acceptor_probs = {}, {}
|
|
468
|
+
# lower_pos, upper_pos = int(mid.split(':')[2]), int(mid.split(':')[6])
|
|
469
|
+
# g = Gene.from_file(mid.split(':')[0]).transcript().generate_pre_mrna()
|
|
470
|
+
# print(g.rev)
|
|
471
|
+
# if g.rev:
|
|
472
|
+
# lower_pos, upper_pos, factor = upper_pos, lower_pos, -1
|
|
473
|
+
# else:
|
|
474
|
+
# factor = 1
|
|
475
|
+
#
|
|
476
|
+
# lb, ub = lower_pos - (factor * 7500), upper_pos + (factor * 7500)
|
|
477
|
+
#
|
|
478
|
+
# for m in ['wild_type'] + mid.split('|') + [mid]:
|
|
479
|
+
# transcript = g.clone().pre_mrna
|
|
480
|
+
# if m != 'wild_type':
|
|
481
|
+
# mutations = [MutSeqMat.from_mutid(cm) for cm in m.split('|')]
|
|
482
|
+
# if g.rev:
|
|
483
|
+
# mutations = [m.reverse_complement() for m in mutations]
|
|
484
|
+
# for mutation in mutations:
|
|
485
|
+
# if mutation in transcript:
|
|
486
|
+
# transcript.mutate(mutation, inplace=True)
|
|
487
|
+
#
|
|
488
|
+
# donors, acceptors = find_transcript_splicing(transcript[lb:ub], engine=engine)
|
|
489
|
+
# donor_probs[m] = donors
|
|
490
|
+
# acceptor_probs[m] = acceptors
|
|
491
|
+
#
|
|
492
|
+
# acceptors = pd.DataFrame.from_dict(acceptor_probs).T
|
|
493
|
+
# donors = pd.DataFrame.from_dict(donor_probs).T
|
|
494
|
+
#
|
|
495
|
+
# acceptors = acceptors.map(lambda x: 0 if x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
|
|
496
|
+
# acceptors = acceptors.loc[:, acceptors.nunique() > 1]
|
|
497
|
+
# donors = donors.map(lambda x: 0 if abs(x) < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x).round(2)
|
|
498
|
+
# donors = donors.loc[:, donors.nunique() > 1]
|
|
499
|
+
#
|
|
500
|
+
# donors.loc['residual'] = (donors.iloc[3] - donors.iloc[0]) - (
|
|
501
|
+
# (donors.iloc[1] - donors.iloc[0]) + (donors.iloc[2] - donors.iloc[0]))
|
|
502
|
+
# acceptors.loc['residual'] = (acceptors.iloc[3] - acceptors.iloc[0]) - (
|
|
503
|
+
# (acceptors.iloc[1] - acceptors.iloc[0]) + (acceptors.iloc[2] - acceptors.iloc[0]))
|
|
504
|
+
#
|
|
505
|
+
# donors = donors.loc[:, donors.loc['residual'].abs() > 0.1]
|
|
506
|
+
# acceptors = acceptors.loc[:, acceptors.loc['residual'].abs() > 0.1]
|
|
507
|
+
#
|
|
508
|
+
# return acceptors, donors
|
|
465
509
|
def process_pairwise_epistasis_explicit(mid, engine='spliceai'):
|
|
510
|
+
"""
|
|
511
|
+
Process pairwise epistasis for a given mutation identifier (mid).
|
|
512
|
+
|
|
513
|
+
This function:
|
|
514
|
+
1. Parses the input 'mid' to extract positions and loads a gene/transcript.
|
|
515
|
+
2. Adjusts bounds based on strand orientation (reverse or forward).
|
|
516
|
+
3. Iterates over several mutation scenarios (wild type, individual mutations, and combined mutations),
|
|
517
|
+
cloning and mutating the transcript as needed.
|
|
518
|
+
4. Computes splicing probabilities (donors and acceptors) for a transcript segment.
|
|
519
|
+
5. Stores these probabilities in dictionaries and converts them to DataFrames.
|
|
520
|
+
6. Applies rounding, thresholding (setting very small numbers to 0), and filters out columns with little variation.
|
|
521
|
+
7. Adds new features:
|
|
522
|
+
- residual: difference between total change and the sum of two individual deviations.
|
|
523
|
+
- deviation1: change from baseline (row 0) to row 1.
|
|
524
|
+
- deviation2: change from baseline (row 0) to row 2.
|
|
525
|
+
- total_deviation: change from baseline (row 0) to row 3.
|
|
526
|
+
and filters columns with insignificant residual (absolute value <= 0.1).
|
|
527
|
+
|
|
528
|
+
The new features persist in the returned DataFrames.
|
|
529
|
+
|
|
530
|
+
Returns:
|
|
531
|
+
acceptors_df (pd.DataFrame): Processed acceptor probabilities with extra features.
|
|
532
|
+
donors_df (pd.DataFrame): Processed donor probabilities with extra features.
|
|
533
|
+
"""
|
|
534
|
+
import pandas as pd
|
|
535
|
+
|
|
466
536
|
donor_probs, acceptor_probs = {}, {}
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
537
|
+
|
|
538
|
+
# Parse the mid string: assume the format is "file:...:lower_pos:...:upper_pos:..."
|
|
539
|
+
parts = mid.split(':')
|
|
540
|
+
lower_pos, upper_pos = int(parts[2]), int(parts[6])
|
|
541
|
+
|
|
542
|
+
# Load gene and its transcript (as pre-mRNA)
|
|
543
|
+
g = Gene.from_file(parts[0]).transcript().generate_pre_mrna()
|
|
544
|
+
|
|
545
|
+
# If gene is on the reverse strand, swap positions and set factor to -1.
|
|
470
546
|
if g.rev:
|
|
471
|
-
lower_pos, upper_pos
|
|
547
|
+
lower_pos, upper_pos = upper_pos, lower_pos
|
|
548
|
+
factor = -1
|
|
472
549
|
else:
|
|
473
550
|
factor = 1
|
|
474
551
|
|
|
552
|
+
# Define bounds with a 7500 bp padding on both sides.
|
|
475
553
|
lb, ub = lower_pos - (factor * 7500), upper_pos + (factor * 7500)
|
|
476
|
-
|
|
477
|
-
|
|
554
|
+
# Ensure lb and ub fall within the transcript indices.
|
|
555
|
+
if lb not in g.pre_mrna.indices:
|
|
556
|
+
lb = g.pre_mrna.indices.max() if g.rev else g.pre_mrna.indices.min()
|
|
557
|
+
if ub not in g.pre_mrna.indices:
|
|
558
|
+
ub = g.pre_mrna.indices.min() if g.rev else g.pre_mrna.indices.max()
|
|
559
|
+
|
|
560
|
+
# Process each mutation scenario:
|
|
561
|
+
# - 'wild_type' (no mutations)
|
|
562
|
+
# - individual mutations (split by '|')
|
|
563
|
+
# - a scenario with all mutations (mid itself)
|
|
564
|
+
scenarios = ['wild_type'] + mid.split('|') + [mid]
|
|
565
|
+
for m in scenarios:
|
|
566
|
+
# Clone the transcript for independent mutation processing.
|
|
478
567
|
transcript = g.clone().pre_mrna
|
|
479
568
|
if m != 'wild_type':
|
|
569
|
+
# Parse mutations from the scenario string.
|
|
480
570
|
mutations = [MutSeqMat.from_mutid(cm) for cm in m.split('|')]
|
|
571
|
+
# If the gene is reversed, get the reverse complement of each mutation.
|
|
481
572
|
if g.rev:
|
|
482
|
-
mutations = [
|
|
573
|
+
mutations = [mutation.reverse_complement() for mutation in mutations]
|
|
574
|
+
# Apply each mutation (if present) to the transcript.
|
|
483
575
|
for mutation in mutations:
|
|
484
576
|
if mutation in transcript:
|
|
485
577
|
transcript.mutate(mutation, inplace=True)
|
|
486
578
|
|
|
579
|
+
# Calculate splicing probabilities on the transcript slice defined by lb:ub.
|
|
487
580
|
donors, acceptors = find_transcript_splicing(transcript[lb:ub], engine=engine)
|
|
488
581
|
donor_probs[m] = donors
|
|
489
582
|
acceptor_probs[m] = acceptors
|
|
490
583
|
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
584
|
+
# Convert the results to DataFrames (each scenario as a row)
|
|
585
|
+
acceptors_df = pd.DataFrame.from_dict(acceptor_probs, orient='index')
|
|
586
|
+
donors_df = pd.DataFrame.from_dict(donor_probs, orient='index')
|
|
587
|
+
|
|
588
|
+
# Apply rounding and thresholding:
|
|
589
|
+
# - For acceptors: set values < 0.01 to 0, else round to 2 decimals.
|
|
590
|
+
# - For donors: use absolute value threshold.
|
|
591
|
+
acceptors_df = acceptors_df.map(
|
|
592
|
+
lambda x: 0 if isinstance(x, (int, float)) and x < 0.01 else round(x, 2) if isinstance(x, (int, float)) else x
|
|
593
|
+
).round(2)
|
|
594
|
+
donors_df = donors_df.map(
|
|
595
|
+
lambda x: 0 if isinstance(x, (int, float)) and abs(x) < 0.01 else round(x, 2) if isinstance(x,
|
|
596
|
+
(int, float)) else x
|
|
597
|
+
).round(2)
|
|
598
|
+
|
|
599
|
+
# Drop columns that do not vary (only one unique value).
|
|
600
|
+
acceptors_df = acceptors_df.loc[:, acceptors_df.nunique() > 1]
|
|
601
|
+
donors_df = donors_df.loc[:, donors_df.nunique() > 1]
|
|
602
|
+
|
|
603
|
+
# Further filter acceptors: keep only columns where the value in the second row is < 0.1.
|
|
604
|
+
# (Assumes that the second row (iloc[1]) represents a specific measure you wish to threshold.)
|
|
605
|
+
acceptors_df = acceptors_df.loc[:, acceptors_df.iloc[1] < 0.1]
|
|
606
|
+
|
|
607
|
+
# Helper function: add new features (residual and deviations) and filter based on residual.
|
|
608
|
+
def add_features_and_filter(df):
|
|
609
|
+
if df.shape[1] == 0:
|
|
610
|
+
return df # Nothing to process if no columns remain.
|
|
611
|
+
# Compute the residual:
|
|
612
|
+
# (row 3 - row 0) minus ( (row 1 - row 0) + (row 2 - row 0) )
|
|
613
|
+
df.loc['residual'] = (df.iloc[3] - df.iloc[0]) - ((df.iloc[1] - df.iloc[0]) + (df.iloc[2] - df.iloc[0]))
|
|
614
|
+
# Keep only columns where the absolute residual exceeds 0.1.
|
|
615
|
+
# df = df.loc[:, df.loc['residual'].abs() > 0.1]
|
|
616
|
+
# if df.shape[1] == 0:
|
|
617
|
+
# return df
|
|
618
|
+
# Compute deviations relative to the baseline (row 0)
|
|
619
|
+
df.loc['deviation1'] = df.iloc[1] - df.iloc[0]
|
|
620
|
+
df.loc['deviation2'] = df.iloc[2] - df.iloc[0]
|
|
621
|
+
df.loc['total_deviation'] = df.iloc[3] - df.iloc[0]
|
|
622
|
+
return df
|
|
623
|
+
|
|
624
|
+
# Apply the feature computation to both donors and acceptors.
|
|
625
|
+
donors_df = add_features_and_filter(donors_df)
|
|
626
|
+
acceptors_df = add_features_and_filter(acceptors_df)
|
|
627
|
+
|
|
628
|
+
# Return the processed dataframes with the new features persisting.
|
|
629
|
+
donors_df.loc['site_type', :] = 0
|
|
630
|
+
acceptors_df.loc['site_type', :] = 1
|
|
631
|
+
df = pd.concat([acceptors_df, donors_df], axis=1)
|
|
632
|
+
|
|
633
|
+
mask = df.apply(
|
|
634
|
+
lambda col: (
|
|
635
|
+
(abs(col['residual']) > 0.1) and
|
|
636
|
+
(abs(col['deviation1'] + col['deviation2']) < 0.1)
|
|
637
|
+
),
|
|
638
|
+
axis=0
|
|
639
|
+
)
|
|
640
|
+
df.loc['synergistic'] = 0
|
|
641
|
+
df.loc['synergistic', mask] = 1
|
|
642
|
+
|
|
643
|
+
mask = df.apply(
|
|
644
|
+
lambda col: (
|
|
645
|
+
(abs(col['residual']) > 0.1) and
|
|
646
|
+
(abs(col['total_deviation']) <= 0.25)
|
|
647
|
+
),
|
|
648
|
+
axis=0
|
|
649
|
+
)
|
|
506
650
|
|
|
507
|
-
|
|
651
|
+
df.loc['antagonistic'] = 0
|
|
652
|
+
df.loc['antagonistic', mask] = 1
|
|
653
|
+
df.loc['mut_id'] = mid
|
|
654
|
+
df.loc['engine'] = engine
|
|
655
|
+
df.loc['site'] = df.columns
|
|
656
|
+
df = df.rename({mid: 'epistasis', mid.split('|')[0]: 'cv1', mid.split('|')[1]: 'cv2'})
|
|
657
|
+
df = df.T
|
|
658
|
+
return df
|
|
508
659
|
|
|
509
660
|
|
|
510
661
|
class Missplicing:
|
|
@@ -16,7 +16,7 @@ geney/pangolin_utils.py,sha256=9jdBXlOcRaUdfi-UpUxHA0AkTMZkUF-Lt7HVZ1nEm3s,2973
|
|
|
16
16
|
geney/power_utils.py,sha256=MehZFUdkJ2EFUot709yPEDxSkXmH5XevMebX2HD768A,7330
|
|
17
17
|
geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
|
|
18
18
|
geney/spliceai_utils.py,sha256=tVY0T6F6l3fNoaktpn7Kq0oH5ZM0ThFYt9nPi_lfakw,3077
|
|
19
|
-
geney/splicing_utils.py,sha256=
|
|
19
|
+
geney/splicing_utils.py,sha256=afnTncU607dLLfMz4Z1pj06dkO03u6Wt43cNBu7pEjU,47647
|
|
20
20
|
geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
|
|
21
21
|
geney/tcga_utils.py,sha256=D_BNHm-D_K408dlcJm3hzH2c6QNFjQsKvUcOPiQRk7g,17612
|
|
22
22
|
geney/tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
|
|
@@ -25,7 +25,7 @@ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
25
25
|
geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
|
|
26
26
|
geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
|
|
27
27
|
geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
|
|
28
|
-
geney-1.3.
|
|
29
|
-
geney-1.3.
|
|
30
|
-
geney-1.3.
|
|
31
|
-
geney-1.3.
|
|
28
|
+
geney-1.3.67.dist-info/METADATA,sha256=Quhz5RoxRIVxv0VlKP9NhmIdy0NzcOi3viZ51WIBzm8,990
|
|
29
|
+
geney-1.3.67.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
|
|
30
|
+
geney-1.3.67.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
|
|
31
|
+
geney-1.3.67.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|