geney-1.3.79-py2.py3-none-any.whl → geney-1.4.1-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/utils/splicing_utils.py ADDED
@@ -0,0 +1,525 @@
+ __all__ = ['run_splicing_engine', 'adjoin_splicing_outcomes', 'process_epistasis']
+
+ import pandas as pd
+ from typing import List, Tuple
+
+ def run_splicing_engine(seq: str, engine: str = 'spliceai') -> Tuple[List[float], List[float]]:
+     """
+     Run the specified splicing engine to predict splice site probabilities on a sequence.
+
+     Args:
+         seq: Nucleotide sequence.
+         engine: Engine name ('spliceai' or 'pangolin').
+
+     Returns:
+         Tuple (donor_probs, acceptor_probs) as lists of probability values.
+
+     Raises:
+         ValueError: If the engine is not implemented.
+     """
+     match engine:
+         case 'spliceai':
+             from geney.utils.spliceai_utils import sai_predict_probs, sai_models
+             acceptor_probs, donor_probs = sai_predict_probs(seq, models=sai_models)
+         case 'pangolin':
+             from geney.utils.pangolin_utils import pangolin_predict_probs, pang_models
+             donor_probs, acceptor_probs = pangolin_predict_probs(seq, models=pang_models)
+         case _:
+             raise ValueError(f"Engine '{engine}' not implemented")
+
+     return donor_probs, acceptor_probs
+
+
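For orientation, a minimal usage sketch of run_splicing_engine (the sequence here is made up; this assumes the SpliceAI models that geney.utils.spliceai_utils loads at import time are available in the environment):

    # Hypothetical input; real sequences come from a pre-mRNA window.
    seq = 'CGATGCA' * 100
    donor_probs, acceptor_probs = run_splicing_engine(seq, engine='spliceai')
    assert len(donor_probs) == len(acceptor_probs)
    # run_splicing_engine(seq, engine='maxentscan')  # would raise ValueError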
+ def adjoin_splicing_outcomes(splicing_predictions, transcript=None):
+     """
+     Combine per-variant splicing predictions into a single multi-index column DataFrame.
+
+     Args:
+         splicing_predictions (dict): Dictionary mapping variant labels (e.g. 'wt', 'mut1',
+             'mut2', 'epistasis') to per-position DataFrames with 'donor_prob',
+             'acceptor_prob', and 'nucleotides' columns.
+         transcript: Optional transcript object; when provided, its annotated donor and
+             acceptor positions are flagged and rows are ordered by strand.
+
+     Returns:
+         pd.DataFrame: Multi-index column DataFrame with per-variant donor/acceptor
+             probabilities and, if a transcript is given, annotation flags.
+     """
+     dfs = []
+     for label, splicing_df in splicing_predictions.items():
+         var_df = splicing_df.rename(columns={
+             'donor_prob': ('donors', f'{label}_prob'),
+             'acceptor_prob': ('acceptors', f'{label}_prob'),
+             'nucleotides': ('nts', f'{label}')
+         })
+         dfs.append(var_df)
+
+     # Concatenate all DataFrames and unify columns
+     full_df = pd.concat(dfs, axis=1)
+
+     # Ensure MultiIndex columns
+     if not isinstance(full_df.columns, pd.MultiIndex):
+         full_df.columns = pd.MultiIndex.from_tuples(full_df.columns)
+
+     if transcript is not None:
+         full_df[('acceptors', 'annotated')] = full_df.apply(
+             lambda row: row.name in transcript.acceptors,
+             axis=1
+         )
+
+         full_df[('donors', 'annotated')] = full_df.apply(
+             lambda row: row.name in transcript.donors,
+             axis=1
+         )
+
+         full_df.sort_index(axis=1, level=0, inplace=True)
+         full_df.sort_index(ascending=not transcript.rev, inplace=True)
+     else:
+         full_df.sort_index(axis=1, level=0, inplace=True)
+
+     return full_df
+
+
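A sketch of the expected input shape, with made-up probabilities (real inputs come from the per-variant prediction step):

    import pandas as pd

    # Hypothetical per-variant outputs, indexed by genomic position.
    wt = pd.DataFrame({'donor_prob': [0.92, 0.03], 'acceptor_prob': [0.01, 0.88],
                       'nucleotides': ['G', 'A']}, index=[1100, 1250])
    mut1 = pd.DataFrame({'donor_prob': [0.15, 0.03], 'acceptor_prob': [0.01, 0.88],
                         'nucleotides': ['T', 'A']}, index=[1100, 1250])

    combined = adjoin_splicing_outcomes({'wt': wt, 'mut1': mut1})
    # Columns become a MultiIndex: ('donors', 'wt_prob'), ('donors', 'mut1_prob'),
    # ('acceptors', 'wt_prob'), ..., ('nts', 'wt'), ('nts', 'mut1'), sorted by top level.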
+ def process_epistasis(df: pd.DataFrame, threshold=0.25) -> pd.DataFrame:
+     """
+     Compute the expected (additive) epistasis effect and the residual epistasis
+     for both donor and acceptor probabilities.
+
+     Adds new columns under 'donors' and 'acceptors':
+         - expected_epistasis
+         - residual_epistasis
+
+     Args:
+         df (pd.DataFrame): MultiIndex column DataFrame with keys
+             'wt_prob', 'mut1_prob', 'mut2_prob', 'epistasis_prob'.
+         threshold (float): Minimum absolute residual required to keep a row.
+
+     Returns:
+         pd.DataFrame: Rows whose donor or acceptor residual epistasis exceeds the
+             threshold, with the expected and residual columns added.
+     """
+     for feature in ['donors', 'acceptors']:
+         wt = df[feature]['wt_prob']
+         mut1 = df[feature]['mut1_prob']
+         mut2 = df[feature]['mut2_prob']
+         true_epi = df[feature]['epistasis_prob']
+
+         # Additive null model: each mutation shifts the probability independently
+         expected = mut1 + mut2 - wt
+         residual = true_epi - expected
+
+         df[(feature, 'expected_epistasis')] = expected
+         df[(feature, 'residual_epistasis')] = residual
+
+     df = df.sort_index(axis=1, level=0)
+     mask = (
+         df['donors']['residual_epistasis'].abs() > threshold
+     ) | (
+         df['acceptors']['residual_epistasis'].abs() > threshold
+     )
+
+     return df[mask]
+
+
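The additive null model says that if mut1 shifts a site's probability by d1 and mut2 shifts it by d2, the double mutant should land at wt + d1 + d2, which simplifies to expected = mut1 + mut2 - wt; the residual is how far the observed double mutant deviates from that. A toy check with invented numbers:

    # Toy values (not real predictions): wt = 0.90, mut1 = 0.40, mut2 = 0.85
    expected = 0.40 + 0.85 - 0.90   # 0.35, what additivity predicts
    observed = 0.05                 # hypothetical double-mutant probability
    residual = observed - expected  # -0.30; |residual| > 0.25, so the row is kept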
+ # def predict_splicing(mut_id=None, transcript=None, engine='spliceai'):
+ #     gene = Gene.from_file(mut_id.split(':')[0]).transcript(transcript).generate_pre_mrna()
+ #     if mut_id is None:
+ #         pass
+ #     else:
+ #         for m in mut_id.split('|'):
+ #             gene.pre_mrna.apply_mutation(m)
+ #         gene.pre_mrna.set_name(mut_id)
+ #     return gene.pre_mrna.predict_missplicing(engine=engine, fmt='df')
+ #
+ #
+ # def find_event_splicing(mutations, engine='spliceai'):
+ #     data = epistasis_id.split('|')
+ #     gene = data[0].split(':')[0]
+ #     pos = int(sum([int(p.split(':')[2]) for p in data]) / 2)
+ #     g = Gene.from_file(gene).transcript().generate_pre_mrna()
+ #     transcript = g.clone().pre_mrna
+ #
+ #     muts = [MutSeqMat.from_mutid(m, g.rev) for m in data]
+ #     # if g.rev:
+ #     #     muts = [m.reverse_complement() for m in muts]
+ #
+ #     mut1 = transcript.clone().mutate(muts[0])
+ #     mut2 = transcript.clone().mutate(muts[1])
+ #     epistasis = transcript.clone()
+ #     for m in muts:
+ #         epistasis.mutate(m, inplace=True)
+ #
+ #     wild_type = transcript.predict_splicing(pos, engine=engine)
+ #     mut1 = mut1.predict_splicing(pos, engine=engine)
+ #     mut2 = mut2.predict_splicing(pos, engine=engine)
+ #     epistasis = epistasis.predict_splicing(pos, engine=engine)
+ #
+ #     combined = pd.concat([wild_type, mut1, mut2, epistasis], axis=1, keys=['wild_type', 'mut1', 'mut2', 'epistasis'], join='outer')
+ #     return combined
+ #
+ # # def extract_epistatic_sites(df, site_type_col='site_type', threshold=0.25):
+ # #     """
+ # #     From a multi-index DataFrame with columns like ('wild_type', 'donor_prob'), etc.,
+ # #     compute expected additive effect and epistatic residual for donor and acceptor probabilities.
+ # #     Return only rows where:
+ # #         1. |residual| > threshold
+ # #         2. donor sites have site_type == 1, acceptor sites have site_type == 0
+ # #     """
+ # #     features = ['donor_prob', 'acceptor_prob']
+ # #     expected = {}
+ # #     residual = {}
+ # #
+ # #     for feature in features:
+ # #         wt = df[('wild_type', feature)]
+ # #         mut1 = df[('mut1', feature)]
+ # #         mut2 = df[('mut2', feature)]
+ # #         epi = df[('epistasis', feature)]
+ # #
+ # #         expected_feature = 3 * wt - mut1 - mut2
+ # #         residual_feature = expected_feature - epi
+ # #
+ # #         expected[('expected', feature)] = expected_feature
+ # #         residual[('residual', feature)] = residual_feature
+ # #
+ # #     # Combine new columns
+ # #     expected_df = pd.DataFrame(expected)
+ # #     residual_df = pd.DataFrame(residual)
+ # #
+ # #     # Join to original
+ # #     df_combined = pd.concat([df, expected_df, residual_df], axis=1)
+ # #
+ # #     # Create mask based on residual threshold
+ # #     mask = (
+ # #         (residual_df.abs() > threshold)
+ # #         .any(axis=1)  # at least one feature has large residual
+ # #     )
+ # #
+ # #     # Site type condition: donor=1, acceptor=0
+ # #     donor_mask = df[('wild_type', 'donor_prob')].notna() & (df[site_type_col] == 1)
+ # #     acceptor_mask = df[('wild_type', 'acceptor_prob')].notna() & (df[site_type_col] == 0)
+ # #
+ # #     # Combine all masks
+ # #     final_mask = mask & (donor_mask | acceptor_mask)
+ # #
+ # #     return df_combined[final_mask]
+ # #
+ # #
+ # #     # variability = df.groupby(level=1, axis=1).apply(lambda subdf: subdf.max(axis=1) - subdf.min(axis=1))
+ # #
+ #
+ # """
+ # splicing_module.py
+ #
+ # A modular and comprehensive implementation for splicing, missplicing, and pairwise epistasis analysis.
+ # This module has been refactored with advanced Python practices:
+ #     • Extensive type annotations and detailed docstrings.
+ #     • Decomposition into small, testable functions and classes.
+ #     • Explicit encapsulation of the pairwise epistasis analysis.
+ #     • Usage of Python 3.10+ pattern matching for engine dispatch.
+ #
+ # Dependencies:
+ #     numpy, pandas, sqlite3, json, os, redis, and internal modules: Gene, SeqMats, config, spliceai_utils, and pangolin_utils.
+ # """
+ #
+ # import os
+ # import json
+ # import sqlite3
+ # from collections import defaultdict
+ # from dataclasses import dataclass, field
+ # from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
+ #
+ # import numpy as np
+ # import pandas as pd
+ # from redis import Redis
+ #
+ # # Internal module imports (assumed to be in the same package)
+ # from .Gene import Gene
+ # from .SeqMats import MutSeqMat
+ # from . import config
+ #
+ # # # Type aliases for clarity
+ # # SpliceProbs = Dict[int, float]
+ # # AdjacencyKey = Tuple[int, str]
+ # # AdjacencyValue = Tuple[int, str, float]
+ # # AdjacencyList = Dict[AdjacencyKey, List[AdjacencyValue]]
+ #
+ # def run_splicing_engine(seq: str, engine: str = 'spliceai') -> Tuple[List[float], List[float]]:
+ #     """
+ #     Run the specified splicing engine to predict splice site probabilities on a sequence.
+ #
+ #     Args:
+ #         seq: Nucleotide sequence.
+ #         engine: Engine name ('spliceai' or 'pangolin').
+ #
+ #     Returns:
+ #         Tuple (donor_probs, acceptor_probs) as lists of probability values.
+ #
+ #     Raises:
+ #         ValueError: If the engine is not implemented.
+ #     """
+ #     match engine:
+ #         case 'spliceai':
+ #             from .spliceai_utils import sai_predict_probs, sai_models
+ #             acceptor_probs, donor_probs = sai_predict_probs(seq, models=sai_models)
+ #         case 'pangolin':
+ #             from .pangolin_utils import pangolin_predict_probs, pang_models
+ #             donor_probs, acceptor_probs = pangolin_predict_probs(seq, models=pang_models)
+ #         case _:
+ #             raise ValueError(f"Engine '{engine}' not implemented")
+ #
+ #     return donor_probs, acceptor_probs
+ #
+ #
+ # # =============================================================================
+ # # Helper Functions
+ # # =============================================================================
+ #
+ # def generate_adjacency_list(
+ #         acceptors: List[Tuple[int, float]],
+ #         donors: List[Tuple[int, float]],
+ #         transcript_start: int,
+ #         transcript_end: int,
+ #         max_distance: int = 50,
+ #         rev: bool = False
+ # ) -> AdjacencyList:
+ #     """
+ #     Build an adjacency list from donors to acceptors (and vice versa) based on distance and orientation.
+ #
+ #     Args:
+ #         acceptors: List of tuples (position, probability) for acceptor sites.
+ #         donors: List of tuples (position, probability) for donor sites.
+ #         transcript_start: Start coordinate of the transcript.
+ #         transcript_end: End coordinate of the transcript.
+ #         max_distance: Maximum allowed distance to connect sites.
+ #         rev: If True, consider reverse orientation.
+ #
+ #     Returns:
+ #         A dictionary mapping (position, type) to a list of (neighbor_position, neighbor_type, normalized_probability).
+ #     """
+ #     # Append transcript end as an extra donor node
+ #     donors = donors + [(transcript_end, 1)]
+ #     # Sort acceptors and donors; use reversed ordering if needed
+ #     acceptors = sorted(acceptors, key=lambda x: (x[0], x[1] if not rev else -x[1]), reverse=rev)
+ #     donors = sorted(donors, key=lambda x: (x[0], x[1] if not rev else -x[1]), reverse=rev)
+ #
+ #     adjacency_list: AdjacencyList = defaultdict(list)
+ #
+ #     # Connect donors to acceptors
+ #     for d_pos, d_prob in donors:
+ #         running_prob = 1.0
+ #         for a_pos, a_prob in acceptors:
+ #             # Check orientation and max distance
+ #             correct_orientation = (a_pos > d_pos and not rev) or (a_pos < d_pos and rev)
+ #             distance_valid = abs(a_pos - d_pos) <= max_distance
+ #             if correct_orientation and distance_valid:
+ #                 # Count intervening sites as a simplified penalty
+ #                 in_between_acceptors = sum(1 for a, _ in acceptors if (d_pos < a < a_pos) if not rev else (a_pos < a < d_pos))
+ #                 in_between_donors = sum(1 for d, _ in donors if (d_pos < d < a_pos) if not rev else (a_pos < d < d_pos))
+ #                 # If one set is empty, use raw probability; otherwise use a running product
+ #                 if in_between_donors == 0 or in_between_acceptors == 0:
+ #                     adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob))
+ #                     running_prob -= a_prob
+ #                 elif running_prob > 0:
+ #                     adjacency_list[(d_pos, 'donor')].append((a_pos, 'acceptor', a_prob * running_prob))
+ #                     running_prob -= a_prob
+ #                 else:
+ #                     break
+ #
+ #     # Connect acceptors to donors
+ #     for a_pos, a_prob in acceptors:
+ #         running_prob = 1.0
+ #         for d_pos, d_prob in donors:
+ #             correct_orientation = (d_pos > a_pos and not rev) or (d_pos < a_pos and rev)
+ #             distance_valid = abs(d_pos - a_pos) <= max_distance
+ #             if correct_orientation and distance_valid:
+ #                 in_between_acceptors = sum(1 for a, _ in acceptors if (a_pos < a < d_pos) if not rev else (d_pos < a < a_pos))
+ #                 in_between_donors = sum(1 for d, _ in donors if (a_pos < d < d_pos) if not rev else (d_pos < d < a_pos))
+ #                 # Tag the donor as transcript_end if appropriate
+ #                 tag = 'donor' if d_pos != transcript_end else 'transcript_end'
+ #                 if in_between_acceptors == 0:
+ #                     adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob))
+ #                     running_prob -= d_prob
+ #                 elif running_prob > 0:
+ #                     adjacency_list[(a_pos, 'acceptor')].append((d_pos, tag, d_prob * running_prob))
+ #                     running_prob -= d_prob
+ #                 else:
+ #                     break
+ #
+ #     # Connect transcript start to donors
+ #     running_prob = 1.0
+ #     for d_pos, d_prob in donors:
+ #         if ((d_pos > transcript_start and not rev) or (d_pos < transcript_start and rev)) and abs(d_pos - transcript_start) <= max_distance:
+ #             adjacency_list[(transcript_start, 'transcript_start')].append((d_pos, 'donor', d_prob))
+ #             running_prob -= d_prob
+ #             if running_prob <= 0:
+ #                 break
+ #
+ #     # Normalize probabilities in each adjacency list
+ #     for key, next_nodes in adjacency_list.items():
+ #         total = sum(prob for _, _, prob in next_nodes)
+ #         if total > 0:
+ #             adjacency_list[key] = [(pos, typ, round(prob / total, 3)) for pos, typ, prob in next_nodes]
+ #
+ #     return dict(adjacency_list)
+ #
+ #
+ # def find_all_paths(
+ #         graph: AdjacencyList,
+ #         start: Tuple[int, str],
+ #         end: Tuple[int, str],
+ #         path: List[Tuple[int, str]] = [],
+ #         probability: float = 1.0
+ # ) -> Generator[Tuple[List[Tuple[int, str]], float], None, None]:
+ #     """
+ #     Recursively generate all paths from start node to end node in the graph.
+ #
+ #     Args:
+ #         graph: Adjacency list mapping nodes to neighbor nodes and probabilities.
+ #         start: The starting node (position, type).
+ #         end: The target node (position, type).
+ #         path: The path traversed so far.
+ #         probability: The cumulative probability along the path.
+ #
+ #     Yields:
+ #         A tuple of the complete path and its cumulative probability.
+ #     """
+ #     path = path + [start]
+ #     if start == end:
+ #         yield path, probability
+ #         return
+ #
+ #     if start not in graph:
+ #         return
+ #
+ #     for next_node, node_type, prob in graph[start]:
+ #         yield from find_all_paths(graph, (next_node, node_type), end, path, probability * prob)
+ #
+ #
+ # def prepare_splice_sites(
+ #         acceptors: List[int],
+ #         donors: List[int],
+ #         aberrant_splicing: Dict[str, Any]
+ # ) -> Tuple[List[Tuple[int, float]], List[Tuple[int, float]]]:
+ #     """
+ #     Prepare splice sites by merging reference sites with aberrant events.
+ #
+ #     Args:
+ #         acceptors: List of acceptor positions.
+ #         donors: List of donor positions.
+ #         aberrant_splicing: Dictionary containing aberrant splicing events.
+ #
+ #     Returns:
+ #         Tuple of lists:
+ #             - List of tuples (acceptor_position, probability)
+ #             - List of tuples (donor_position, probability)
+ #     """
+ #     acceptor_dict = {p: 1 for p in acceptors}
+ #     donor_dict = {p: 1 for p in donors}
+ #
+ #     for p, v in aberrant_splicing.get('missed_donors', {}).items():
+ #         donor_dict[p] = v['absolute']
+ #     for p, v in aberrant_splicing.get('discovered_donors', {}).items():
+ #         donor_dict[p] = v['absolute']
+ #     for p, v in aberrant_splicing.get('missed_acceptors', {}).items():
+ #         acceptor_dict[p] = v['absolute']
+ #     for p, v in aberrant_splicing.get('discovered_acceptors', {}).items():
+ #         acceptor_dict[p] = v['absolute']
+ #
+ #     # Ensure keys are integers
+ #     acceptors_list = [(int(k), float(v)) for k, v in acceptor_dict.items()]
+ #     donors_list = [(int(k), float(v)) for k, v in donor_dict.items()]
+ #     return acceptors_list, donors_list
+ #
+ #
+ # def develop_aberrant_splicing(
+ #         transcript: Any,
+ #         aberrant_splicing: Any
+ # ) -> Generator[Dict[str, Any], None, None]:
+ #     """
+ #     Generator of potential aberrant splicing paths based on the transcript and missplicing events.
+ #
+ #     If no aberrant events are provided, returns the original splice sites.
+ #
+ #     Args:
+ #         transcript: Transcript object containing splice site positions.
+ #         aberrant_splicing: Object with missplicing events.
+ #
+ #     Yields:
+ #         Dictionary with keys 'acceptors', 'donors', and 'path_weight'.
+ #     """
+ #     if not aberrant_splicing:
+ #         yield {
+ #             'acceptors': transcript.acceptors,
+ #             'donors': transcript.donors,
+ #             'path_weight': 1
+ #         }
+ #     else:
+ #         all_acceptors, all_donors = prepare_splice_sites(
+ #             transcript.acceptors, transcript.donors, aberrant_splicing.missplicing
+ #         )
+ #         adj_list = generate_adjacency_list(
+ #             all_acceptors,
+ #             all_donors,
+ #             transcript_start=transcript.transcript_start,
+ #             transcript_end=transcript.transcript_end,
+ #             max_distance=100000,
+ #             rev=transcript.rev
+ #         )
+ #         start_node = (transcript.transcript_start, 'transcript_start')
+ #         end_node = (transcript.transcript_end, 'transcript_end')
+ #         for path, prob in find_all_paths(adj_list, start_node, end_node):
+ #             yield {
+ #                 'acceptors': [node[0] for node in path if node[1] == 'acceptor'],
+ #                 'donors': [node[0] for node in path if node[1] == 'donor'],
+ #                 'path_weight': prob
+ #             }
+ #
+ #
+ # def find_ss_changes(
+ #         ref_dct: Dict[int, float],
+ #         mut_dct: Dict[int, float],
+ #         known_splice_sites: Union[List[int], np.ndarray],
+ #         threshold: float = 0.5
+ # ) -> Tuple[Dict[float, Dict[str, float]], Dict[float, Dict[str, float]]]:
+ #     """
+ #     Compare reference and mutant splice probabilities to detect significant site changes.
+ #
+ #     Args:
+ #         ref_dct: Dictionary of splice site probabilities for the reference sequence.
+ #         mut_dct: Dictionary of splice site probabilities for the mutant sequence.
+ #         known_splice_sites: List/array of positions that are known splice sites.
+ #         threshold: Minimum difference required to flag a significant change.
+ #
+ #     Returns:
+ #         A tuple (discovered, deleted) where:
+ #             - discovered: Positions with a positive delta and not known splice sites.
+ #             - deleted: Positions with a negative delta among known splice sites.
+ #     """
+ #     all_positions = set(ref_dct.keys()).union(mut_dct.keys())
+ #     delta_dict = {pos: mut_dct.get(pos, 0) - ref_dct.get(pos, 0) for pos in all_positions}
+ #
+ #     discovered = {
+ #         float(k): {
+ #             'delta': round(float(delta), 3),
+ #             'absolute': round(float(mut_dct.get(k, 0)), 3),
+ #             'reference': round(ref_dct.get(k, 0), 3)
+ #         }
+ #         for k, delta in delta_dict.items() if delta >= threshold and k not in known_splice_sites
+ #     }
+ #     deleted = {
+ #         float(k): {
+ #             'delta': round(float(delta), 3),
+ #             'absolute': round(float(mut_dct.get(k, 0)), 3),
+ #             'reference': round(ref_dct.get(k, 0), 3)
+ #         }
+ #         for k, delta in delta_dict.items() if -delta >= threshold and k in known_splice_sites
+ #     }
+ #     return discovered, deleted
geney/utils/utils.py ADDED
@@ -0,0 +1,89 @@
+ __all__ = ['is_monotonic', 'contains', 'unload_json', 'unload_pickle', 'dump_json', 'dump_pickle', 'generate_random_nucleotide_sequences']
+
+ import pickle
+ import json
+ from bisect import bisect_left
+ import hashlib
+
+ # def is_monotonic(A):
+ #     x, y = [], []
+ #     x.extend(A)
+ #     y.extend(A)
+ #     x.sort()
+ #     y.sort(reverse=True)
+ #     if (x == A or y == A):
+ #         return True
+ #     return False
+
+
+ # def available_genes(organism='hg38'):
+ #     from geney import config
+ #     annotation_path = config[organism]['MRNA_PATH'] / 'protein_coding'
+ #     return sorted(list(set([m.stem.split('_')[-1] for m in annotation_path.glob('*')])))
+
+
+ def contains(a, x):
+     """Return True if the sorted sequence `a` contains `x` (binary search via bisect)."""
+     i = bisect_left(a, x)
+     return i != len(a) and a[i] == x
+
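Because bisect_left assumes sorted input, the lookup is O(log n) instead of the O(n) of `x in a`. A quick sanity check:

    positions = [3, 7, 7, 12, 40]   # must already be sorted
    assert contains(positions, 12) is True
    assert contains(positions, 13) is False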
+
+ def unload_json(file_path):
+     with open(file_path, 'r') as f:
+         data = json.load(f)
+     return data
+
+
+ def dump_json(file_path, payload):
+     with open(file_path, 'w') as f:
+         json.dump(payload, f)
+     return None
+
+
+ def unload_pickle(file_path):
+     with open(file_path, 'rb') as f:
+         data = pickle.load(f)
+     return data
+
+
+ def dump_pickle(file_path, payload):
+     with open(file_path, 'wb') as f:
+         pickle.dump(payload, f)
+     return None
+
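These four loaders/dumpers are thin wrappers over the standard library; a round-trip sketch (paths are illustrative):

    payload = {'gene': 'KRAS', 'sites': [25227343, 25245274]}   # made-up payload
    dump_json('/tmp/demo.json', payload)
    assert unload_json('/tmp/demo.json') == payload

    dump_pickle('/tmp/demo.pkl', payload)
    assert unload_pickle('/tmp/demo.pkl') == payload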
+
+
+ def is_monotonic(A):
+     """Return True if the sequence A is entirely non-decreasing or non-increasing."""
+     return all(x <= y for x, y in zip(A, A[1:])) or all(x >= y for x, y in zip(A, A[1:]))
+
+
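Each zip pass compares adjacent pairs, so the check is O(n); examples:

    assert is_monotonic([1, 2, 2, 5])   # non-decreasing
    assert is_monotonic([9, 4, 4, 1])   # non-increasing
    assert not is_monotonic([1, 3, 2])
    assert is_monotonic([])             # vacuously true for empty input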
+ def generate_random_nucleotide_sequences(num_sequences, min_len=3, max_len=10):
+     """
+     Generate random nucleotide sequences of varying lengths.
+
+     Parameters:
+         num_sequences (int): Number of sequences to generate.
+         min_len (int): Minimum sequence length (inclusive).
+         max_len (int): Maximum sequence length (inclusive).
+
+     Returns:
+         list: A list of random nucleotide sequences.
+     """
+     import random
+     nucleotides = ['A', 'C', 'G', 'T']
+     # Include max_len itself; range(min_len, max_len) would stop one short
+     lengths = list(range(min_len, max_len + 1))
+     sequences = [
+         ''.join(random.choices(nucleotides, k=random.choice(lengths)))
+         for _ in range(num_sequences)
+     ]
+     return sequences
+
+
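Usage sketch (output is random; with the inclusive range above, lengths fall in [min_len, max_len]):

    import random
    random.seed(0)   # only for reproducibility in this sketch
    seqs = generate_random_nucleotide_sequences(3, min_len=4, max_len=6)
    # e.g. three strings over {A, C, G, T}, each 4-6 bases long
    assert len(seqs) == 3
    assert all(4 <= len(s) <= 6 for s in seqs)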
+ def short_hash_of_list(numbers, length=5):
+     """Return the first `length` hex characters of the SHA-256 hash of repr(numbers)."""
+     encoded = repr(numbers).encode('utf-8')
+     full_hash = hashlib.sha256(encoded).hexdigest()
+     return full_hash[:length]
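Since the hash is taken over repr(numbers), the same values in the same order always map to the same short tag, which makes it usable as a compact cache key; a sketch:

    key = short_hash_of_list([100, 250, 471])
    assert len(key) == 5
    assert key == short_hash_of_list([100, 250, 471])   # deterministic
    # different order gives a different tag (barring a truncated-hash collision)
    assert key != short_hash_of_list([250, 100, 471])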
geney-1.4.1.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geney
- Version: 1.3.79
+ Version: 1.4.1
  Summary: A Python package for gene expression modeling.
  Home-page: https://github.com/nicolaslynn/geney
  Author: Nicolas Lynn
geney-1.4.1.dist-info/RECORD ADDED
@@ -0,0 +1,51 @@
+ geney/Fasta_segment.py,sha256=99HxNGNh_MfdVW6hhtlb1vOn7eSmT7oFoEfHDFMxG8w,11275
+ geney/Gene.py,sha256=G-5ROebtvbVazzPlsBJ1r2DEduCwsIA5S8_TmBuoyjw,7030
+ geney/Oncosplice.py,sha256=ETAvMl_Oq6mEJQHPNwdDO5csX6Ahuped_om10KifCyM,17739
+ geney/SeqMats.py,sha256=9-eJnfU2w3LGc0XvVvFEO_QrBneTkC6xkZKDfTcEw5o,19282
+ geney/SpliceSimulator.py,sha256=eVXEpczq3fqQpfmgn-xCnJdCiNYG9TwW3LkSPdyeFpI,18376
+ geney/Transcript.py,sha256=Wu0UiubFOdasfPCpe9uGfhPDG4MNks5LzUqGzo85ong,14458
+ geney/__init__.py,sha256=YLWXJS53yeryp6nVhCgFg3_Du9Guj9y3iSrdfx61q5Y,3017
+ geney/_config_setup.py,sha256=nblcGU3HIt8YjdrAoGfbEVKRxwJKv0PikJ5-7AL6axQ,723
+ geney/_graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
+ geney/_gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
+ geney/_immune_utils.py,sha256=b-8dRcCti7xsU7RG3op18lkSnAD8dp_BymGaR-hbNcI,5272
+ geney/_mutation_utils.py,sha256=dHssUsnii_mf-wuRoMmF13UlD7k3ml_VwQMItTYnXpU,1132
+ geney/_oncosplice.py,sha256=UkGPJqHSKK_XVsDp-03Baa3ks5ePb_1f1EB0wbkKrDo,35527
+ geney/_splicing_utils.py,sha256=Zda6MD0e81p46_y6A240W97d1TP4dakLhG2WT0kSN5U,31473
+ geney/_survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
+ geney/_tcga_utils.py,sha256=uJhVnTbTysj0XrEw_YeDKRSLexsqgBLYQdhl7_hnr64,17611
+ geney/_tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
+ geney/allele_linkage.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ geney/config_setup.py,sha256=nblcGU3HIt8YjdrAoGfbEVKRxwJKv0PikJ5-7AL6axQ,723
+ geney/data_setup.py,sha256=2RHmuvcGUQbEglXQEZr0C2QPDTQYRZOEm0EcmyfQJgU,12229
+ geney/graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
+ geney/gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
+ geney/immune_utils.py,sha256=b-8dRcCti7xsU7RG3op18lkSnAD8dp_BymGaR-hbNcI,5272
+ geney/mutation_utils.py,sha256=C_kv2MB_L8LlhX3W2ooXjJ3uDoJ8zX1WeDtZKoBZJkI,1547
+ geney/pangolin_utils.py,sha256=9jdBXlOcRaUdfi-UpUxHA0AkTMZkUF-Lt7HVZ1nEm3s,2973
+ geney/pipelines.py,sha256=XeC4NTqxzHzrGX6HIBXe8pe4pELpjYLgTmcOt7ESN0g,2916
+ geney/power_utils.py,sha256=orOhsr9vkQ-Y4nD1zHj_MmR2J3uYiUsiklqVy-5T-2M,7331
+ geney/seqmat_utils.py,sha256=wzb3PX5it5bpIFQvcxyzlxfhoJTbHHbsjg0rzh05iVs,19753
+ geney/spliceai_utils.py,sha256=nyBnLdYs1rB-duA9lfJYM9Q2xNlvZA3I_sCJ1z5WjFw,3294
+ geney/splicing_utils.py,sha256=UkG2YphjLNUYsv3o3RGUTW1ScHbEMOLL2M_7WbgDVME,47466
+ geney/survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
+ geney/tcga_utils.py,sha256=uJhVnTbTysj0XrEw_YeDKRSLexsqgBLYQdhl7_hnr64,17611
+ geney/tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
+ geney/utils.py,sha256=KBdwNIywo7INVEQEsuIXauEJobvReE9TXAi5qqXanSI,2775
+ geney/translation_initiation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4woLFSHroWIETc,4448
+ geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
+ geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
+ geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
+ geney/utils/SeqMats.py,sha256=vjU0lTkB0s0RoLjNXLqt0kJQDni-it09-iAOv5QAYFs,17686
+ geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
+ geney/utils/__init__.py,sha256=jCoB0doidTbCFT34Yx8gQROcZOsw4LnqhgkwRgGQWt0,693
+ geney/utils/mutation_utils.py,sha256=r-pHr56gEa5kh_DPX8MjFY3ZfYaOtyo4CUfJ5ZHlXPw,3243
+ geney/utils/pangolin_utils.py,sha256=EUadXPxY7QUnsQrlyO7K5cg9mi5ssZjSDvNa_SzoBQg,6160
+ geney/utils/spliceai_utils.py,sha256=oRrGJqjWirzYmiBmUR9hGr4B7V_7Y1uMyRTmbFKc_t0,4539
+ geney/utils/splicing_utils.py,sha256=_Df3SakZrDjs2yKLG05TtfwuoXDLLrZWc9Y8i79rFDM,20633
+ geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
+ geney-1.4.1.dist-info/METADATA,sha256=HyC14VKqeu1HExC-vasEc4yhH73fN1hI_L5fAE1UXgo,989
+ geney-1.4.1.dist-info/WHEEL,sha256=Kh9pAotZVRFj97E15yTA4iADqXdQfIVTHcNaZTjxeGM,110
+ geney-1.4.1.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+ geney-1.4.1.dist-info/RECORD,,
geney-1.4.1.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.1.0)
+ Generator: bdist_wheel (0.45.1)
  Root-Is-Purelib: true
  Tag: py2-none-any
  Tag: py3-none-any