geney 1.4.15__py2.py3-none-any.whl → 1.4.17__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geney/Transcript.py CHANGED
@@ -5,7 +5,7 @@ import copy
5
5
  from Bio.Seq import Seq # Assuming Biopython is used
6
6
  from . import config
7
7
  from .utils.utils import unload_pickle
8
- from .utils.SeqMats import SeqMat #, MutSeqMat
8
+ from .utils.SeqMatsOld import SeqMat #, MutSeqMat
9
9
  from .utils.Fasta_segment import Fasta_segment
10
10
 
11
11
  class Transcript:
geney/_oncosplice.py CHANGED
@@ -4,7 +4,7 @@ from datetime import datetime
4
4
  from tqdm import tqdm
5
5
  import pandas as pd
6
6
  import numpy as np
7
- from geney.utils.SeqMats import MutSeqMat
7
+ from geney.utils.SeqMatsOld import MutSeqMat
8
8
  from ._splicing_utils import find_transcript_missplicing_seqs, develop_aberrant_splicing, Missplicing
9
9
  from .Gene import Gene
10
10
 
geney/_splicing_utils.py CHANGED
@@ -2,7 +2,7 @@ import numpy as np
2
2
  import pandas as pd
3
3
 
4
4
  from .Gene import Gene
5
- from geney.utils.SeqMats import MutSeqMat
5
+ from geney.utils.SeqMatsOld import MutSeqMat
6
6
  from collections import defaultdict
7
7
 
8
8
 
geney/utils/SeqMats.py CHANGED
@@ -1,11 +1,15 @@
1
1
  __all__ = ['SeqMat', 'format_mut_id']
2
2
 
3
+
4
+ from __future__ import annotations
3
5
  from dataclasses import dataclass, field
6
+ from typing import List, Tuple, Union, Optional
4
7
  from collections import defaultdict
5
- from typing import Optional, Union, List, Tuple
6
8
  import numpy as np
7
9
  import pandas as pd
8
10
 
11
+
12
+
9
13
  def format_mut_id(text):
10
14
  import re
11
15
  # text = "TP53:17:7579472:G:A"
@@ -25,417 +29,214 @@ def format_mut_id(text):
25
29
  return None
26
30
 
27
31
 
32
+
28
33
  @dataclass(slots=True)
29
34
  class SeqMat:
30
35
  """Represents a genomic sequence matrix used for training."""
31
- # Metadata fields (uncomment and/or extend as needed)
32
- name: str = field(default="Unnamed Sequence", metadata={"description": "Name of the sequence"})
33
- version: str = field(default="1.0", metadata={"description": "Version of the dataset"})
34
- source: str = field(default="Unknown", metadata={"description": "Source of the sequence data"})
35
- notes: dict = field(default_factory=dict, metadata={"description": "User-defined metadata dictionary"})
36
+ name: str = field(default="Unnamed Sequence")
37
+ version: str = field(default="1.0")
38
+ source: str = field(default="Unknown")
39
+ notes: dict = field(default_factory=dict)
36
40
 
37
41
  seq_array: np.ndarray = field(init=False, repr=False)
38
42
  insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
39
43
  rev: bool = field(default=False, init=False, repr=False)
40
-
41
44
  predicted_splicing: pd.DataFrame = field(init=False, repr=False)
42
45
  _pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
43
46
 
44
47
  def __init__(
45
- self,
46
- nucleotides: str,
47
- index: np.ndarray,
48
- conservation: Optional[np.ndarray] = None,
49
- reference_nucleotides: Optional[np.ndarray] = None,
50
- notes: Optional[dict] = None,
51
- source: Optional[str] = None,
52
- rev: Optional[bool] = False,
53
- name: Optional[str] = 'wild_type',
54
- version: Optional[str] = 'none'
55
-
48
+ self,
49
+ nucleotides: str,
50
+ index: np.ndarray,
51
+ conservation: Optional[np.ndarray] = None,
52
+ reference_nucleotides: Optional[np.ndarray] = None,
53
+ notes: Optional[dict] = None,
54
+ source: Optional[str] = None,
55
+ rev: Optional[bool] = False,
56
+ name: Optional[str] = 'wild_type',
57
+ version: Optional[str] = 'none'
56
58
  ) -> None:
59
+ # Initialize metadata
60
+ self.name = name
61
+ self.version = version
62
+ self.source = source or "Unknown"
63
+ self.notes = notes or {}
64
+ self.rev = rev
57
65
  self.predicted_splicing = None
58
- nucleotides = np.array(list(nucleotides))
59
- L = nucleotides.shape[0]
66
+
67
+ # Build structured array
68
+ nts = np.array(list(nucleotides), dtype='S1')
69
+ L = len(nts)
60
70
  if index.shape[0] != L:
61
- raise ValueError("Indices array length must match nucleotide sequence length.")
71
+ raise ValueError("Indices length must match sequence length.")
62
72
  if conservation is not None and conservation.shape[0] != L:
63
- raise ValueError("Conservation vector length must match sequence length.")
64
- if reference_nucleotides is not None and reference_nucleotides.shape[0] != L:
65
- raise ValueError("Reference nucleotide vector length must match sequence length.")
73
+ raise ValueError("Conservation length must match sequence length.")
74
+ if reference_nucleotides is not None and len(reference_nucleotides) != L:
75
+ raise ValueError("Reference nucleotides length must match sequence length.")
66
76
 
67
77
  dtype = np.dtype([
68
- ("nt", "S1"),
69
- ("index", np.float64),
70
- ("ref", "S1"),
71
- ("cons", np.float32),
72
- ("valid_mask", bool),
78
+ ('nt', 'S1'),
79
+ ('index', np.float64),
80
+ ('ref', 'S1'),
81
+ ('cons', np.float32),
82
+ ('valid_mask', bool)
73
83
  ])
74
-
75
84
  self.seq_array = np.empty(L, dtype=dtype)
76
- self.seq_array["nt"] = nucleotides
77
- # Use provided reference nucleotides if available.
78
- self.seq_array["ref"] = nucleotides if reference_nucleotides is None else reference_nucleotides
79
- self.seq_array["index"] = index
80
- self.seq_array["cons"] = np.nan if conservation is None else conservation
81
- self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
85
+ self.seq_array['nt'] = nts
86
+ self.seq_array['ref'] = nts if reference_nucleotides is None else np.array(reference_nucleotides, dtype='S1')
87
+ self.seq_array['index'] = index
88
+ self.seq_array['cons'] = (np.zeros(L, dtype='f4') if conservation is None else conservation)
89
+ self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
90
+
91
+ # Initialize helpers
82
92
  self.insertion_counters = defaultdict(int)
83
- self._pos_to_idx = {pos: i for i, pos in enumerate(self.seq_array["index"])}
93
+ self._build_index_map()
84
94
 
85
- self.source = source if source is not None else "Unknown"
86
- self.notes = notes if notes is not None else {}
87
- self.name = name
88
- self.rev = rev
89
- self.version = version
95
+ def _build_index_map(self):
96
+ """Rebuild position-to-index lookup."""
97
+ self._pos_to_idx = {float(pos): i for i, pos in enumerate(self.seq_array['index'])}
90
98
 
91
99
  def __len__(self) -> int:
92
- return int(self.seq_array["valid_mask"].sum())
93
-
94
- def __repr__(self):
95
- return f"<SeqMat: {self.seq}>"
96
-
97
- def __str__(self):
98
- return self.seq
99
-
100
- def get_metadata(self) -> dict:
101
- """Retrieve all metadata as a dictionary."""
102
- return {
103
- "name": self.name,
104
- "source": self.source,
105
- "version": self.version,
106
- "notes": self.notes
107
- }
100
+ return int(self.seq_array['valid_mask'].sum())
108
101
 
109
102
  @property
110
103
  def seq(self) -> str:
111
- return self.seq_array["nt"][self.seq_array["valid_mask"]].tobytes().decode()
104
+ return self.seq_array['nt'][self.seq_array['valid_mask']].tobytes().decode()
112
105
 
113
106
  @property
114
107
  def index(self) -> np.ndarray:
115
- return self.seq_array["index"][self.seq_array["valid_mask"]]
108
+ return self.seq_array['index'][self.seq_array['valid_mask']]
116
109
 
117
110
  @property
118
111
  def conservation(self) -> np.ndarray:
119
- return self.seq_array["cons"][self.seq_array["valid_mask"]]
120
-
121
- @property
122
- def max_index(self) -> float:
123
- return self.seq_array["index"].max()
124
-
125
- @property
126
- def min_index(self) -> float:
127
- return self.seq_array["index"].min()
128
-
129
- @property
130
- def start(self) -> float:
131
- return self.min_index
132
-
133
- @property
134
- def end(self) -> float:
135
- return self.max_index
136
-
137
- @property
138
- def mutated_positions(self) -> np.ndarray:
139
- return (self.seq_array["ref"] != self.seq_array["nt"])[self.seq_array["valid_mask"]].astype(int)
140
-
141
- def clone(self, start: Optional[int] = None, end: Optional[int] = None) -> "SeqMat":
142
- cloned = SeqMat.__new__(SeqMat)
112
+ return self.seq_array['cons'][self.seq_array['valid_mask']]
113
+
114
+ def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
115
+ new = SeqMat.__new__(SeqMat)
116
+ # copy metadata
117
+ new.name = self.name
118
+ new.version = self.version
119
+ new.source = self.source
120
+ new.notes = self.notes.copy()
121
+ new.rev = self.rev
122
+ new.predicted_splicing = None
123
+ new.insertion_counters = defaultdict(int)
124
+
125
+ # slice or full copy
143
126
  if start is not None and end is not None:
144
- cloned.seq_array = self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= end)]
127
+ mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= end)
128
+ new.seq_array = self.seq_array[mask].copy()
145
129
  else:
146
- cloned.seq_array = self.seq_array.copy()
147
- cloned.insertion_counters = defaultdict(int)
148
- cloned.name = self.name
149
- cloned.source = self.source
150
- cloned.version = self.version
151
- cloned.notes = self.notes.copy()
152
- cloned.rev = self.rev
153
-
154
- cloned._pos_to_idx = {pos: i for i, pos in enumerate(cloned.seq_array["index"])}
155
-
156
- return cloned
157
-
158
- def apply_mutation(self, pos: int, ref: str, alt: str, only_snps: bool = False):
159
- """
160
- Applies a mutation (SNP, substitution, insertion, or deletion) to the sequence.
161
-
162
- Parameters:
163
- pos (int): The reference position where the mutation should occur.
164
- ref (str): The reference allele (use '-' for insertions).
165
- alt (str): The alternate allele (use '-' for deletions).
166
- only_snps (bool): If True, only SNP substitutions are allowed; indels are ignored.
167
-
168
- Returns:
169
- SeqMat: The mutated sequence matrix.
170
-
171
- The method normalizes the mutation (dropping any shared prefix) and then applies:
172
- - A SNP/substitution if both alleles are non-gap.
173
- - An insertion if ref is '-' (after normalization).
174
- - A deletion if alt is '-' (after normalization).
175
-
176
- For insertions, new rows are added with fractional indices computed from an insertion counter.
177
- For deletions, the corresponding rows are removed.
178
- """
179
- return_to_rc = False
130
+ new.seq_array = self.seq_array.copy()
131
+
132
+ new._build_index_map()
133
+ return new
134
+
135
+ def apply_mutation(self, pos: float, ref: str, alt: str, only_snps: bool = False) -> SeqMat:
136
+ """Apply a single mutation to this SeqMat."""
137
+ # reverse-complement context
180
138
  if self.rev:
181
- return_to_rc = True
182
139
  self.reverse_complement()
183
140
 
184
- # Normalize shared prefix (similar to left-alignment in VCFs)
141
+ # left-normalize
185
142
  while ref and alt and ref[0] == alt[0]:
186
143
  pos += 1
187
- ref = ref[1:] or "-"
188
- alt = alt[1:] or "-"
144
+ ref = ref[1:] or '-'
145
+ alt = alt[1:] or '-'
189
146
 
190
- # Case 1: SNP or multi-base substitution
191
- if ref != "-" and alt != "-":
147
+ # substitution
148
+ if ref != '-' and alt != '-':
192
149
  if len(ref) != len(alt):
193
- raise ValueError("Substitution mutations must have alleles of equal length.")
194
-
195
- # pos_idx = np.searchsorted(self.seq_array["index"], pos)
196
- pos_idx = self._pos_to_idx.get(pos)
197
-
198
- if pos_idx is None:
199
- raise ValueError(f"Position {pos} not found in index")
200
-
201
- end_idx = pos_idx + len(ref)
202
- if end_idx > len(self.seq_array):
203
- raise ValueError(f"Substitution range exceeds sequence length at position {pos}.")
204
-
205
- segment = self.seq_array["ref"][pos_idx:end_idx].tobytes().decode()
206
- if segment != ref:
207
- raise ValueError(f"Reference mismatch at position {pos}: expected '{ref}', found '{segment}'")
208
-
209
- # ref_segment = self.seq_array["ref"][pos_idx:end_idx]
210
- # expected_segment = np.frombuffer(ref.encode(), dtype='S1')
211
- # if not np.all(ref_segment == np.frombuffer(ref.encode(), dtype='S1')):
212
- # actual_str = ref_segment.tobytes().decode()
213
- # raise ValueError(f"Reference mismatch at position {pos}: expected '{ref}', found '{actual_str}'")
214
- # self.seq_array["nt"][pos_idx:end_idx] = np.frombuffer(alt.encode(), dtype='S1')
215
-
216
- for i, nt in enumerate(alt):
217
- self.seq_array["nt"][pos_idx + i] = nt.encode()
218
-
219
- # Case 2: Insertion (ref is '-' means nothing was present, and we need to add bases)
220
- elif ref == "-" and alt != "-":
150
+ raise ValueError("Substitution requires equal-length alleles.")
151
+ idx = self._pos_to_idx.get(pos)
152
+ if idx is None:
153
+ raise KeyError(f"Position {pos} not found.")
154
+ end = idx + len(ref)
155
+ if end > len(self.seq_array):
156
+ raise IndexError(f"Out of bounds at {pos}.")
157
+ # verify reference
158
+ ref_seg = self.seq_array['ref'][idx:end]
159
+ if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
160
+ raise ValueError(f"Ref mismatch at {pos}.")
161
+ # assign alt
162
+ self.seq_array['nt'][idx:end] = np.frombuffer(alt.encode(), dtype='S1')
163
+
164
+ # insertion
165
+ elif ref == '-' and alt != '-':
221
166
  if only_snps:
222
- return self # Skip if indels are not allowed.
223
- pos_idx = np.searchsorted(self.seq_array["index"], pos)
224
- insertion_count = self.insertion_counters[pos]
167
+ return self
168
+ idx = self._pos_to_idx.get(pos)
169
+ if idx is None:
170
+ raise KeyError(f"Position {pos} not found.")
171
+ cnt = self.insertion_counters[pos]
225
172
  eps = 1e-6
226
173
  new_rows = []
227
174
  for i, nt in enumerate(alt):
228
- new_index = pos + (insertion_count + i + 1) * eps
229
- new_row = (nt.encode(), new_index, b"-", np.float32(np.nan), True)
230
- new_rows.append(new_row)
231
- rows = list(self.seq_array)
232
- rows.extend(new_rows)
233
- new_seq_array = np.array(rows, dtype=self.seq_array.dtype)
234
- new_seq_array.sort(order="index")
235
- self.seq_array = new_seq_array
175
+ new_rows.append((nt.encode(),
176
+ pos + (cnt + i + 1)*eps,
177
+ b'-',
178
+ np.nan,
179
+ True))
180
+ self._insert_rows(idx, new_rows)
236
181
  self.insertion_counters[pos] += len(alt)
237
182
 
238
- # Case 3: Deletion (alt is '-' means bases are to be removed)
239
- elif alt == "-" and ref != "-":
183
+ # deletion
184
+ elif alt == '-' and ref != '-':
240
185
  if only_snps:
241
- return self # Skip if indels are not allowed.
242
- pos_idx = np.searchsorted(self.seq_array["index"], pos)
243
- end_idx = pos_idx + len(ref)
244
- if end_idx > len(self.seq_array):
245
- raise ValueError(f"Deletion range exceeds sequence length at position {pos}.")
246
- segment = self.seq_array["ref"][pos_idx:end_idx].tobytes().decode()
247
- if segment != ref:
248
- raise ValueError(
249
- f"Reference mismatch for deletion at position {pos}: expected '{ref}', found '{segment}'")
250
- self.seq_array = np.delete(self.seq_array, np.s_[pos_idx:end_idx])
186
+ return self
187
+ idx = self._pos_to_idx.get(pos)
188
+ if idx is None:
189
+ raise KeyError(f"Position {pos} not found.")
190
+ end = idx + len(ref)
191
+ # verify
192
+ ref_seg = self.seq_array['ref'][idx:end]
193
+ if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
194
+ raise ValueError(f"Ref mismatch at {pos}.")
195
+ self.seq_array = np.delete(self.seq_array, np.s_[idx:end])
196
+
251
197
  else:
252
- raise ValueError("Unsupported mutation type. Provide valid ref and alt values.")
198
+ raise ValueError("Unsupported mutation type.")
253
199
 
254
- self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
255
- if return_to_rc:
256
- self.reverse_complement()
200
+ # update mask & index map
201
+ self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
202
+ self._build_index_map()
257
203
 
204
+ # restore orientation
205
+ if self.rev:
206
+ self.reverse_complement()
258
207
  return self
259
208
 
209
+ def _insert_rows(self, idx: int, rows: List[tuple]):
210
+ """Helper to insert new rows efficiently and resort."""
211
+ arr = self.seq_array.tolist()
212
+ arr[idx:idx] = rows
213
+ new = np.array(arr, dtype=self.seq_array.dtype)
214
+ new.sort(order='index')
215
+ self.seq_array = new
216
+
217
+ def complement(self) -> SeqMat:
218
+ comp = {b'A':b'T', b'T':b'A', b'C':b'G', b'G':b'C', b'-':b'-'}
219
+ nts = np.array([comp[x] for x in self.seq_array['nt']], dtype='S1')
220
+ new = self.clone()
221
+ new.seq_array['nt'] = nts
222
+ return new
223
+
224
+ def reverse_complement(self) -> SeqMat:
225
+ new = self.complement().clone()
226
+ new.seq_array = new.seq_array[::-1].copy()
227
+ new.rev = not self.rev
228
+ return new
229
+
260
230
  def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
231
+ idx = None
261
232
  if isinstance(key, int):
262
- pos_idx = np.where(self.seq_array["index"] == key)[0]
263
- if pos_idx.size == 0:
264
- raise IndexError(f"Position {key} not found in sequence.")
265
- return self.seq_array[pos_idx[0]]
266
- elif isinstance(key, slice):
267
- start, stop = key.start, key.stop
268
- if start is None:
269
- start = self.seq_array["index"].min()
270
- if stop is None:
271
- stop = self.seq_array["index"].max()
272
- return self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= stop)]
273
- else:
274
- raise TypeError("Indexing must be an integer or a slice.")
275
-
276
- def complement(self) -> "SeqMat":
277
- comp_dict = {b"A": b"T", b"T": b"A", b"C": b"G", b"G": b"C", b"-": b"-", b"N": b"N"}
278
- comp_seq = np.array([comp_dict[nt] for nt in self.seq_array["nt"]], dtype="S1")
279
- new_instance = self.clone()
280
- new_instance.seq_array["nt"] = comp_seq
281
- return new_instance
282
-
283
- def reverse_complement(self) -> "SeqMat":
284
- rev_comp_seq = self.complement().seq_array[::-1]
285
- self.seq_array = rev_comp_seq.copy()
286
- self.rev = not self.rev
287
- return self
288
-
289
- # def splice_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
290
- # """
291
- # Splices out regions from the sequence corresponding to the given intron boundaries.
292
- #
293
- # Args:
294
- # introns (List[Tuple[int, int]]): List of (start, end) intron boundaries to remove.
295
- # Coordinates should match the 'index' field.
296
- #
297
- # Returns:
298
- # SeqMat: A new instance with the intron regions removed.
299
- # """
300
- # mask = np.ones(len(self.seq_array), dtype=bool)
301
- #
302
- # for start, end in introns:
303
- # mask &= ~((self.seq_array["index"] >= start) & (self.seq_array["index"] <= end))
304
- #
305
- # new_instance = self.clone()
306
- # new_instance.seq_array = self.seq_array[mask].copy()
307
- # return new_instance
308
-
309
- def cut_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
310
- """
311
- Splices out regions from the sequence corresponding to the given intron boundaries.
312
-
313
- Handles reverse-complemented sequences by interpreting introns in reverse as well.
314
-
315
- Args:
316
- introns (List[Tuple[int, int]]): List of (start, end) intron boundaries.
317
- These are always genomic (absolute) coordinates,
318
- regardless of strand direction.
319
-
320
- Returns:
321
- SeqMat: A new instance with the intron regions removed.
322
- """
323
- # In reverse orientation, flip intron direction for comparison
324
- if self.rev:
325
- introns = [(end, start) if start > end else (start, end) for (start, end) in introns]
326
-
327
- mask = np.ones(len(self.seq_array), dtype=bool)
328
-
329
- for start, end in introns:
330
- lo, hi = min(start, end) + 1, max(start, end) - 1
331
- mask &= ~((self.seq_array["index"] >= lo) & (self.seq_array["index"] <= hi))
332
-
333
- new_instance = self.clone()
334
- new_instance.seq_array = self.seq_array[mask].copy()
335
- return new_instance
336
-
337
- def open_reading_frame(self, tis: int) -> "SeqMat":
338
- """
339
- Extracts the open reading frame starting from the translation initiation site (TIS)
340
- until the first in-frame stop codon.
341
-
342
- Args:
343
- tis (int): Genomic position of the translation initiation site (start codon).
344
-
345
- Returns:
346
- SeqMat: A new SeqMat instance containing the ORF (from TIS to stop codon inclusive).
347
- """
348
- if tis not in self.seq_array["index"]:
349
- print(f"Warning: TIS position {tis} not found, returning default.")
350
- return self.clone(start=0, end=3)
351
-
352
- # Extract nucleotide sequence and indices starting from TIS
353
- mask = self.seq_array["index"] >= tis if not self.rev else self.seq_array["index"] <= tis
354
- coding_part = self.seq_array[mask]
355
- coding_seq = coding_part["nt"].tobytes().decode()
356
-
357
- # Read codons in-frame
358
- for i in range(0, len(coding_seq) - 2, 3):
359
- codon = coding_seq[i:i + 3]
360
- if codon in {"TAA", "TAG", "TGA"}:
361
- # Determine index range for this ORF
362
- start = coding_part["index"][0]
363
- stop = coding_part["index"][i + 2]
364
- lo, hi = sorted((start, stop))
365
- return self.clone(start=lo, end=hi)
366
-
367
- raise ValueError("No in-frame stop codon found after the TIS.")
368
-
369
- def predict_splicing(self, position: int, engine='spliceai', context=7500, inplace=False): #, reference_donors=None, reference_acceptors=None) -> pd.DataFrame:
370
- """
371
- Predict splicing probabilities at a given position using the specified engine.
372
-
373
- Args:
374
- position (int): The genomic position to predict splicing probabilities for.
375
- engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
376
- context (int): The length of the target central region (default: 7500).
377
- format (str): Output format for the splicing engine results.
378
-
379
- Returns:
380
- pd.DataFrame: A DataFrame containing:
381
- - position: The genomic position
382
- - donor_prob: Probability of being a donor splice site
383
- - acceptor_prob: Probability of being an acceptor splice site
384
- - nucleotides: The nucleotide sequence at that position
385
-
386
- Raises:
387
- ValueError: If an unsupported engine is provided.
388
- IndexError: If the position is not found in the sequence.
389
- """
390
- # Retrieve extended context (includes flanks) around the position.
391
- # seq, indices = self.get_context(position, context=context, padding='N')
392
- target = self.clone(position - context, position + context)
393
- # print(len(target.seq))
394
- seq, indices = target.seq, target.index
395
- # print(len(seq))
396
- # rel_pos = np.where(indices == position)[0][0]
397
- # print(rel_pos)
398
- rel_pos = np.abs(indices - position).argmin()
399
- # print(rel_pos, len(seq))
400
- left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
401
- # print(left_missing, right_missing)
402
- if left_missing > 0 or right_missing > 0:
403
- step = -1 if self.rev else 1
404
-
405
- if left_missing > 0:
406
- left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
407
- else:
408
- left_pad = np.array([], dtype=indices.dtype)
409
-
410
- if right_missing > 0:
411
- right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
412
- else:
413
- right_pad = np.array([], dtype=indices.dtype)
414
-
415
- seq = 'N' * left_missing + seq + 'N' * right_missing
416
- indices = np.concatenate([left_pad, indices, right_pad])
417
-
418
- # Run the splicing prediction engine (function assumed to be defined externally)
419
- from .splicing_utils import run_splicing_engine
420
- donor_probs, acceptor_probs = run_splicing_engine(seq, engine)
421
- # Trim off the fixed flanks before returning results.
422
- seq = seq[5000:-5000]
423
- indices = indices[5000:-5000]
424
- df = pd.DataFrame({
425
- 'position': indices,
426
- 'donor_prob': donor_probs,
427
- 'acceptor_prob': acceptor_probs,
428
- 'nucleotides': list(seq)
429
- }).set_index('position').round(3)
430
- # if reference_donors is not None:
431
- # df['ref_donor'] = df.index.isin(reference_donors).astype(int)
432
- # if reference_acceptors is not None:
433
- # df['ref_acceptor'] = df.index.isin(reference_acceptors).astype(int)
434
-
435
- df.attrs['name'] = self.name
436
- if inplace:
437
- self.predicted_splicing = df
438
- return self
439
- else:
440
- return df
441
-
233
+ idx = self._pos_to_idx.get(float(key))
234
+ if idx is None:
235
+ raise KeyError(f"Position {key} not found.")
236
+ return self.seq_array[idx]
237
+ if isinstance(key, slice):
238
+ start = key.start or self.min_index
239
+ stop = key.stop or self.max_index
240
+ mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= stop)
241
+ return self.seq_array[mask]
242
+ raise TypeError("Invalid index type.")
@@ -0,0 +1,441 @@
1
+ __all__ = ['SeqMat', 'format_mut_id']
2
+
3
+ from dataclasses import dataclass, field
4
+ from collections import defaultdict
5
+ from typing import Optional, Union, List, Tuple
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ def format_mut_id(text):
10
+ import re
11
+ # text = "TP53:17:7579472:G:A"
12
+
13
+ pattern = r'^[^:]+:[^:]+:(\d+):([ACGTN\-]+):([ACGTN\-]+)$'
14
+ match = re.match(pattern, text)
15
+
16
+ if match:
17
+ position = int(match.group(1))
18
+ ref = match.group(2)
19
+ alt = match.group(3)
20
+ return {'pos': position, 'ref': ref, 'alt': alt}
21
+
22
+ # print(f"Position: {position}, Ref: {ref}, Alt: {alt}")
23
+ else:
24
+ print("No match")
25
+ return None
26
+
27
+
28
+ @dataclass(slots=True)
29
+ class SeqMat:
30
+ """Represents a genomic sequence matrix used for training."""
31
+ # Metadata fields (uncomment and/or extend as needed)
32
+ name: str = field(default="Unnamed Sequence", metadata={"description": "Name of the sequence"})
33
+ version: str = field(default="1.0", metadata={"description": "Version of the dataset"})
34
+ source: str = field(default="Unknown", metadata={"description": "Source of the sequence data"})
35
+ notes: dict = field(default_factory=dict, metadata={"description": "User-defined metadata dictionary"})
36
+
37
+ seq_array: np.ndarray = field(init=False, repr=False)
38
+ insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
39
+ rev: bool = field(default=False, init=False, repr=False)
40
+
41
+ predicted_splicing: pd.DataFrame = field(init=False, repr=False)
42
+ _pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
43
+
44
+ def __init__(
45
+ self,
46
+ nucleotides: str,
47
+ index: np.ndarray,
48
+ conservation: Optional[np.ndarray] = None,
49
+ reference_nucleotides: Optional[np.ndarray] = None,
50
+ notes: Optional[dict] = None,
51
+ source: Optional[str] = None,
52
+ rev: Optional[bool] = False,
53
+ name: Optional[str] = 'wild_type',
54
+ version: Optional[str] = 'none'
55
+
56
+ ) -> None:
57
+ self.predicted_splicing = None
58
+ nucleotides = np.array(list(nucleotides))
59
+ L = nucleotides.shape[0]
60
+ if index.shape[0] != L:
61
+ raise ValueError("Indices array length must match nucleotide sequence length.")
62
+ if conservation is not None and conservation.shape[0] != L:
63
+ raise ValueError("Conservation vector length must match sequence length.")
64
+ if reference_nucleotides is not None and reference_nucleotides.shape[0] != L:
65
+ raise ValueError("Reference nucleotide vector length must match sequence length.")
66
+
67
+ dtype = np.dtype([
68
+ ("nt", "S1"),
69
+ ("index", np.float64),
70
+ ("ref", "S1"),
71
+ ("cons", np.float32),
72
+ ("valid_mask", bool),
73
+ ])
74
+
75
+ self.seq_array = np.empty(L, dtype=dtype)
76
+ self.seq_array["nt"] = nucleotides
77
+ # Use provided reference nucleotides if available.
78
+ self.seq_array["ref"] = nucleotides if reference_nucleotides is None else reference_nucleotides
79
+ self.seq_array["index"] = index
80
+ self.seq_array["cons"] = np.nan if conservation is None else conservation
81
+ self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
82
+ self.insertion_counters = defaultdict(int)
83
+ self._pos_to_idx = {pos: i for i, pos in enumerate(self.seq_array["index"])}
84
+
85
+ self.source = source if source is not None else "Unknown"
86
+ self.notes = notes if notes is not None else {}
87
+ self.name = name
88
+ self.rev = rev
89
+ self.version = version
90
+
91
+ def __len__(self) -> int:
92
+ return int(self.seq_array["valid_mask"].sum())
93
+
94
+ def __repr__(self):
95
+ return f"<SeqMat: {self.seq}>"
96
+
97
+ def __str__(self):
98
+ return self.seq
99
+
100
+ def get_metadata(self) -> dict:
101
+ """Retrieve all metadata as a dictionary."""
102
+ return {
103
+ "name": self.name,
104
+ "source": self.source,
105
+ "version": self.version,
106
+ "notes": self.notes
107
+ }
108
+
109
+ @property
110
+ def seq(self) -> str:
111
+ return self.seq_array["nt"][self.seq_array["valid_mask"]].tobytes().decode()
112
+
113
+ @property
114
+ def index(self) -> np.ndarray:
115
+ return self.seq_array["index"][self.seq_array["valid_mask"]]
116
+
117
+ @property
118
+ def conservation(self) -> np.ndarray:
119
+ return self.seq_array["cons"][self.seq_array["valid_mask"]]
120
+
121
+ @property
122
+ def max_index(self) -> float:
123
+ return self.seq_array["index"].max()
124
+
125
+ @property
126
+ def min_index(self) -> float:
127
+ return self.seq_array["index"].min()
128
+
129
+ @property
130
+ def start(self) -> float:
131
+ return self.min_index
132
+
133
+ @property
134
+ def end(self) -> float:
135
+ return self.max_index
136
+
137
+ @property
138
+ def mutated_positions(self) -> np.ndarray:
139
+ return (self.seq_array["ref"] != self.seq_array["nt"])[self.seq_array["valid_mask"]].astype(int)
140
+
141
+ def clone(self, start: Optional[int] = None, end: Optional[int] = None) -> "SeqMat":
142
+ cloned = SeqMat.__new__(SeqMat)
143
+ if start is not None and end is not None:
144
+ cloned.seq_array = self.seq_array[(self.seq_array["index"] >= start) & (self.seq_array["index"] <= end)]
145
+ else:
146
+ cloned.seq_array = self.seq_array.copy()
147
+ cloned.insertion_counters = defaultdict(int)
148
+ cloned.name = self.name
149
+ cloned.source = self.source
150
+ cloned.version = self.version
151
+ cloned.notes = self.notes.copy()
152
+ cloned.rev = self.rev
153
+
154
+ cloned._pos_to_idx = {pos: i for i, pos in enumerate(cloned.seq_array["index"])}
155
+
156
+ return cloned
157
+
158
def apply_mutation(self, pos: int, ref: str, alt: str, only_snps: bool = False):
    """
    Apply a mutation (SNP, substitution, insertion, or deletion) to the sequence in place.

    Parameters:
        pos (int): The reference position where the mutation starts.
        ref (str): The reference allele (use '-' for insertions).
        alt (str): The alternate allele (use '-' for deletions).
        only_snps (bool): If True, insertions and deletions are skipped and the
            sequence is returned unchanged.

    Returns:
        SeqMat: ``self``, after the mutation has been applied.

    Raises:
        ValueError: If the alleles are identical or otherwise describe no supported
            change, if a substitution has alleles of different lengths, if ``pos`` is
            not present in the index (substitutions and deletions), if the affected
            range runs past the end of the sequence, or if ``ref`` does not match the
            stored reference bases.

    The alleles are first normalized by dropping any shared prefix (``pos`` advances by
    one for every dropped base); an allele that becomes empty is replaced by '-'. Then:
        - both alleles non-gap -> substitution written into the "nt" field;
        - ref == '-'           -> insertion of new rows with fractional indices
                                  ``pos + k * 1e-6`` tracked by ``insertion_counters``;
        - alt == '-'           -> deletion of the corresponding rows.

    A reverse-oriented sequence (``self.rev``) is flipped for the edit and flipped back
    before the method exits, including on the ``only_snps`` early return and on errors.
    """
    restore_orientation = bool(self.rev)
    if restore_orientation:
        self.reverse_complement()

    def locate(position):
        # searchsorted only yields an insertion point; confirm the position really exists
        # so a missing coordinate cannot silently edit its neighbour.
        idx = int(np.searchsorted(self.seq_array["index"], position))
        if idx >= len(self.seq_array) or self.seq_array["index"][idx] != position:
            raise ValueError(f"Position {position} not found in index")
        return idx

    try:
        # Normalize shared prefix. Stop as soon as either allele is the gap marker:
        # identical alleles would otherwise collapse to '-'/'-' and never terminate.
        # NOTE(review): a VCF-style insertion such as A>AT at p ends up anchored at p + 1
        # after this step, so the inserted rows sort after p + 1 - confirm that anchor.
        while ref and alt and ref != "-" and alt != "-" and ref[0] == alt[0]:
            pos += 1
            ref = ref[1:] or "-"
            alt = alt[1:] or "-"

        # Case 1: SNP or multi-base substitution
        if ref != "-" and alt != "-":
            if len(ref) != len(alt):
                raise ValueError("Substitution mutations must have alleles of equal length.")

            pos_idx = locate(pos)
            end_idx = pos_idx + len(ref)
            if end_idx > len(self.seq_array):
                raise ValueError(f"Substitution range exceeds sequence length at position {pos}.")

            ref_segment = self.seq_array["ref"][pos_idx:end_idx]
            if not np.all(ref_segment == np.frombuffer(ref.encode(), dtype='S1')):
                actual_str = ref_segment.tobytes().decode()
                raise ValueError(f"Reference mismatch at position {pos}: expected '{ref}', found '{actual_str}'")
            self.seq_array["nt"][pos_idx:end_idx] = np.frombuffer(alt.encode(), dtype='S1')

        # Case 2: Insertion (ref is '-' means nothing was present, and we need to add bases)
        elif ref == "-" and alt != "-":
            if only_snps:
                return self  # Skip if indels are not allowed.
            insertion_count = self.insertion_counters[pos]
            eps = 1e-6
            # Fractional indices keep inserted bases ordered right after `pos`.
            new_rows = np.array(
                [
                    (nt.encode(), pos + (insertion_count + i + 1) * eps, b"-", np.float32(np.nan), True)
                    for i, nt in enumerate(alt)
                ],
                dtype=self.seq_array.dtype,
            )
            new_seq_array = np.concatenate([self.seq_array, new_rows])
            new_seq_array.sort(order="index")
            self.seq_array = new_seq_array
            self.insertion_counters[pos] += len(alt)

        # Case 3: Deletion (alt is '-' means bases are to be removed)
        elif alt == "-" and ref != "-":
            if only_snps:
                return self  # Skip if indels are not allowed.
            pos_idx = locate(pos)
            end_idx = pos_idx + len(ref)
            if end_idx > len(self.seq_array):
                raise ValueError(f"Deletion range exceeds sequence length at position {pos}.")
            segment = self.seq_array["ref"][pos_idx:end_idx].tobytes().decode()
            if segment != ref:
                raise ValueError(
                    f"Reference mismatch for deletion at position {pos}: expected '{ref}', found '{segment}'")
            self.seq_array = np.delete(self.seq_array, np.s_[pos_idx:end_idx])
        else:
            raise ValueError("Unsupported mutation type. Provide valid ref and alt values.")

        self.seq_array["valid_mask"] = self.seq_array["nt"] != b"-"
    finally:
        # Always hand the sequence back in the orientation it arrived in.
        if restore_orientation:
            self.reverse_complement()

    return self
259
+
260
def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
    """
    Look up rows of the sequence array by position rather than by array offset.

    Args:
        key: Either an integer position (Python ``int`` or NumPy integer scalar) or a
            slice of positions. For a slice both ends are inclusive, a missing
            start/stop defaults to the smallest/largest value of the "index" field,
            and the step is ignored.

    Returns:
        The first row whose "index" equals ``key`` for an integer key, or the
        sub-array of rows whose "index" lies in ``[start, stop]`` for a slice.

    Raises:
        IndexError: If an integer position is not present in the sequence.
        TypeError: If ``key`` is neither an integer nor a slice.
    """
    positions = self.seq_array["index"]

    # np.integer is accepted as well: positions read out of NumPy arrays are
    # np.int64 scalars, which are not instances of the builtin int.
    if isinstance(key, (int, np.integer)):
        pos_idx = np.where(positions == key)[0]
        if pos_idx.size == 0:
            raise IndexError(f"Position {key} not found in sequence.")
        return self.seq_array[pos_idx[0]]

    if isinstance(key, slice):
        start = positions.min() if key.start is None else key.start
        stop = positions.max() if key.stop is None else key.stop
        return self.seq_array[(positions >= start) & (positions <= stop)]

    raise TypeError("Indexing must be an integer or a slice.")
275
+
276
def complement(self) -> "SeqMat":
    """
    Return a clone whose "nt" field holds the complementary bases.

    A<->T and C<->G are swapped; '-' and 'N' map to themselves. Any other symbol
    raises KeyError. The receiver itself is left unmodified.
    """
    bases = (b"A", b"T", b"C", b"G", b"-", b"N")
    partners = (b"T", b"A", b"G", b"C", b"-", b"N")
    pairing = dict(zip(bases, partners))

    # Translate first so an unknown symbol fails before any clone is made.
    swapped = np.array(list(map(pairing.__getitem__, self.seq_array["nt"])), dtype="S1")

    result = self.clone()
    result.seq_array["nt"] = swapped
    return result
282
+
283
def reverse_complement(self) -> "SeqMat":
    """
    Reverse-complement this sequence in place and toggle its orientation flag.

    The rows of the complemented copy are stored in reversed order on ``self`` and
    ``self.rev`` is negated.

    Returns:
        SeqMat: ``self``, to allow call chaining.
    """
    # NOTE(review): only seq_array and rev are updated here; any cached
    # position lookup (e.g. _pos_to_idx) is not refreshed - confirm that is fine.
    complemented = self.complement()
    reversed_rows = complemented.seq_array[::-1]
    self.seq_array = reversed_rows.copy()
    self.rev = not self.rev
    return self
288
+
289
+ # def splice_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
290
+ # """
291
+ # Splices out regions from the sequence corresponding to the given intron boundaries.
292
+ #
293
+ # Args:
294
+ # introns (List[Tuple[int, int]]): List of (start, end) intron boundaries to remove.
295
+ # Coordinates should match the 'index' field.
296
+ #
297
+ # Returns:
298
+ # SeqMat: A new instance with the intron regions removed.
299
+ # """
300
+ # mask = np.ones(len(self.seq_array), dtype=bool)
301
+ #
302
+ # for start, end in introns:
303
+ # mask &= ~((self.seq_array["index"] >= start) & (self.seq_array["index"] <= end))
304
+ #
305
+ # new_instance = self.clone()
306
+ # new_instance.seq_array = self.seq_array[mask].copy()
307
+ # return new_instance
308
+
309
def cut_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
    """
    Return a copy of the sequence with the interior of each given region removed.

    Args:
        introns (List[Tuple[int, int]]): List of (start, end) boundaries expressed in
            the coordinates of the "index" field. The two values of a pair may be
            given in either order, so the same list works for forward and
            reverse-oriented sequences. The boundary positions themselves are kept;
            only positions strictly between them are removed.

    Returns:
        SeqMat: A new instance (made via ``clone``) without the removed rows. ``self``
        is not modified.
    """
    positions = self.seq_array["index"]
    keep = np.ones(len(positions), dtype=bool)

    for start, end in introns:
        # min/max makes the pair order irrelevant; +1/-1 preserves both boundaries.
        lo, hi = min(start, end) + 1, max(start, end) - 1
        keep &= ~((positions >= lo) & (positions <= hi))

    new_instance = self.clone()
    new_instance.seq_array = self.seq_array[keep].copy()
    # clone() built its position lookup for the full-length array; rebuild it so
    # it matches the shortened array instead of pointing at stale offsets.
    new_instance._pos_to_idx = {pos: i for i, pos in enumerate(new_instance.seq_array["index"])}
    return new_instance
336
+
337
def open_reading_frame(self, tis: int) -> "SeqMat":
    """
    Extract the open reading frame that starts at the translation initiation site
    (TIS) and ends with the first in-frame stop codon (TAA, TAG or TGA).

    Args:
        tis (int): Position of the start codon, in the coordinates of the "index" field.

    Returns:
        SeqMat: ``self.clone(start=lo, end=hi)`` spanning the TIS through the last
        base of the stop codon. If ``tis`` is not present in the index, a warning is
        printed and ``self.clone(start=0, end=3)`` is returned instead.

    Raises:
        ValueError: If no in-frame stop codon follows the TIS.
    """
    positions = self.seq_array["index"]
    if tis not in positions:
        print(f"Warning: TIS position {tis} not found, returning default.")
        return self.clone(start=0, end=3)

    # Rows from the TIS onward: coordinates fall along a reverse-oriented
    # sequence and rise along a forward one.
    from_tis = positions <= tis if self.rev else positions >= tis
    coding_part = self.seq_array[from_tis]
    coding_seq = coding_part["nt"].tobytes().decode()

    stop_codons = ("TAA", "TAG", "TGA")
    # Offset of the first in-frame stop codon, or None when the frame never closes.
    stop_offset = next(
        (i for i in range(0, len(coding_seq) - 2, 3) if coding_seq[i:i + 3] in stop_codons),
        None,
    )
    if stop_offset is None:
        raise ValueError("No in-frame stop codon found after the TIS.")

    first = coding_part["index"][0]
    last = coding_part["index"][stop_offset + 2]
    # clone() takes an ascending range, whichever way the sequence is oriented.
    lo, hi = (last, first) if last < first else (first, last)
    return self.clone(start=lo, end=hi)
368
+
369
def predict_splicing(self, position: int, engine='spliceai', context=7500, inplace=False):
    """
    Predict splicing probabilities around a position using the specified engine.

    A window of ``context`` positions on each side of ``position`` is cloned from the
    sequence. If the window is shorter than that on either side, it is padded with 'N'
    bases and extrapolated positions. The engine is run on the padded window and the
    outer 5000 entries on each side are dropped from the reported positions.

    Args:
        position (int): The position to centre the prediction window on. The window
            entry closest to this value is used as the centre.
        engine (str): Engine name forwarded to ``run_splicing_engine``
            (presumably 'spliceai' or 'pangolin' - confirm against that helper).
        context (int): Number of positions requested on each side of ``position``
            (default: 7500).
        inplace (bool): If True, store the result on ``self.predicted_splicing`` and
            return ``self``; otherwise return the DataFrame.

    Returns:
        pd.DataFrame or SeqMat: With ``inplace=False``, a DataFrame indexed by
        ``position`` with columns ``donor_prob``, ``acceptor_prob`` and
        ``nucleotides``, rounded to 3 decimals, with ``attrs['name']`` set to
        ``self.name``. With ``inplace=True``, ``self``.

    Raises:
        ValueError: Raised by NumPy if the cloned window is empty; presumably also by
            ``run_splicing_engine`` for an unsupported engine (confirm).
    """
    # Retrieve extended context (includes flanks) around the position.
    target = self.clone(position - context, position + context)
    seq, indices = target.seq, target.index

    # Offset of the window entry nearest to the requested position.
    rel_pos = np.abs(indices - position).argmin()

    # Bases available left of the centre: rel_pos. Right of it: len(seq) - rel_pos - 1
    # (the centre itself is not part of the right flank).
    left_missing = max(0, context - rel_pos)
    right_missing = max(0, context - (len(seq) - rel_pos - 1))

    if left_missing > 0 or right_missing > 0:
        # Coordinates decrease along a reverse-oriented sequence.
        step = -1 if self.rev else 1

        if left_missing > 0:
            left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
        else:
            left_pad = np.array([], dtype=indices.dtype)

        if right_missing > 0:
            right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
        else:
            right_pad = np.array([], dtype=indices.dtype)

        seq = 'N' * left_missing + seq + 'N' * right_missing
        indices = np.concatenate([left_pad, indices, right_pad])

    # Run the splicing prediction engine (defined in the sibling splicing_utils module)
    from .splicing_utils import run_splicing_engine
    donor_probs, acceptor_probs = run_splicing_engine(seq, engine)

    # Trim off the fixed flanks before returning results.
    # NOTE(review): assumes the engine returns one value per position of the
    # trimmed window (len(seq) - 10000) - confirm against run_splicing_engine.
    seq = seq[5000:-5000]
    indices = indices[5000:-5000]
    df = pd.DataFrame({
        'position': indices,
        'donor_prob': donor_probs,
        'acceptor_prob': acceptor_probs,
        'nucleotides': list(seq)
    }).set_index('position').round(3)

    df.attrs['name'] = self.name
    if inplace:
        self.predicted_splicing = df
        return self
    else:
        return df
441
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.4.15
3
+ Version: 1.4.17
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -3,15 +3,15 @@ geney/Gene.py,sha256=6x1sEZV50Il4oydegW6iHIF12EZTGexniG3YUD-3DfM,7036
3
3
  geney/Oncosplice.py,sha256=ETAvMl_Oq6mEJQHPNwdDO5csX6Ahuped_om10KifCyM,17739
4
4
  geney/SeqMats.py,sha256=9-eJnfU2w3LGc0XvVvFEO_QrBneTkC6xkZKDfTcEw5o,19282
5
5
  geney/SpliceSimulator.py,sha256=iF6feVeSnsKFmn3WV60CgWLI0_rSLgpq5fVFL1IOv_4,18491
6
- geney/Transcript.py,sha256=Ltlcnp93s3HxMiweUuyc4Ri3QT42l1qUtiBYH3RITFs,14464
6
+ geney/Transcript.py,sha256=_DhKQ-UnyFDPb4Cu-8sQPWvLd-kKj4ZEJq6KBntFVGE,14467
7
7
  geney/__init__.py,sha256=YLWXJS53yeryp6nVhCgFg3_Du9Guj9y3iSrdfx61q5Y,3017
8
8
  geney/_config_setup.py,sha256=nblcGU3HIt8YjdrAoGfbEVKRxwJKv0PikJ5-7AL6axQ,723
9
9
  geney/_graphic_utils.py,sha256=oMsBpB9YeEn96gGpKh4MmtagJffWZbk-xPrIwHvkFhA,11016
10
10
  geney/_gtex_utils.py,sha256=asL2lHyU5KsbWpV096vkf1Ka7hSo_RRfZqw7p5nERmE,1919
11
11
  geney/_immune_utils.py,sha256=b-8dRcCti7xsU7RG3op18lkSnAD8dp_BymGaR-hbNcI,5272
12
12
  geney/_mutation_utils.py,sha256=dHssUsnii_mf-wuRoMmF13UlD7k3ml_VwQMItTYnXpU,1132
13
- geney/_oncosplice.py,sha256=UkGPJqHSKK_XVsDp-03Baa3ks5ePb_1f1EB0wbkKrDo,35527
14
- geney/_splicing_utils.py,sha256=Zda6MD0e81p46_y6A240W97d1TP4dakLhG2WT0kSN5U,31473
13
+ geney/_oncosplice.py,sha256=qrIqo3HAZAnzhtIGgv7EnwCE5YkdTFSwWoiSZLBzCpg,35530
14
+ geney/_splicing_utils.py,sha256=7j5YC9CrWWFfct8hDdXyxYIqRtCPI4TqxA1cyAMhyy8,31476
15
15
  geney/_survival_utils.py,sha256=KnAzEviMuXh6SnVXId9PgsFLSbgkduTvYoIthxN7FPA,6886
16
16
  geney/_tcga_utils.py,sha256=uJhVnTbTysj0XrEw_YeDKRSLexsqgBLYQdhl7_hnr64,17611
17
17
  geney/_tis_utils.py,sha256=la0CZroaKe5RgAyFd4Bf_DqQncklWgAY2823xVst98o,7813
@@ -37,7 +37,8 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
37
37
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
38
38
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
39
39
  geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
40
- geney/utils/SeqMats.py,sha256=1EgYord9ieaAd1pfAbleBxBnhnubgsNEY9Y40-kVNVo,18502
40
+ geney/utils/SeqMats.py,sha256=-Wz-0Bsflf1lBoGF8RS7JjOshzHO2XCZauqcNP4PONw,8766
41
+ geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
41
42
  geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
42
43
  geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
43
44
  geney/utils/mutation_utils.py,sha256=r-pHr56gEa5kh_DPX8MjFY3ZfYaOtyo4CUfJ5ZHlXPw,3243
@@ -45,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
45
46
  geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
46
47
  geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
47
48
  geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
48
- geney-1.4.15.dist-info/METADATA,sha256=6gwA20Ma1aqa3dWNObVhc9fZWQlvsCMzqbKswUl6fTQ,990
49
- geney-1.4.15.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
50
- geney-1.4.15.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
51
- geney-1.4.15.dist-info/RECORD,,
49
+ geney-1.4.17.dist-info/METADATA,sha256=nlFX-qvZaLauG5S-SBKlM8tRausGYES8Xmrg0XSxtR4,990
50
+ geney-1.4.17.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
51
+ geney-1.4.17.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
52
+ geney-1.4.17.dist-info/RECORD,,
File without changes