geney 1.4.21__py2.py3-none-any.whl → 1.4.23__py2.py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/utils/SeqMats.py CHANGED
@@ -89,6 +89,25 @@ class SeqMat:
89
89
 
90
90
  self.insertion_counters = defaultdict(int)
91
91
 
92
+
93
+ def __len__(self) -> int:
94
+ return int(self.seq_array["valid_mask"].sum())
95
+
96
+ def __repr__(self):
97
+ return f"<SeqMat: {self.seq}>"
98
+
99
+ def __str__(self):
100
+ return self.seq
101
+
102
+ def get_metadata(self) -> dict:
103
+ """Retrieve all metadata as a dictionary."""
104
+ return {
105
+ "name": self.name,
106
+ "source": self.source,
107
+ "version": self.version,
108
+ "notes": self.notes
109
+ }
110
+
92
111
  @property
93
112
  def seq(self) -> str:
94
113
  return self.seq_array['nt'][self.seq_array['valid_mask']].tobytes().decode()
@@ -101,6 +120,26 @@ class SeqMat:
101
120
  def conservation(self) -> np.ndarray:
102
121
  return self.seq_array['cons'][self.seq_array['valid_mask']]
103
122
 
123
+ @property
124
+ def max_index(self) -> float:
125
+ return self.seq_array["index"].max()
126
+
127
+ @property
128
+ def min_index(self) -> float:
129
+ return self.seq_array["index"].min()
130
+
131
+ @property
132
+ def start(self) -> float:
133
+ return self.min_index
134
+
135
+ @property
136
+ def end(self) -> float:
137
+ return self.max_index
138
+
139
+ @property
140
+ def mutated_positions(self) -> np.ndarray:
141
+ return (self.seq_array["ref"] != self.seq_array["nt"])[self.seq_array["valid_mask"]].astype(int)
142
+
104
143
  def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
105
144
  new = SeqMat.__new__(SeqMat)
106
145
  new.name = self.name
@@ -120,37 +159,66 @@ class SeqMat:
120
159
  new.seq_array['valid_mask'] = new.seq_array['nt'] != b'-'
121
160
  return new
122
161
 
162
+ # def apply_mutations(
163
+ # self,
164
+ # mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
165
+ # only_snps: bool = False
166
+ # ) -> SeqMat:
167
+
123
168
  def apply_mutations(
124
169
  self,
125
- mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
170
+ mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]] = None,
171
+ *,
172
+ pos: Optional[float] = None,
173
+ ref: Optional[str] = None,
174
+ alt: Optional[str] = None,
126
175
  only_snps: bool = False
127
- ) -> SeqMat:
176
+ ) -> SeqMat:
128
177
  """
129
178
  Apply one or a batch of mutations (pos, ref, alt) efficiently:
130
179
  - Supports a single tuple or a list of tuples
131
180
  - Assumes mutations sorted by position for vectorized searchsorted
132
181
  """
133
182
  # Normalize to list
134
- if isinstance(mutations, tuple) and len(mutations) == 3:
183
+ # if isinstance(mutations, tuple) and len(mutations) == 3:
184
+ # mutations = [mutations]
185
+ # elif not isinstance(mutations, list):
186
+ # raise TypeError("mutations must be a tuple or list of tuples")
187
+ # Input normalization
188
+ if mutations is None:
189
+ if pos is None or ref is None or alt is None:
190
+ raise ValueError("Either `mutations` or `pos, ref, alt` must be provided")
191
+ mutations = [(pos, ref, alt)]
192
+ elif isinstance(mutations, tuple) and len(mutations) == 3:
135
193
  mutations = [mutations]
136
194
  elif not isinstance(mutations, list):
137
- raise TypeError("mutations must be a tuple or list of tuples")
138
-
139
- # Left-normalize and bucket
195
+ raise TypeError("`mutations` must be a tuple or list of tuples")
196
+
197
+ # # Left-normalize and bucket
198
+ # subs, ins, dels = [], [], []
199
+ # for pos, ref, alt in mutations:
200
+ # while ref and alt and ref[0] == alt[0]:
201
+ # pos += 1
202
+ # ref = ref[1:] or '-'
203
+ # alt = alt[1:] or '-'
204
+ # if ref != '-' and alt != '-':
205
+ # subs.append((pos, ref, alt))
206
+ # elif ref == '-' and alt != '-' and not only_snps:
207
+ # ins.append((pos, alt))
208
+ # elif alt == '-' and ref != '-' and not only_snps:
209
+ # dels.append((pos, ref))
210
+ # else:
211
+ # raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
212
+ # Bucket mutations
140
213
  subs, ins, dels = [], [], []
141
- for pos, ref, alt in mutations:
142
- while ref and alt and ref[0] == alt[0]:
143
- pos += 1
144
- ref = ref[1:] or '-'
145
- alt = alt[1:] or '-'
146
- if ref != '-' and alt != '-':
147
- subs.append((pos, ref, alt))
148
- elif ref == '-' and alt != '-' and not only_snps:
149
- ins.append((pos, alt))
150
- elif alt == '-' and ref != '-' and not only_snps:
151
- dels.append((pos, ref))
152
- else:
153
- raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
214
+ for p, r, a in mutations:
215
+ # left-normalize
216
+ while r and a and r[0] == a[0]:
217
+ p += 1; r = r[1:] or '-'; a = a[1:] or '-'
218
+ if r != '-' and a != '-': subs.append((p, r, a))
219
+ elif r == '-' and a != '-' and not only_snps: ins.append((p, a))
220
+ elif a == '-' and r != '-' and not only_snps: dels.append((p, r))
221
+ else: raise ValueError(f"Unsupported mutation {p}:{r}:{a}")
154
222
 
155
223
  # Ensure seq_array indices sorted
156
224
  coords = self.seq_array['index']
@@ -225,3 +293,137 @@ class SeqMat:
225
293
  mask = (coords >= start) & (coords <= stop)
226
294
  return self.seq_array[mask]
227
295
  raise TypeError("Invalid index type.")
296
+
297
+ def cut_out(self, introns: List[Tuple[int, int]]) -> "SeqMat":
298
+ """
299
+ Splices out regions from the sequence corresponding to the given intron boundaries.
300
+
301
+ Handles reverse-complemented sequences by interpreting introns in reverse as well.
302
+
303
+ Args:
304
+ introns (List[Tuple[int, int]]): List of (start, end) intron boundaries.
305
+ These are always genomic (absolute) coordinates,
306
+ regardless of strand direction.
307
+
308
+ Returns:
309
+ SeqMat: A new instance with the intron regions removed.
310
+ """
311
+ # In reverse orientation, flip intron direction for comparison
312
+ if self.rev:
313
+ introns = [(end, start) if start > end else (start, end) for (start, end) in introns]
314
+
315
+ mask = np.ones(len(self.seq_array), dtype=bool)
316
+
317
+ for start, end in introns:
318
+ lo, hi = min(start, end) + 1, max(start, end) - 1
319
+ mask &= ~((self.seq_array["index"] >= lo) & (self.seq_array["index"] <= hi))
320
+
321
+ new_instance = self.clone()
322
+ new_instance.seq_array = self.seq_array[mask].copy()
323
+ return new_instance
324
+
325
+ def open_reading_frame(self, tis: int) -> "SeqMat":
326
+ """
327
+ Extracts the open reading frame starting from the translation initiation site (TIS)
328
+ until the first in-frame stop codon.
329
+
330
+ Args:
331
+ tis (int): Genomic position of the translation initiation site (start codon).
332
+
333
+ Returns:
334
+ SeqMat: A new SeqMat instance containing the ORF (from TIS to stop codon inclusive).
335
+ """
336
+ if tis not in self.seq_array["index"]:
337
+ print(f"Warning: TIS position {tis} not found, returning default.")
338
+ return self.clone(start=0, end=3)
339
+
340
+ # Extract nucleotide sequence and indices starting from TIS
341
+ mask = self.seq_array["index"] >= tis if not self.rev else self.seq_array["index"] <= tis
342
+ coding_part = self.seq_array[mask]
343
+ coding_seq = coding_part["nt"].tobytes().decode()
344
+
345
+ # Read codons in-frame
346
+ for i in range(0, len(coding_seq) - 2, 3):
347
+ codon = coding_seq[i:i + 3]
348
+ if codon in {"TAA", "TAG", "TGA"}:
349
+ # Determine index range for this ORF
350
+ start = coding_part["index"][0]
351
+ stop = coding_part["index"][i + 2]
352
+ lo, hi = sorted((start, stop))
353
+ return self.clone(start=lo, end=hi)
354
+
355
+ raise ValueError("No in-frame stop codon found after the TIS.")
356
+
357
+ def predict_splicing(self, position: int, engine='spliceai', context=7500, inplace=False): #, reference_donors=None, reference_acceptors=None) -> pd.DataFrame:
358
+ """
359
+ Predict splicing probabilities at a given position using the specified engine.
360
+
361
+ Args:
362
+ position (int): The genomic position to predict splicing probabilities for.
363
+ engine (str): The prediction engine to use. Supported: 'spliceai', 'pangolin'.
364
+ context (int): The length of the target central region (default: 7500).
365
+ format (str): Output format for the splicing engine results.
366
+
367
+ Returns:
368
+ pd.DataFrame: A DataFrame containing:
369
+ - position: The genomic position
370
+ - donor_prob: Probability of being a donor splice site
371
+ - acceptor_prob: Probability of being an acceptor splice site
372
+ - nucleotides: The nucleotide sequence at that position
373
+
374
+ Raises:
375
+ ValueError: If an unsupported engine is provided.
376
+ IndexError: If the position is not found in the sequence.
377
+ """
378
+ # Retrieve extended context (includes flanks) around the position.
379
+ # seq, indices = self.get_context(position, context=context, padding='N')
380
+ target = self.clone(position - context, position + context)
381
+ # print(len(target.seq))
382
+ seq, indices = target.seq, target.index
383
+ # print(len(seq))
384
+ # rel_pos = np.where(indices == position)[0][0]
385
+ # print(rel_pos)
386
+ rel_pos = np.abs(indices - position).argmin()
387
+ # print(rel_pos, len(seq))
388
+ left_missing, right_missing = max(0, context - rel_pos), max(0, context - (len(seq) - rel_pos))
389
+ # print(left_missing, right_missing)
390
+ if left_missing > 0 or right_missing > 0:
391
+ step = -1 if self.rev else 1
392
+
393
+ if left_missing > 0:
394
+ left_pad = np.arange(indices[0] - step * left_missing, indices[0], step)
395
+ else:
396
+ left_pad = np.array([], dtype=indices.dtype)
397
+
398
+ if right_missing > 0:
399
+ right_pad = np.arange(indices[-1] + step, indices[-1] + step * (right_missing + 1), step)
400
+ else:
401
+ right_pad = np.array([], dtype=indices.dtype)
402
+
403
+ seq = 'N' * left_missing + seq + 'N' * right_missing
404
+ indices = np.concatenate([left_pad, indices, right_pad])
405
+
406
+ # Run the splicing prediction engine (function assumed to be defined externally)
407
+ from .splicing_utils import run_splicing_engine
408
+ donor_probs, acceptor_probs = run_splicing_engine(seq, engine)
409
+ # Trim off the fixed flanks before returning results.
410
+ seq = seq[5000:-5000]
411
+ indices = indices[5000:-5000]
412
+ df = pd.DataFrame({
413
+ 'position': indices,
414
+ 'donor_prob': donor_probs,
415
+ 'acceptor_prob': acceptor_probs,
416
+ 'nucleotides': list(seq)
417
+ }).set_index('position').round(3)
418
+ # if reference_donors is not None:
419
+ # df['ref_donor'] = df.index.isin(reference_donors).astype(int)
420
+ # if reference_acceptors is not None:
421
+ # df['ref_acceptor'] = df.index.isin(reference_acceptors).astype(int)
422
+
423
+ df.attrs['name'] = self.name
424
+ if inplace:
425
+ self.predicted_splicing = df
426
+ return self
427
+ else:
428
+ return df
429
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.4.21
3
+ Version: 1.4.23
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -37,7 +37,7 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
37
37
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
38
38
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
39
39
  geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
40
- geney/utils/SeqMats.py,sha256=q858gWPsSoS4HUr6FD1CHYuUh5AE5u9KePHYT7FQw7g,8777
40
+ geney/utils/SeqMats.py,sha256=4NJMXNDw6XQAaKVpNuEIft1Xa1sSxPSFwnIeMzjI3eE,17058
41
41
  geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
42
42
  geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
43
43
  geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
@@ -46,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
46
46
  geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
47
47
  geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
48
48
  geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
49
- geney-1.4.21.dist-info/METADATA,sha256=m-32P-otBh8Nj1P_rBrqgE6wYPEA04pcTV1tGxuTkWM,990
50
- geney-1.4.21.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
51
- geney-1.4.21.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
52
- geney-1.4.21.dist-info/RECORD,,
49
+ geney-1.4.23.dist-info/METADATA,sha256=cRWawSdfae-X2F-k7AFedHrQd_OhD2zBLjkA2zzNLrs,990
50
+ geney-1.4.23.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
51
+ geney-1.4.23.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
52
+ geney-1.4.23.dist-info/RECORD,,
File without changes