geney 1.4.17__py2.py3-none-any.whl → 1.4.19__py2.py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of geney might be problematic. Click here for more details.

geney/utils/SeqMats.py CHANGED
@@ -1,7 +1,8 @@
1
+ from __future__ import annotations
2
+
1
3
  __all__ = ['SeqMat', 'format_mut_id']
2
4
 
3
5
 
4
- from __future__ import annotations
5
6
  from dataclasses import dataclass, field
6
7
  from typing import List, Tuple, Union, Optional
7
8
  from collections import defaultdict
@@ -29,7 +30,6 @@ def format_mut_id(text):
29
30
  return None
30
31
 
31
32
 
32
-
33
33
  @dataclass(slots=True)
34
34
  class SeqMat:
35
35
  """Represents a genomic sequence matrix used for training."""
@@ -42,21 +42,20 @@ class SeqMat:
42
42
  insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
43
43
  rev: bool = field(default=False, init=False, repr=False)
44
44
  predicted_splicing: pd.DataFrame = field(init=False, repr=False)
45
- _pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
46
45
 
47
46
  def __init__(
48
- self,
49
- nucleotides: str,
50
- index: np.ndarray,
51
- conservation: Optional[np.ndarray] = None,
52
- reference_nucleotides: Optional[np.ndarray] = None,
53
- notes: Optional[dict] = None,
54
- source: Optional[str] = None,
55
- rev: Optional[bool] = False,
56
- name: Optional[str] = 'wild_type',
57
- version: Optional[str] = 'none'
47
+ self,
48
+ nucleotides: str,
49
+ index: np.ndarray,
50
+ conservation: Optional[np.ndarray] = None,
51
+ reference_nucleotides: Optional[np.ndarray] = None,
52
+ notes: Optional[dict] = None,
53
+ source: Optional[str] = None,
54
+ rev: Optional[bool] = False,
55
+ name: Optional[str] = 'wild_type',
56
+ version: Optional[str] = 'none'
58
57
  ) -> None:
59
- # Initialize metadata
58
+ # Metadata
60
59
  self.name = name
61
60
  self.version = version
62
61
  self.source = source or "Unknown"
@@ -85,19 +84,10 @@ class SeqMat:
85
84
  self.seq_array['nt'] = nts
86
85
  self.seq_array['ref'] = nts if reference_nucleotides is None else np.array(reference_nucleotides, dtype='S1')
87
86
  self.seq_array['index'] = index
88
- self.seq_array['cons'] = (np.zeros(L, dtype='f4') if conservation is None else conservation)
87
+ self.seq_array['cons'] = np.zeros(L, dtype='f4') if conservation is None else conservation
89
88
  self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
90
89
 
91
- # Initialize helpers
92
90
  self.insertion_counters = defaultdict(int)
93
- self._build_index_map()
94
-
95
- def _build_index_map(self):
96
- """Rebuild position-to-index lookup."""
97
- self._pos_to_idx = {float(pos): i for i, pos in enumerate(self.seq_array['index'])}
98
-
99
- def __len__(self) -> int:
100
- return int(self.seq_array['valid_mask'].sum())
101
91
 
102
92
  @property
103
93
  def seq(self) -> str:
@@ -113,7 +103,6 @@ class SeqMat:
113
103
 
114
104
  def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
115
105
  new = SeqMat.__new__(SeqMat)
116
- # copy metadata
117
106
  new.name = self.name
118
107
  new.version = self.version
119
108
  new.source = self.source
@@ -122,100 +111,96 @@ class SeqMat:
122
111
  new.predicted_splicing = None
123
112
  new.insertion_counters = defaultdict(int)
124
113
 
125
- # slice or full copy
126
114
  if start is not None and end is not None:
127
115
  mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= end)
128
116
  new.seq_array = self.seq_array[mask].copy()
129
117
  else:
130
118
  new.seq_array = self.seq_array.copy()
131
119
 
132
- new._build_index_map()
120
+ new.seq_array['valid_mask'] = new.seq_array['nt'] != b'-'
133
121
  return new
134
122
 
135
- def apply_mutation(self, pos: float, ref: str, alt: str, only_snps: bool = False) -> SeqMat:
136
- """Apply a single mutation to this SeqMat."""
137
- # reverse-complement context
138
- if self.rev:
139
- self.reverse_complement()
140
-
141
- # left-normalize
142
- while ref and alt and ref[0] == alt[0]:
143
- pos += 1
144
- ref = ref[1:] or '-'
145
- alt = alt[1:] or '-'
146
-
147
- # substitution
148
- if ref != '-' and alt != '-':
149
- if len(ref) != len(alt):
150
- raise ValueError("Substitution requires equal-length alleles.")
151
- idx = self._pos_to_idx.get(pos)
152
- if idx is None:
153
- raise KeyError(f"Position {pos} not found.")
154
- end = idx + len(ref)
155
- if end > len(self.seq_array):
156
- raise IndexError(f"Out of bounds at {pos}.")
157
- # verify reference
158
- ref_seg = self.seq_array['ref'][idx:end]
159
- if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
160
- raise ValueError(f"Ref mismatch at {pos}.")
161
- # assign alt
162
- self.seq_array['nt'][idx:end] = np.frombuffer(alt.encode(), dtype='S1')
163
-
164
- # insertion
165
- elif ref == '-' and alt != '-':
166
- if only_snps:
167
- return self
168
- idx = self._pos_to_idx.get(pos)
169
- if idx is None:
170
- raise KeyError(f"Position {pos} not found.")
171
- cnt = self.insertion_counters[pos]
172
- eps = 1e-6
123
+ def apply_mutations(
124
+ self,
125
+ mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
126
+ only_snps: bool = False
127
+ ) -> SeqMat:
128
+ """
129
+ Apply one or a batch of mutations (pos, ref, alt) efficiently:
130
+ - Supports a single tuple or a list of tuples
131
+ - Assumes mutations sorted by position for vectorized searchsorted
132
+ """
133
+ # Normalize to list
134
+ if isinstance(mutations, tuple) and len(mutations) == 3:
135
+ mutations = [mutations]
136
+ elif not isinstance(mutations, list):
137
+ raise TypeError("mutations must be a tuple or list of tuples")
138
+
139
+ # Left-normalize and bucket
140
+ subs, ins, dels = [], [], []
141
+ for pos, ref, alt in mutations:
142
+ while ref and alt and ref[0] == alt[0]:
143
+ pos += 1
144
+ ref = ref[1:] or '-'
145
+ alt = alt[1:] or '-'
146
+ if ref != '-' and alt != '-':
147
+ subs.append((pos, ref, alt))
148
+ elif ref == '-' and alt != '-' and not only_snps:
149
+ ins.append((pos, alt))
150
+ elif alt == '-' and ref != '-' and not only_snps:
151
+ dels.append((pos, ref))
152
+ else:
153
+ raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
154
+
155
+ # Ensure seq_array indices sorted
156
+ coords = self.seq_array['index']
157
+
158
+ # 1) Bulk substitutions
159
+ if subs:
160
+ subs.sort(key=lambda x: x[0])
161
+ positions = np.array([p for p, _, _ in subs], dtype=coords.dtype)
162
+ idxs = np.searchsorted(coords, positions)
163
+ for (pos, ref, alt), idx in zip(subs, idxs):
164
+ length = len(ref)
165
+ if not np.all(self.seq_array['ref'][idx:idx + length] == np.frombuffer(ref.encode(), dtype='S1')):
166
+ actual = self.seq_array['ref'][idx:idx + length].tobytes().decode()
167
+ raise ValueError(f"Ref mismatch at {pos}: expected {ref}, found {actual}")
168
+ self.seq_array['nt'][idx:idx + length] = np.frombuffer(alt.encode(), dtype='S1')
169
+
170
+ # 2) Bulk insertions
171
+ if ins:
172
+ ins.sort(key=lambda x: x[0])
173
+ positions = np.array([p for p, _ in ins], dtype=coords.dtype)
174
+ idxs = np.searchsorted(coords, positions)
173
175
  new_rows = []
174
- for i, nt in enumerate(alt):
175
- new_rows.append((nt.encode(),
176
- pos + (cnt + i + 1)*eps,
177
- b'-',
178
- np.nan,
179
- True))
180
- self._insert_rows(idx, new_rows)
181
- self.insertion_counters[pos] += len(alt)
182
-
183
- # deletion
184
- elif alt == '-' and ref != '-':
185
- if only_snps:
186
- return self
187
- idx = self._pos_to_idx.get(pos)
188
- if idx is None:
189
- raise KeyError(f"Position {pos} not found.")
190
- end = idx + len(ref)
191
- # verify
192
- ref_seg = self.seq_array['ref'][idx:end]
193
- if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
194
- raise ValueError(f"Ref mismatch at {pos}.")
195
- self.seq_array = np.delete(self.seq_array, np.s_[idx:end])
196
-
197
- else:
198
- raise ValueError("Unsupported mutation type.")
199
-
200
- # update mask & index map
176
+ for (pos, alt), idx in zip(ins, idxs):
177
+ cnt = self.insertion_counters[pos]
178
+ eps = 1e-6
179
+ for i, nt in enumerate(alt):
180
+ new_idx = pos + (cnt + i + 1) * eps
181
+ new_rows.append((nt.encode(), new_idx, b'-', np.nan, True))
182
+ self.insertion_counters[pos] += len(alt)
183
+ merged = np.concatenate([self.seq_array, np.array(new_rows, dtype=self.seq_array.dtype)])
184
+ merged.sort(order='index')
185
+ self.seq_array = merged
186
+
187
+ # 3) Bulk deletions
188
+ if dels:
189
+ dels.sort(key=lambda x: x[0])
190
+ positions = np.array([p for p, _ in dels], dtype=coords.dtype)
191
+ idxs = np.searchsorted(self.seq_array['index'], positions)
192
+ mask = np.ones(len(self.seq_array), dtype=bool)
193
+ for (pos, ref), idx in zip(dels, idxs):
194
+ length = len(ref)
195
+ mask[idx:idx + length] = False
196
+ self.seq_array = self.seq_array[mask]
197
+
198
+ # Finalize valid mask
201
199
  self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
202
- self._build_index_map()
203
-
204
- # restore orientation
205
- if self.rev:
206
- self.reverse_complement()
207
200
  return self
208
201
 
209
- def _insert_rows(self, idx: int, rows: List[tuple]):
210
- """Helper to insert new rows efficiently and resort."""
211
- arr = self.seq_array.tolist()
212
- arr[idx:idx] = rows
213
- new = np.array(arr, dtype=self.seq_array.dtype)
214
- new.sort(order='index')
215
- self.seq_array = new
216
-
217
202
  def complement(self) -> SeqMat:
218
- comp = {b'A':b'T', b'T':b'A', b'C':b'G', b'G':b'C', b'-':b'-'}
203
+ comp = {b'A': b'T', b'T': b'A', b'C': b'G', b'G': b'C', b'-': b'-'}
219
204
  nts = np.array([comp[x] for x in self.seq_array['nt']], dtype='S1')
220
205
  new = self.clone()
221
206
  new.seq_array['nt'] = nts
@@ -228,15 +213,15 @@ class SeqMat:
228
213
  return new
229
214
 
230
215
  def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
231
- idx = None
216
+ coords = self.seq_array['index']
232
217
  if isinstance(key, int):
233
- idx = self._pos_to_idx.get(float(key))
234
- if idx is None:
218
+ idx = np.searchsorted(coords, key)
219
+ if idx >= len(coords) or coords[idx] != key:
235
220
  raise KeyError(f"Position {key} not found.")
236
221
  return self.seq_array[idx]
237
222
  if isinstance(key, slice):
238
- start = key.start or self.min_index
239
- stop = key.stop or self.max_index
240
- mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= stop)
223
+ start = key.start or coords.min()
224
+ stop = key.stop or coords.max()
225
+ mask = (coords >= start) & (coords <= stop)
241
226
  return self.seq_array[mask]
242
227
  raise TypeError("Invalid index type.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.4.17
3
+ Version: 1.4.19
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -37,7 +37,7 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
37
37
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
38
38
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
39
39
  geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
40
- geney/utils/SeqMats.py,sha256=-Wz-0Bsflf1lBoGF8RS7JjOshzHO2XCZauqcNP4PONw,8766
40
+ geney/utils/SeqMats.py,sha256=q858gWPsSoS4HUr6FD1CHYuUh5AE5u9KePHYT7FQw7g,8777
41
41
  geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
42
42
  geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
43
43
  geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
@@ -46,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
46
46
  geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
47
47
  geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
48
48
  geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
49
- geney-1.4.17.dist-info/METADATA,sha256=nlFX-qvZaLauG5S-SBKlM8tRausGYES8Xmrg0XSxtR4,990
50
- geney-1.4.17.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
51
- geney-1.4.17.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
52
- geney-1.4.17.dist-info/RECORD,,
49
+ geney-1.4.19.dist-info/METADATA,sha256=BIX1e0_0ypaE0e5prBztdgdzp0S1UutUqJufy-qmpZs,990
50
+ geney-1.4.19.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
51
+ geney-1.4.19.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
52
+ geney-1.4.19.dist-info/RECORD,,
File without changes