geney 1.4.18__py2.py3-none-any.whl → 1.4.20__py2.py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
geney/utils/SeqMats.py CHANGED
@@ -30,7 +30,6 @@ def format_mut_id(text):
30
30
  return None
31
31
 
32
32
 
33
-
34
33
  @dataclass(slots=True)
35
34
  class SeqMat:
36
35
  """Represents a genomic sequence matrix used for training."""
@@ -43,21 +42,20 @@ class SeqMat:
43
42
  insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
44
43
  rev: bool = field(default=False, init=False, repr=False)
45
44
  predicted_splicing: pd.DataFrame = field(init=False, repr=False)
46
- _pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
47
45
 
48
46
  def __init__(
49
- self,
50
- nucleotides: str,
51
- index: np.ndarray,
52
- conservation: Optional[np.ndarray] = None,
53
- reference_nucleotides: Optional[np.ndarray] = None,
54
- notes: Optional[dict] = None,
55
- source: Optional[str] = None,
56
- rev: Optional[bool] = False,
57
- name: Optional[str] = 'wild_type',
58
- version: Optional[str] = 'none'
47
+ self,
48
+ nucleotides: str,
49
+ index: np.ndarray,
50
+ conservation: Optional[np.ndarray] = None,
51
+ reference_nucleotides: Optional[np.ndarray] = None,
52
+ notes: Optional[dict] = None,
53
+ source: Optional[str] = None,
54
+ rev: Optional[bool] = False,
55
+ name: Optional[str] = 'wild_type',
56
+ version: Optional[str] = 'none'
59
57
  ) -> None:
60
- # Initialize metadata
58
+ # Metadata
61
59
  self.name = name
62
60
  self.version = version
63
61
  self.source = source or "Unknown"
@@ -86,19 +84,10 @@ class SeqMat:
86
84
  self.seq_array['nt'] = nts
87
85
  self.seq_array['ref'] = nts if reference_nucleotides is None else np.array(reference_nucleotides, dtype='S1')
88
86
  self.seq_array['index'] = index
89
- self.seq_array['cons'] = (np.zeros(L, dtype='f4') if conservation is None else conservation)
87
+ self.seq_array['cons'] = np.zeros(L, dtype='f4') if conservation is None else conservation
90
88
  self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
91
89
 
92
- # Initialize helpers
93
90
  self.insertion_counters = defaultdict(int)
94
- self._build_index_map()
95
-
96
- def _build_index_map(self):
97
- """Rebuild position-to-index lookup."""
98
- self._pos_to_idx = {float(pos): i for i, pos in enumerate(self.seq_array['index'])}
99
-
100
- def __len__(self) -> int:
101
- return int(self.seq_array['valid_mask'].sum())
102
91
 
103
92
  @property
104
93
  def seq(self) -> str:
@@ -114,7 +103,6 @@ class SeqMat:
114
103
 
115
104
  def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
116
105
  new = SeqMat.__new__(SeqMat)
117
- # copy metadata
118
106
  new.name = self.name
119
107
  new.version = self.version
120
108
  new.source = self.source
@@ -123,100 +111,96 @@ class SeqMat:
123
111
  new.predicted_splicing = None
124
112
  new.insertion_counters = defaultdict(int)
125
113
 
126
- # slice or full copy
127
114
  if start is not None and end is not None:
128
115
  mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= end)
129
116
  new.seq_array = self.seq_array[mask].copy()
130
117
  else:
131
118
  new.seq_array = self.seq_array.copy()
132
119
 
133
- new._build_index_map()
120
+ new.seq_array['valid_mask'] = new.seq_array['nt'] != b'-'
134
121
  return new
135
122
 
136
- def apply_mutation(self, pos: float, ref: str, alt: str, only_snps: bool = False) -> SeqMat:
137
- """Apply a single mutation to this SeqMat."""
138
- # reverse-complement context
139
- if self.rev:
140
- self.reverse_complement()
141
-
142
- # left-normalize
143
- while ref and alt and ref[0] == alt[0]:
144
- pos += 1
145
- ref = ref[1:] or '-'
146
- alt = alt[1:] or '-'
147
-
148
- # substitution
149
- if ref != '-' and alt != '-':
150
- if len(ref) != len(alt):
151
- raise ValueError("Substitution requires equal-length alleles.")
152
- idx = self._pos_to_idx.get(pos)
153
- if idx is None:
154
- raise KeyError(f"Position {pos} not found.")
155
- end = idx + len(ref)
156
- if end > len(self.seq_array):
157
- raise IndexError(f"Out of bounds at {pos}.")
158
- # verify reference
159
- ref_seg = self.seq_array['ref'][idx:end]
160
- if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
161
- raise ValueError(f"Ref mismatch at {pos}.")
162
- # assign alt
163
- self.seq_array['nt'][idx:end] = np.frombuffer(alt.encode(), dtype='S1')
164
-
165
- # insertion
166
- elif ref == '-' and alt != '-':
167
- if only_snps:
168
- return self
169
- idx = self._pos_to_idx.get(pos)
170
- if idx is None:
171
- raise KeyError(f"Position {pos} not found.")
172
- cnt = self.insertion_counters[pos]
173
- eps = 1e-6
123
+ def apply_mutations(
124
+ self,
125
+ mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
126
+ only_snps: bool = False
127
+ ) -> SeqMat:
128
+ """
129
+ Apply one or a batch of mutations (pos, ref, alt) efficiently:
130
+ - Supports a single tuple or a list of tuples
131
+ - Assumes mutations sorted by position for vectorized searchsorted
132
+ """
133
+ # Normalize to list
134
+ if isinstance(mutations, tuple) and len(mutations) == 3:
135
+ mutations = [mutations]
136
+ elif not isinstance(mutations, list):
137
+ raise TypeError("mutations must be a tuple or list of tuples")
138
+
139
+ # Left-normalize and bucket
140
+ subs, ins, dels = [], [], []
141
+ for pos, ref, alt in mutations:
142
+ while ref and alt and ref[0] == alt[0]:
143
+ pos += 1
144
+ ref = ref[1:] or '-'
145
+ alt = alt[1:] or '-'
146
+ if ref != '-' and alt != '-':
147
+ subs.append((pos, ref, alt))
148
+ elif ref == '-' and alt != '-' and not only_snps:
149
+ ins.append((pos, alt))
150
+ elif alt == '-' and ref != '-' and not only_snps:
151
+ dels.append((pos, ref))
152
+ else:
153
+ raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
154
+
155
+ # Ensure seq_array indices sorted
156
+ coords = self.seq_array['index']
157
+
158
+ # 1) Bulk substitutions
159
+ if subs:
160
+ subs.sort(key=lambda x: x[0])
161
+ positions = np.array([p for p, _, _ in subs], dtype=coords.dtype)
162
+ idxs = np.searchsorted(coords, positions)
163
+ for (pos, ref, alt), idx in zip(subs, idxs):
164
+ length = len(ref)
165
+ if not np.all(self.seq_array['ref'][idx:idx + length] == np.frombuffer(ref.encode(), dtype='S1')):
166
+ actual = self.seq_array['ref'][idx:idx + length].tobytes().decode()
167
+ raise ValueError(f"Ref mismatch at {pos}: expected {ref}, found {actual}")
168
+ self.seq_array['nt'][idx:idx + length] = np.frombuffer(alt.encode(), dtype='S1')
169
+
170
+ # 2) Bulk insertions
171
+ if ins:
172
+ ins.sort(key=lambda x: x[0])
173
+ positions = np.array([p for p, _ in ins], dtype=coords.dtype)
174
+ idxs = np.searchsorted(coords, positions)
174
175
  new_rows = []
175
- for i, nt in enumerate(alt):
176
- new_rows.append((nt.encode(),
177
- pos + (cnt + i + 1)*eps,
178
- b'-',
179
- np.nan,
180
- True))
181
- self._insert_rows(idx, new_rows)
182
- self.insertion_counters[pos] += len(alt)
183
-
184
- # deletion
185
- elif alt == '-' and ref != '-':
186
- if only_snps:
187
- return self
188
- idx = self._pos_to_idx.get(pos)
189
- if idx is None:
190
- raise KeyError(f"Position {pos} not found.")
191
- end = idx + len(ref)
192
- # verify
193
- ref_seg = self.seq_array['ref'][idx:end]
194
- if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
195
- raise ValueError(f"Ref mismatch at {pos}.")
196
- self.seq_array = np.delete(self.seq_array, np.s_[idx:end])
197
-
198
- else:
199
- raise ValueError("Unsupported mutation type.")
200
-
201
- # update mask & index map
176
+ for (pos, alt), idx in zip(ins, idxs):
177
+ cnt = self.insertion_counters[pos]
178
+ eps = 1e-6
179
+ for i, nt in enumerate(alt):
180
+ new_idx = pos + (cnt + i + 1) * eps
181
+ new_rows.append((nt.encode(), new_idx, b'-', np.nan, True))
182
+ self.insertion_counters[pos] += len(alt)
183
+ merged = np.concatenate([self.seq_array, np.array(new_rows, dtype=self.seq_array.dtype)])
184
+ merged.sort(order='index')
185
+ self.seq_array = merged
186
+
187
+ # 3) Bulk deletions
188
+ if dels:
189
+ dels.sort(key=lambda x: x[0])
190
+ positions = np.array([p for p, _ in dels], dtype=coords.dtype)
191
+ idxs = np.searchsorted(self.seq_array['index'], positions)
192
+ mask = np.ones(len(self.seq_array), dtype=bool)
193
+ for (pos, ref), idx in zip(dels, idxs):
194
+ length = len(ref)
195
+ mask[idx:idx + length] = False
196
+ self.seq_array = self.seq_array[mask]
197
+
198
+ # Finalize valid mask
202
199
  self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
203
- self._build_index_map()
204
-
205
- # restore orientation
206
- if self.rev:
207
- self.reverse_complement()
208
200
  return self
209
201
 
210
- def _insert_rows(self, idx: int, rows: List[tuple]):
211
- """Helper to insert new rows efficiently and resort."""
212
- arr = self.seq_array.tolist()
213
- arr[idx:idx] = rows
214
- new = np.array(arr, dtype=self.seq_array.dtype)
215
- new.sort(order='index')
216
- self.seq_array = new
217
-
218
202
  def complement(self) -> SeqMat:
219
- comp = {b'A':b'T', b'T':b'A', b'C':b'G', b'G':b'C', b'-':b'-'}
203
+ comp = {b'A': b'T', b'T': b'A', b'C': b'G', b'G': b'C', b'-': b'-'}
220
204
  nts = np.array([comp[x] for x in self.seq_array['nt']], dtype='S1')
221
205
  new = self.clone()
222
206
  new.seq_array['nt'] = nts
@@ -229,15 +213,15 @@ class SeqMat:
229
213
  return new
230
214
 
231
215
  def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
232
- idx = None
216
+ coords = self.seq_array['index']
233
217
  if isinstance(key, int):
234
- idx = self._pos_to_idx.get(float(key))
235
- if idx is None:
218
+ idx = np.searchsorted(coords, key)
219
+ if idx >= len(coords) or coords[idx] != key:
236
220
  raise KeyError(f"Position {key} not found.")
237
221
  return self.seq_array[idx]
238
222
  if isinstance(key, slice):
239
- start = key.start or self.min_index
240
- stop = key.stop or self.max_index
241
- mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= stop)
223
+ start = key.start or coords.min()
224
+ stop = key.stop or coords.max()
225
+ mask = (coords >= start) & (coords <= stop)
242
226
  return self.seq_array[mask]
243
227
  raise TypeError("Invalid index type.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geney
3
- Version: 1.4.18
3
+ Version: 1.4.20
4
4
  Summary: A Python package for gene expression modeling.
5
5
  Home-page: https://github.com/nicolaslynn/geney
6
6
  Author: Nicolas Lynn
@@ -37,7 +37,7 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
37
37
  geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
38
38
  geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
39
39
  geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
40
- geney/utils/SeqMats.py,sha256=2tJYPGy-cCCaANRbiYkki5yNxnhgzysDQGWjRYRSnlQ,8767
40
+ geney/utils/SeqMats.py,sha256=q858gWPsSoS4HUr6FD1CHYuUh5AE5u9KePHYT7FQw7g,8777
41
41
  geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
42
42
  geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
43
43
  geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
@@ -46,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
46
46
  geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
47
47
  geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
48
48
  geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
49
- geney-1.4.18.dist-info/METADATA,sha256=WLdB6CQyOBWNDYLU05Gyj6o7kaNzA5qr7ROD0mzTgm4,990
50
- geney-1.4.18.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
51
- geney-1.4.18.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
52
- geney-1.4.18.dist-info/RECORD,,
49
+ geney-1.4.20.dist-info/METADATA,sha256=NznExaPOYo1PvAj3-Uxq3Y9SXMNLd__iz30PKF6ix18,990
50
+ geney-1.4.20.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
51
+ geney-1.4.20.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
52
+ geney-1.4.20.dist-info/RECORD,,
File without changes