PyPI - geney - Versions diffs - 1.4.18__py2.py3-none-any.whl → 1.4.19__py2.py3-none-any.whl - Mend

geney 1.4.18__py2.py3-none-any.whl → 1.4.19__py2.py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of geney might be problematic. Click here for more details.

Files changed (5)

geney/utils/SeqMats.py CHANGED Viewed

@@ -30,7 +30,6 @@ def format_mut_id(text):
         return None
 @dataclass(slots=True)
 class SeqMat:
     """Represents a genomic sequence matrix used for training."""
@@ -43,21 +42,20 @@ class SeqMat:
     insertion_counters: dict = field(default_factory=lambda: defaultdict(int), init=False, repr=False)
     rev: bool = field(default=False, init=False, repr=False)
     predicted_splicing: pd.DataFrame = field(init=False, repr=False)
-    _pos_to_idx: dict = field(default_factory=dict, init=False, repr=False)
     def __init__(
-        self,
-        nucleotides: str,
-        index: np.ndarray,
-        conservation: Optional[np.ndarray] = None,
-        reference_nucleotides: Optional[np.ndarray] = None,
-        notes: Optional[dict] = None,
-        source: Optional[str] = None,
-        rev: Optional[bool] = False,
-        name: Optional[str] = 'wild_type',
-        version: Optional[str] = 'none'
+            self,
+            nucleotides: str,
+            index: np.ndarray,
+            conservation: Optional[np.ndarray] = None,
+            reference_nucleotides: Optional[np.ndarray] = None,
+            notes: Optional[dict] = None,
+            source: Optional[str] = None,
+            rev: Optional[bool] = False,
+            name: Optional[str] = 'wild_type',
+            version: Optional[str] = 'none'
     ) -> None:
-        # Initialize metadata
+        # Metadata
         self.name = name
         self.version = version
         self.source = source or "Unknown"
@@ -86,19 +84,10 @@ class SeqMat:
         self.seq_array['nt'] = nts
         self.seq_array['ref'] = nts if reference_nucleotides is None else np.array(reference_nucleotides, dtype='S1')
         self.seq_array['index'] = index
-        self.seq_array['cons'] = (np.zeros(L, dtype='f4') if conservation is None else conservation)
+        self.seq_array['cons'] = np.zeros(L, dtype='f4') if conservation is None else conservation
         self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
-        # Initialize helpers
         self.insertion_counters = defaultdict(int)
-        self._build_index_map()
-    def _build_index_map(self):
-        """Rebuild position-to-index lookup."""
-        self._pos_to_idx = {float(pos): i for i, pos in enumerate(self.seq_array['index'])}
-    def __len__(self) -> int:
-        return int(self.seq_array['valid_mask'].sum())
     @property
     def seq(self) -> str:
@@ -114,7 +103,6 @@ class SeqMat:
     def clone(self, start: Optional[float] = None, end: Optional[float] = None) -> SeqMat:
         new = SeqMat.__new__(SeqMat)
-        # copy metadata
         new.name = self.name
         new.version = self.version
         new.source = self.source
@@ -123,100 +111,96 @@ class SeqMat:
         new.predicted_splicing = None
         new.insertion_counters = defaultdict(int)
-        # slice or full copy
         if start is not None and end is not None:
             mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= end)
             new.seq_array = self.seq_array[mask].copy()
         else:
             new.seq_array = self.seq_array.copy()
-        new._build_index_map()
+        new.seq_array['valid_mask'] = new.seq_array['nt'] != b'-'
         return new
-    def apply_mutation(self, pos: float, ref: str, alt: str, only_snps: bool = False) -> SeqMat:
-        """Apply a single mutation to this SeqMat."""
-        # reverse-complement context
-        if self.rev:
-            self.reverse_complement()
-        # left-normalize
-        while ref and alt and ref[0] == alt[0]:
-            pos += 1
-            ref = ref[1:] or '-'
-            alt = alt[1:] or '-'
-        # substitution
-        if ref != '-' and alt != '-':
-            if len(ref) != len(alt):
-                raise ValueError("Substitution requires equal-length alleles.")
-            idx = self._pos_to_idx.get(pos)
-            if idx is None:
-                raise KeyError(f"Position {pos} not found.")
-            end = idx + len(ref)
-            if end > len(self.seq_array):
-                raise IndexError(f"Out of bounds at {pos}.")
-            # verify reference
-            ref_seg = self.seq_array['ref'][idx:end]
-            if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
-                raise ValueError(f"Ref mismatch at {pos}.")
-            # assign alt
-            self.seq_array['nt'][idx:end] = np.frombuffer(alt.encode(), dtype='S1')
-        # insertion
-        elif ref == '-' and alt != '-':
-            if only_snps:
-                return self
-            idx = self._pos_to_idx.get(pos)
-            if idx is None:
-                raise KeyError(f"Position {pos} not found.")
-            cnt = self.insertion_counters[pos]
-            eps = 1e-6
+    def apply_mutations(
+            self,
+            mutations: Union[Tuple[float, str, str], List[Tuple[float, str, str]]],
+            only_snps: bool = False
+    ) -> SeqMat:
+        """
+        Apply one or a batch of mutations (pos, ref, alt) efficiently:
+        - Supports a single tuple or a list of tuples
+        - Assumes mutations sorted by position for vectorized searchsorted
+        """
+        # Normalize to list
+        if isinstance(mutations, tuple) and len(mutations) == 3:
+            mutations = [mutations]
+        elif not isinstance(mutations, list):
+            raise TypeError("mutations must be a tuple or list of tuples")
+        # Left-normalize and bucket
+        subs, ins, dels = [], [], []
+        for pos, ref, alt in mutations:
+            while ref and alt and ref[0] == alt[0]:
+                pos += 1
+                ref = ref[1:] or '-'
+                alt = alt[1:] or '-'
+            if ref != '-' and alt != '-':
+                subs.append((pos, ref, alt))
+            elif ref == '-' and alt != '-' and not only_snps:
+                ins.append((pos, alt))
+            elif alt == '-' and ref != '-' and not only_snps:
+                dels.append((pos, ref))
+            else:
+                raise ValueError(f"Unsupported mutation {pos}:{ref}:{alt}.")
+        # Ensure seq_array indices sorted
+        coords = self.seq_array['index']
+        # 1) Bulk substitutions
+        if subs:
+            subs.sort(key=lambda x: x[0])
+            positions = np.array([p for p, _, _ in subs], dtype=coords.dtype)
+            idxs = np.searchsorted(coords, positions)
+            for (pos, ref, alt), idx in zip(subs, idxs):
+                length = len(ref)
+                if not np.all(self.seq_array['ref'][idx:idx + length] == np.frombuffer(ref.encode(), dtype='S1')):
+                    actual = self.seq_array['ref'][idx:idx + length].tobytes().decode()
+                    raise ValueError(f"Ref mismatch at {pos}: expected {ref}, found {actual}")
+                self.seq_array['nt'][idx:idx + length] = np.frombuffer(alt.encode(), dtype='S1')
+        # 2) Bulk insertions
+        if ins:
+            ins.sort(key=lambda x: x[0])
+            positions = np.array([p for p, _ in ins], dtype=coords.dtype)
+            idxs = np.searchsorted(coords, positions)
             new_rows = []
-            for i, nt in enumerate(alt):
-                new_rows.append((nt.encode(),
-                                 pos + (cnt + i + 1)*eps,
-                                 b'-',
-                                 np.nan,
-                                 True))
-            self._insert_rows(idx, new_rows)
-            self.insertion_counters[pos] += len(alt)
-        # deletion
-        elif alt == '-' and ref != '-':
-            if only_snps:
-                return self
-            idx = self._pos_to_idx.get(pos)
-            if idx is None:
-                raise KeyError(f"Position {pos} not found.")
-            end = idx + len(ref)
-            # verify
-            ref_seg = self.seq_array['ref'][idx:end]
-            if not np.array_equal(ref_seg, np.frombuffer(ref.encode(), dtype='S1')):
-                raise ValueError(f"Ref mismatch at {pos}.")
-            self.seq_array = np.delete(self.seq_array, np.s_[idx:end])
-        else:
-            raise ValueError("Unsupported mutation type.")
-        # update mask & index map
+            for (pos, alt), idx in zip(ins, idxs):
+                cnt = self.insertion_counters[pos]
+                eps = 1e-6
+                for i, nt in enumerate(alt):
+                    new_idx = pos + (cnt + i + 1) * eps
+                    new_rows.append((nt.encode(), new_idx, b'-', np.nan, True))
+                self.insertion_counters[pos] += len(alt)
+            merged = np.concatenate([self.seq_array, np.array(new_rows, dtype=self.seq_array.dtype)])
+            merged.sort(order='index')
+            self.seq_array = merged
+        # 3) Bulk deletions
+        if dels:
+            dels.sort(key=lambda x: x[0])
+            positions = np.array([p for p, _ in dels], dtype=coords.dtype)
+            idxs = np.searchsorted(self.seq_array['index'], positions)
+            mask = np.ones(len(self.seq_array), dtype=bool)
+            for (pos, ref), idx in zip(dels, idxs):
+                length = len(ref)
+                mask[idx:idx + length] = False
+            self.seq_array = self.seq_array[mask]
+        # Finalize valid mask
         self.seq_array['valid_mask'] = self.seq_array['nt'] != b'-'
-        self._build_index_map()
-        # restore orientation
-        if self.rev:
-            self.reverse_complement()
         return self
-    def _insert_rows(self, idx: int, rows: List[tuple]):
-        """Helper to insert new rows efficiently and resort."""
-        arr = self.seq_array.tolist()
-        arr[idx:idx] = rows
-        new = np.array(arr, dtype=self.seq_array.dtype)
-        new.sort(order='index')
-        self.seq_array = new
     def complement(self) -> SeqMat:
-        comp = {b'A':b'T', b'T':b'A', b'C':b'G', b'G':b'C', b'-':b'-'}
+        comp = {b'A': b'T', b'T': b'A', b'C': b'G', b'G': b'C', b'-': b'-'}
         nts = np.array([comp[x] for x in self.seq_array['nt']], dtype='S1')
         new = self.clone()
         new.seq_array['nt'] = nts
@@ -229,15 +213,15 @@ class SeqMat:
         return new
     def __getitem__(self, key: Union[int, slice]) -> np.ndarray:
-        idx = None
+        coords = self.seq_array['index']
         if isinstance(key, int):
-            idx = self._pos_to_idx.get(float(key))
-            if idx is None:
+            idx = np.searchsorted(coords, key)
+            if idx >= len(coords) or coords[idx] != key:
                 raise KeyError(f"Position {key} not found.")
             return self.seq_array[idx]
         if isinstance(key, slice):
-            start = key.start or self.min_index
-            stop = key.stop or self.max_index
-            mask = (self.seq_array['index'] >= start) & (self.seq_array['index'] <= stop)
+            start = key.start or coords.min()
+            stop = key.stop or coords.max()
+            mask = (coords >= start) & (coords <= stop)
             return self.seq_array[mask]
         raise TypeError("Invalid index type.")

{geney-1.4.18.dist-info → geney-1.4.19.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geney
-Version: 1.4.18
+Version: 1.4.19
 Summary: A Python package for gene expression modeling.
 Home-page: https://github.com/nicolaslynn/geney
 Author: Nicolas Lynn

{geney-1.4.18.dist-info → geney-1.4.19.dist-info}/RECORD RENAMED Viewed

@@ -37,7 +37,7 @@ geney/translation_initiation/tis_utils.py,sha256=AF3siFjuQH-Rs44EV-80zHdbxRMvN4w
 geney/translation_initiation/resources/kozak_pssm.json,sha256=pcd0Olziutq-6H3mFWDCD9cujQ_AlZO-iiOvBl82hqE,1165
 geney/translation_initiation/resources/tis_regressor_model.joblib,sha256=IXb4DUDhJ5rBDKcqMk9zE3ECTZZcdj7Jixz3KpoZ7OA,2592025
 geney/utils/Fasta_segment.py,sha256=weB5NJ65P0XiyAJCiCHx4T9sHC1pWLpuQeOy0B85gyg,11364
-geney/utils/SeqMats.py,sha256=2tJYPGy-cCCaANRbiYkki5yNxnhgzysDQGWjRYRSnlQ,8767
+geney/utils/SeqMats.py,sha256=q858gWPsSoS4HUr6FD1CHYuUh5AE5u9KePHYT7FQw7g,8777
 geney/utils/SeqMatsOld.py,sha256=syRU5DAuTh3xUfGW_qP9wlcBO5pHsG_y5PlrfXTIxUY,18502
 geney/utils/TranscriptLibrary.py,sha256=ma_ZVPgglxXDDneEvdqxxeqxG8eSFL-zgLUXyC6BqY8,2070
 geney/utils/__init__.py,sha256=-nJ-DMx1JzP-ZCe_QuQCeM0ZYIT_16jxoXDhUaO_4Oc,714
@@ -46,7 +46,7 @@ geney/utils/pangolin_utils.py,sha256=JQSPbWxdzqGFYfWQktkfLMaMSGR28eGQhNzO7MLMe5M
 geney/utils/spliceai_utils.py,sha256=VtrIbjyQxk_3lw86eWjftRYyal9OzxArJ0GV5u_ymTg,2721
 geney/utils/splicing_utils.py,sha256=vPCGnCPR1ooEZEHR79yFHLmRQXEJHXEQjjxpBR-YWOs,20635
 geney/utils/utils.py,sha256=m51Vd0cEbrcIHo6_8BAuI9YSPcKRs22e5LfVd2Qj6Is,2181
-geney-1.4.18.dist-info/METADATA,sha256=WLdB6CQyOBWNDYLU05Gyj6o7kaNzA5qr7ROD0mzTgm4,990
-geney-1.4.18.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
-geney-1.4.18.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
-geney-1.4.18.dist-info/RECORD,,
+geney-1.4.19.dist-info/METADATA,sha256=BIX1e0_0ypaE0e5prBztdgdzp0S1UutUqJufy-qmpZs,990
+geney-1.4.19.dist-info/WHEEL,sha256=AHX6tWk3qWuce7vKLrj7lnulVHEdWoltgauo8bgCXgU,109
+geney-1.4.19.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
+geney-1.4.19.dist-info/RECORD,,

{geney-1.4.18.dist-info → geney-1.4.19.dist-info}/WHEEL RENAMED Viewed

File without changes

{geney-1.4.18.dist-info → geney-1.4.19.dist-info}/top_level.txt RENAMED Viewed

File without changes

geney 1.4.18__py2.py3-none-any.whl → 1.4.19__py2.py3-none-any.whl

Potentially problematic release.

geney 1.4.18__py2.py3-none-any.whl → 1.4.19__py2.py3-none-any.whl