PyPI - darkprofiler - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

darkprofiler 0.2.0.tar.gz → 0.2.2.tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (14) hide show

{darkprofiler-0.2.0/src/darkprofiler.egg-info → darkprofiler-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: darkprofiler
-Version: 0.2.0
+Version: 0.2.2
 Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
 Author-email: Hanjun Lee <hanjun@alum.mit.edu>
 License: MIT

{darkprofiler-0.2.0 → darkprofiler-0.2.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "darkprofiler"
-version = "0.2.0"
+version = "0.2.2"
 description = "DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments."
 readme = "README.md"
 requires-python = ">=3.7"

{darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler/__init__.py RENAMED Viewed

@@ -2,5 +2,5 @@ from .run import classify_peptides
 __all__ = ["classify_peptides"]
-__version__ = "0.2.0"
+__version__ = "0.2.2"

{darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler/run.py RENAMED Viewed

@@ -246,6 +246,9 @@ class ProteomeIndex:
     Uses mmap for large blobs; keys/offsets are loaded into memory arrays.
     """
     def __init__(self, idx_dir, require_aa2nt=False):
+        import threading
+        self._load_lock = threading.Lock()
         self.idx_dir = idx_dir
         self.tx_ids = []
         self.seq_offsets = []  # list of (start, length)
@@ -316,28 +319,32 @@ class ProteomeIndex:
     def _load_Lb(self, L, b):
         key = (L, b)
-        if key in self._keys:
+        if key in self._keys and key in self._offs and key in self._post_mm:
             return
-        keys_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.keys.bin")
-        offs_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.offs.bin")
-        post_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.post.bin")
+        with self._load_lock:
+            if key in self._keys and key in self._offs and key in self._post_mm:
+                return
-        # keys
-        with open(keys_path, "rb") as f:
-            data = f.read()
-        self._keys[key] = [struct.unpack_from("<Q", data, i)[0] for i in range(0, len(data), 8)]
+            keys_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.keys.bin")
+            offs_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.offs.bin")
+            post_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.post.bin")
-        # offsets
-        with open(offs_path, "rb") as f:
-            data = f.read()
-        self._offs[key] = [struct.unpack_from("<I", data, i)[0] for i in range(0, len(data), 4)]
+            # keys
+            with open(keys_path, "rb") as f:
+                data = f.read()
+            self._keys[key] = [struct.unpack_from("<Q", data, i)[0] for i in range(0, len(data), 8)]
-        # postings mmap
-        fh = open(post_path, "rb")
-        mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
-        self._post_files[key] = fh
-        self._post_mm[key] = mm
+            # offsets
+            with open(offs_path, "rb") as f:
+                data = f.read()
+            self._offs[key] = [struct.unpack_from("<I", data, i)[0] for i in range(0, len(data), 4)]
+            # postings mmap
+            fh = open(post_path, "rb")
+            mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
+            self._post_files[key] = fh
+            self._post_mm[key] = mm
     def get_tx_seq_window(self, tx_idx, aa_pos, L):
         start, ln = self.seq_offsets[tx_idx]
@@ -356,9 +363,6 @@ class ProteomeIndex:
         return struct.unpack_from("<I", self._aa2nt_values_mmap, byte_off)[0]
     def postings_for_block(self, L, block_id, block_str):
-        """
-        Returns list of (tx_idx, aa_pos) for this block key.
-        """
         self._load_Lb(L, block_id)
         key = (L, block_id)
         keys = self._keys[key]
@@ -366,18 +370,14 @@ class ProteomeIndex:
         k = _encode_block_u64(block_str)
         i = bisect_left(keys, k)
         if i >= len(keys) or keys[i] != k:
-            return []
+            return
         start = offs[i]
         end = offs[i + 1]
         mm = self._post_mm[key]
-        out = []
-        # each posting is 8 bytes: u32 tx_idx, u32 aa_pos
         for j in range(start, end):
             off = j * 8
             tx_idx, aa_pos = struct.unpack_from("<II", mm, off)
-            out.append((tx_idx, aa_pos))
-        return out
+            yield tx_idx, aa_pos
 def write_peptide_hits_fasta(hit_records, path):
     """
@@ -554,15 +554,29 @@ def classify_peptides(reference,
         core = rec_id.split()[0]
         return core.split(".")[0]
-    def read_peptide_fasta(path):
+    def read_peptide_fasta(path, L_min=7, L_max=12):
         peptides = OrderedDict()
+        skipped = 0
         for rec in SeqIO.parse(path, "fasta"):
             seq = str(rec.seq).strip().upper()
             if not seq:
                 continue
+            L = len(seq)
+            if L < L_min or L > L_max:
+                # skip and do not store anywhere
+                skipped += 1
+                continue
             peptides[rec.id] = seq
+        if skipped:
+            print(f"[INFO] Skipped {skipped} peptides outside {L_min}-{L_max} aa from {path}", file=sys.stderr)
         return peptides
     def load_transcriptome(path):
         tx = {}
         for rec in SeqIO.parse(path, "fasta"):
@@ -773,6 +787,12 @@ def classify_peptides(reference,
                 print(f"[WARN] Could not load index at {idx_dir}: {e}. Falling back to scan.", file=sys.stderr)
                 idx_obj = None
+        # PRELOAD: do this before any threading starts
+        if idx_obj is not None:
+            for L0 in range(index_L_min, index_L_max + 1):   # 7..12 by default
+                for b_id in (0, 1, 2):                       # index built_for_k=2 => 3 blocks exist
+                    idx_obj._load_Lb(L0, b_id)
         hit_records = []
         remaining = OrderedDict()
@@ -908,7 +928,7 @@ def classify_peptides(reference,
             aa2nt_map_by_txid=canonical_aa2nt
         )
-    peptides_all = read_peptide_fasta(peptide_fasta)
+    peptides_all = read_peptide_fasta(peptide_fasta, L_min=index_L_min, L_max=index_L_max)
     def coord_resolver_cds(ref_txid, idx_obj, aa_pos, L, ref_peptide):
         # idx_obj provides aa2nt mapping
@@ -1086,12 +1106,13 @@ def classify_peptides(reference,
             frame = int(frame_str)
         except ValueError:
             return None, None, None, None
         nt0 = frame + aa_pos * 3
         if tx_id not in transcriptome:
             return None, None, None, None
         if nt0 < 0 or nt0 >= len(transcriptome[tx_id]):
             return None, None, None, None
         if tx_id not in cds_bounds:
             region = "lncRNA"
         else:
@@ -1101,7 +1122,11 @@ def classify_peptides(reference,
             elif nt0 >= cds_end:
                 region = "dORF"
             else:
-                region = "intORF"
+                # inside CDS bounds: decide by frame
+                if ((nt0 - cds_start) % 3) == 0:
+                    region = "CDS"
+                else:
+                    region = "intORF"
         return tx_id, nt0 + 1, region, ref_peptide

{darkprofiler-0.2.0 → darkprofiler-0.2.2/src/darkprofiler.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: darkprofiler
-Version: 0.2.0
+Version: 0.2.2
 Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
 Author-email: Hanjun Lee <hanjun@alum.mit.edu>
 License: MIT