darkprofiler 0.2.0__tar.gz → 0.2.2__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: darkprofiler
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
5
5
  Author-email: Hanjun Lee <hanjun@alum.mit.edu>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "darkprofiler"
7
- version = "0.2.0"
7
+ version = "0.2.2"
8
8
  description = "DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.7"
@@ -2,5 +2,5 @@ from .run import classify_peptides
2
2
 
3
3
  __all__ = ["classify_peptides"]
4
4
 
5
- __version__ = "0.2.0"
5
+ __version__ = "0.2.2"
6
6
 
@@ -246,6 +246,9 @@ class ProteomeIndex:
246
246
  Uses mmap for large blobs; keys/offsets are loaded into memory arrays.
247
247
  """
248
248
  def __init__(self, idx_dir, require_aa2nt=False):
249
+ import threading
250
+ self._load_lock = threading.Lock()
251
+
249
252
  self.idx_dir = idx_dir
250
253
  self.tx_ids = []
251
254
  self.seq_offsets = [] # list of (start, length)
@@ -316,28 +319,32 @@ class ProteomeIndex:
316
319
 
317
320
  def _load_Lb(self, L, b):
318
321
  key = (L, b)
319
- if key in self._keys:
322
+ if key in self._keys and key in self._offs and key in self._post_mm:
320
323
  return
321
324
 
322
- keys_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.keys.bin")
323
- offs_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.offs.bin")
324
- post_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.post.bin")
325
+ with self._load_lock:
326
+ if key in self._keys and key in self._offs and key in self._post_mm:
327
+ return
325
328
 
326
- # keys
327
- with open(keys_path, "rb") as f:
328
- data = f.read()
329
- self._keys[key] = [struct.unpack_from("<Q", data, i)[0] for i in range(0, len(data), 8)]
329
+ keys_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.keys.bin")
330
+ offs_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.offs.bin")
331
+ post_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.post.bin")
330
332
 
331
- # offsets
332
- with open(offs_path, "rb") as f:
333
- data = f.read()
334
- self._offs[key] = [struct.unpack_from("<I", data, i)[0] for i in range(0, len(data), 4)]
333
+ # keys
334
+ with open(keys_path, "rb") as f:
335
+ data = f.read()
336
+ self._keys[key] = [struct.unpack_from("<Q", data, i)[0] for i in range(0, len(data), 8)]
335
337
 
336
- # postings mmap
337
- fh = open(post_path, "rb")
338
- mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
339
- self._post_files[key] = fh
340
- self._post_mm[key] = mm
338
+ # offsets
339
+ with open(offs_path, "rb") as f:
340
+ data = f.read()
341
+ self._offs[key] = [struct.unpack_from("<I", data, i)[0] for i in range(0, len(data), 4)]
342
+
343
+ # postings mmap
344
+ fh = open(post_path, "rb")
345
+ mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
346
+ self._post_files[key] = fh
347
+ self._post_mm[key] = mm
341
348
 
342
349
  def get_tx_seq_window(self, tx_idx, aa_pos, L):
343
350
  start, ln = self.seq_offsets[tx_idx]
@@ -356,9 +363,6 @@ class ProteomeIndex:
356
363
  return struct.unpack_from("<I", self._aa2nt_values_mmap, byte_off)[0]
357
364
 
358
365
  def postings_for_block(self, L, block_id, block_str):
359
- """
360
- Returns list of (tx_idx, aa_pos) for this block key.
361
- """
362
366
  self._load_Lb(L, block_id)
363
367
  key = (L, block_id)
364
368
  keys = self._keys[key]
@@ -366,18 +370,14 @@ class ProteomeIndex:
366
370
  k = _encode_block_u64(block_str)
367
371
  i = bisect_left(keys, k)
368
372
  if i >= len(keys) or keys[i] != k:
369
- return []
373
+ return
370
374
  start = offs[i]
371
375
  end = offs[i + 1]
372
376
  mm = self._post_mm[key]
373
- out = []
374
- # each posting is 8 bytes: u32 tx_idx, u32 aa_pos
375
377
  for j in range(start, end):
376
378
  off = j * 8
377
379
  tx_idx, aa_pos = struct.unpack_from("<II", mm, off)
378
- out.append((tx_idx, aa_pos))
379
- return out
380
-
380
+ yield tx_idx, aa_pos
381
381
 
382
382
  def write_peptide_hits_fasta(hit_records, path):
383
383
  """
@@ -554,15 +554,29 @@ def classify_peptides(reference,
554
554
  core = rec_id.split()[0]
555
555
  return core.split(".")[0]
556
556
 
557
- def read_peptide_fasta(path):
557
+ def read_peptide_fasta(path, L_min=7, L_max=12):
558
558
  peptides = OrderedDict()
559
+ skipped = 0
560
+
559
561
  for rec in SeqIO.parse(path, "fasta"):
560
562
  seq = str(rec.seq).strip().upper()
561
563
  if not seq:
562
564
  continue
565
+
566
+ L = len(seq)
567
+ if L < L_min or L > L_max:
568
+ # skip and do not store anywhere
569
+ skipped += 1
570
+ continue
571
+
563
572
  peptides[rec.id] = seq
573
+
574
+ if skipped:
575
+ print(f"[INFO] Skipped {skipped} peptides outside {L_min}-{L_max} aa from {path}", file=sys.stderr)
576
+
564
577
  return peptides
565
578
 
579
+
566
580
  def load_transcriptome(path):
567
581
  tx = {}
568
582
  for rec in SeqIO.parse(path, "fasta"):
@@ -773,6 +787,12 @@ def classify_peptides(reference,
773
787
  print(f"[WARN] Could not load index at {idx_dir}: {e}. Falling back to scan.", file=sys.stderr)
774
788
  idx_obj = None
775
789
 
790
+ # PRELOAD: do this before any threading starts
791
+ if idx_obj is not None:
792
+ for L0 in range(index_L_min, index_L_max + 1): # 7..12 by default
793
+ for b_id in (0, 1, 2): # index built_for_k=2 => 3 blocks exist
794
+ idx_obj._load_Lb(L0, b_id)
795
+
776
796
  hit_records = []
777
797
  remaining = OrderedDict()
778
798
 
@@ -908,7 +928,7 @@ def classify_peptides(reference,
908
928
  aa2nt_map_by_txid=canonical_aa2nt
909
929
  )
910
930
 
911
- peptides_all = read_peptide_fasta(peptide_fasta)
931
+ peptides_all = read_peptide_fasta(peptide_fasta, L_min=index_L_min, L_max=index_L_max)
912
932
 
913
933
  def coord_resolver_cds(ref_txid, idx_obj, aa_pos, L, ref_peptide):
914
934
  # idx_obj provides aa2nt mapping
@@ -1086,12 +1106,13 @@ def classify_peptides(reference,
1086
1106
  frame = int(frame_str)
1087
1107
  except ValueError:
1088
1108
  return None, None, None, None
1109
+
1089
1110
  nt0 = frame + aa_pos * 3
1090
1111
  if tx_id not in transcriptome:
1091
1112
  return None, None, None, None
1092
1113
  if nt0 < 0 or nt0 >= len(transcriptome[tx_id]):
1093
1114
  return None, None, None, None
1094
-
1115
+
1095
1116
  if tx_id not in cds_bounds:
1096
1117
  region = "lncRNA"
1097
1118
  else:
@@ -1101,7 +1122,11 @@ def classify_peptides(reference,
1101
1122
  elif nt0 >= cds_end:
1102
1123
  region = "dORF"
1103
1124
  else:
1104
- region = "intORF"
1125
+ # inside CDS bounds: decide by frame
1126
+ if ((nt0 - cds_start) % 3) == 0:
1127
+ region = "CDS"
1128
+ else:
1129
+ region = "intORF"
1105
1130
 
1106
1131
  return tx_id, nt0 + 1, region, ref_peptide
1107
1132
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: darkprofiler
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
5
5
  Author-email: Hanjun Lee <hanjun@alum.mit.edu>
6
6
  License: MIT
File without changes
File without changes
File without changes