darkprofiler 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {darkprofiler-0.2.0/src/darkprofiler.egg-info → darkprofiler-0.2.2}/PKG-INFO +1 -1
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/pyproject.toml +1 -1
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler/__init__.py +1 -1
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler/run.py +55 -30
- {darkprofiler-0.2.0 → darkprofiler-0.2.2/src/darkprofiler.egg-info}/PKG-INFO +1 -1
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/LICENSE.txt +0 -0
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/README.md +0 -0
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/setup.cfg +0 -0
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler/cli.py +0 -0
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler.egg-info/SOURCES.txt +0 -0
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler.egg-info/dependency_links.txt +0 -0
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler.egg-info/entry_points.txt +0 -0
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler.egg-info/requires.txt +0 -0
- {darkprofiler-0.2.0 → darkprofiler-0.2.2}/src/darkprofiler.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: darkprofiler
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
|
|
5
5
|
Author-email: Hanjun Lee <hanjun@alum.mit.edu>
|
|
6
6
|
License: MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "darkprofiler"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.2"
|
|
8
8
|
description = "DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.7"
|
|
@@ -246,6 +246,9 @@ class ProteomeIndex:
|
|
|
246
246
|
Uses mmap for large blobs; keys/offsets are loaded into memory arrays.
|
|
247
247
|
"""
|
|
248
248
|
def __init__(self, idx_dir, require_aa2nt=False):
|
|
249
|
+
import threading
|
|
250
|
+
self._load_lock = threading.Lock()
|
|
251
|
+
|
|
249
252
|
self.idx_dir = idx_dir
|
|
250
253
|
self.tx_ids = []
|
|
251
254
|
self.seq_offsets = [] # list of (start, length)
|
|
@@ -316,28 +319,32 @@ class ProteomeIndex:
|
|
|
316
319
|
|
|
317
320
|
def _load_Lb(self, L, b):
|
|
318
321
|
key = (L, b)
|
|
319
|
-
if key in self._keys:
|
|
322
|
+
if key in self._keys and key in self._offs and key in self._post_mm:
|
|
320
323
|
return
|
|
321
324
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
+
with self._load_lock:
|
|
326
|
+
if key in self._keys and key in self._offs and key in self._post_mm:
|
|
327
|
+
return
|
|
325
328
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
self._keys[key] = [struct.unpack_from("<Q", data, i)[0] for i in range(0, len(data), 8)]
|
|
329
|
+
keys_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.keys.bin")
|
|
330
|
+
offs_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.offs.bin")
|
|
331
|
+
post_path = os.path.join(self.idx_dir, f"L{L:02d}.b{b}.post.bin")
|
|
330
332
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
333
|
+
# keys
|
|
334
|
+
with open(keys_path, "rb") as f:
|
|
335
|
+
data = f.read()
|
|
336
|
+
self._keys[key] = [struct.unpack_from("<Q", data, i)[0] for i in range(0, len(data), 8)]
|
|
335
337
|
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
338
|
+
# offsets
|
|
339
|
+
with open(offs_path, "rb") as f:
|
|
340
|
+
data = f.read()
|
|
341
|
+
self._offs[key] = [struct.unpack_from("<I", data, i)[0] for i in range(0, len(data), 4)]
|
|
342
|
+
|
|
343
|
+
# postings mmap
|
|
344
|
+
fh = open(post_path, "rb")
|
|
345
|
+
mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
|
|
346
|
+
self._post_files[key] = fh
|
|
347
|
+
self._post_mm[key] = mm
|
|
341
348
|
|
|
342
349
|
def get_tx_seq_window(self, tx_idx, aa_pos, L):
|
|
343
350
|
start, ln = self.seq_offsets[tx_idx]
|
|
@@ -356,9 +363,6 @@ class ProteomeIndex:
|
|
|
356
363
|
return struct.unpack_from("<I", self._aa2nt_values_mmap, byte_off)[0]
|
|
357
364
|
|
|
358
365
|
def postings_for_block(self, L, block_id, block_str):
|
|
359
|
-
"""
|
|
360
|
-
Returns list of (tx_idx, aa_pos) for this block key.
|
|
361
|
-
"""
|
|
362
366
|
self._load_Lb(L, block_id)
|
|
363
367
|
key = (L, block_id)
|
|
364
368
|
keys = self._keys[key]
|
|
@@ -366,18 +370,14 @@ class ProteomeIndex:
|
|
|
366
370
|
k = _encode_block_u64(block_str)
|
|
367
371
|
i = bisect_left(keys, k)
|
|
368
372
|
if i >= len(keys) or keys[i] != k:
|
|
369
|
-
return
|
|
373
|
+
return
|
|
370
374
|
start = offs[i]
|
|
371
375
|
end = offs[i + 1]
|
|
372
376
|
mm = self._post_mm[key]
|
|
373
|
-
out = []
|
|
374
|
-
# each posting is 8 bytes: u32 tx_idx, u32 aa_pos
|
|
375
377
|
for j in range(start, end):
|
|
376
378
|
off = j * 8
|
|
377
379
|
tx_idx, aa_pos = struct.unpack_from("<II", mm, off)
|
|
378
|
-
|
|
379
|
-
return out
|
|
380
|
-
|
|
380
|
+
yield tx_idx, aa_pos
|
|
381
381
|
|
|
382
382
|
def write_peptide_hits_fasta(hit_records, path):
|
|
383
383
|
"""
|
|
@@ -554,15 +554,29 @@ def classify_peptides(reference,
|
|
|
554
554
|
core = rec_id.split()[0]
|
|
555
555
|
return core.split(".")[0]
|
|
556
556
|
|
|
557
|
-
def read_peptide_fasta(path):
|
|
557
|
+
def read_peptide_fasta(path, L_min=7, L_max=12):
|
|
558
558
|
peptides = OrderedDict()
|
|
559
|
+
skipped = 0
|
|
560
|
+
|
|
559
561
|
for rec in SeqIO.parse(path, "fasta"):
|
|
560
562
|
seq = str(rec.seq).strip().upper()
|
|
561
563
|
if not seq:
|
|
562
564
|
continue
|
|
565
|
+
|
|
566
|
+
L = len(seq)
|
|
567
|
+
if L < L_min or L > L_max:
|
|
568
|
+
# skip and do not store anywhere
|
|
569
|
+
skipped += 1
|
|
570
|
+
continue
|
|
571
|
+
|
|
563
572
|
peptides[rec.id] = seq
|
|
573
|
+
|
|
574
|
+
if skipped:
|
|
575
|
+
print(f"[INFO] Skipped {skipped} peptides outside {L_min}-{L_max} aa from {path}", file=sys.stderr)
|
|
576
|
+
|
|
564
577
|
return peptides
|
|
565
578
|
|
|
579
|
+
|
|
566
580
|
def load_transcriptome(path):
|
|
567
581
|
tx = {}
|
|
568
582
|
for rec in SeqIO.parse(path, "fasta"):
|
|
@@ -773,6 +787,12 @@ def classify_peptides(reference,
|
|
|
773
787
|
print(f"[WARN] Could not load index at {idx_dir}: {e}. Falling back to scan.", file=sys.stderr)
|
|
774
788
|
idx_obj = None
|
|
775
789
|
|
|
790
|
+
# PRELOAD: do this before any threading starts
|
|
791
|
+
if idx_obj is not None:
|
|
792
|
+
for L0 in range(index_L_min, index_L_max + 1): # 7..12 by default
|
|
793
|
+
for b_id in (0, 1, 2): # index built_for_k=2 => 3 blocks exist
|
|
794
|
+
idx_obj._load_Lb(L0, b_id)
|
|
795
|
+
|
|
776
796
|
hit_records = []
|
|
777
797
|
remaining = OrderedDict()
|
|
778
798
|
|
|
@@ -908,7 +928,7 @@ def classify_peptides(reference,
|
|
|
908
928
|
aa2nt_map_by_txid=canonical_aa2nt
|
|
909
929
|
)
|
|
910
930
|
|
|
911
|
-
peptides_all = read_peptide_fasta(peptide_fasta)
|
|
931
|
+
peptides_all = read_peptide_fasta(peptide_fasta, L_min=index_L_min, L_max=index_L_max)
|
|
912
932
|
|
|
913
933
|
def coord_resolver_cds(ref_txid, idx_obj, aa_pos, L, ref_peptide):
|
|
914
934
|
# idx_obj provides aa2nt mapping
|
|
@@ -1086,12 +1106,13 @@ def classify_peptides(reference,
|
|
|
1086
1106
|
frame = int(frame_str)
|
|
1087
1107
|
except ValueError:
|
|
1088
1108
|
return None, None, None, None
|
|
1109
|
+
|
|
1089
1110
|
nt0 = frame + aa_pos * 3
|
|
1090
1111
|
if tx_id not in transcriptome:
|
|
1091
1112
|
return None, None, None, None
|
|
1092
1113
|
if nt0 < 0 or nt0 >= len(transcriptome[tx_id]):
|
|
1093
1114
|
return None, None, None, None
|
|
1094
|
-
|
|
1115
|
+
|
|
1095
1116
|
if tx_id not in cds_bounds:
|
|
1096
1117
|
region = "lncRNA"
|
|
1097
1118
|
else:
|
|
@@ -1101,7 +1122,11 @@ def classify_peptides(reference,
|
|
|
1101
1122
|
elif nt0 >= cds_end:
|
|
1102
1123
|
region = "dORF"
|
|
1103
1124
|
else:
|
|
1104
|
-
|
|
1125
|
+
# inside CDS bounds: decide by frame
|
|
1126
|
+
if ((nt0 - cds_start) % 3) == 0:
|
|
1127
|
+
region = "CDS"
|
|
1128
|
+
else:
|
|
1129
|
+
region = "intORF"
|
|
1105
1130
|
|
|
1106
1131
|
return tx_id, nt0 + 1, region, ref_peptide
|
|
1107
1132
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: darkprofiler
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
|
|
5
5
|
Author-email: Hanjun Lee <hanjun@alum.mit.edu>
|
|
6
6
|
License: MIT
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|