darkprofiler 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {darkprofiler-0.2.2/src/darkprofiler.egg-info → darkprofiler-0.2.3}/PKG-INFO +2 -3
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/README.md +1 -2
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/pyproject.toml +1 -1
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler/__init__.py +1 -1
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler/cli.py +1 -1
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler/run.py +163 -31
- {darkprofiler-0.2.2 → darkprofiler-0.2.3/src/darkprofiler.egg-info}/PKG-INFO +2 -3
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/LICENSE.txt +0 -0
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/setup.cfg +0 -0
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/SOURCES.txt +0 -0
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/dependency_links.txt +0 -0
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/entry_points.txt +0 -0
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/requires.txt +0 -0
- {darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: darkprofiler
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
|
|
5
5
|
Author-email: Hanjun Lee <hanjun@alum.mit.edu>
|
|
6
6
|
License: MIT
|
|
@@ -244,7 +244,6 @@ The database contains translated and derived proteomes as FASTA files:
|
|
|
244
244
|
- `mutanome.fa`
|
|
245
245
|
- `mutatedCanonicalTranscriptome.fa`
|
|
246
246
|
- `mutatedAlternativeTranslatome.fa`
|
|
247
|
-
- `mutatedAlternativeORFeome.fa`
|
|
248
247
|
|
|
249
248
|
DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
|
|
250
249
|
for example:
|
|
@@ -361,7 +360,7 @@ classify_peptides(
|
|
|
361
360
|
- **ORF region labels**
|
|
362
361
|
For alternative ORF hits, DarkProfiler labels the peptide start as:
|
|
363
362
|
- `uORF` (upstream of CDS start)
|
|
364
|
-
- `intORF` (inside annotated CDS span)
|
|
363
|
+
- `intORF` (out-of-frame peptides from inside annotated CDS span)
|
|
365
364
|
- `dORF` (downstream of CDS end)
|
|
366
365
|
- `lncRNA` (no CDS annotation)
|
|
367
366
|
|
|
@@ -226,7 +226,6 @@ The database contains translated and derived proteomes as FASTA files:
|
|
|
226
226
|
- `mutanome.fa`
|
|
227
227
|
- `mutatedCanonicalTranscriptome.fa`
|
|
228
228
|
- `mutatedAlternativeTranslatome.fa`
|
|
229
|
-
- `mutatedAlternativeORFeome.fa`
|
|
230
229
|
|
|
231
230
|
DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
|
|
232
231
|
for example:
|
|
@@ -343,7 +342,7 @@ classify_peptides(
|
|
|
343
342
|
- **ORF region labels**
|
|
344
343
|
For alternative ORF hits, DarkProfiler labels the peptide start as:
|
|
345
344
|
- `uORF` (upstream of CDS start)
|
|
346
|
-
- `intORF` (inside annotated CDS span)
|
|
345
|
+
- `intORF` (out-of-frame peptides from inside annotated CDS span)
|
|
347
346
|
- `dORF` (downstream of CDS end)
|
|
348
347
|
- `lncRNA` (no CDS annotation)
|
|
349
348
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "darkprofiler"
|
|
7
|
-
version = "0.2.
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.7"
|
|
@@ -121,7 +121,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
121
121
|
"Optional path to existing database directory containing "
|
|
122
122
|
"canonicalProteome.fa, alternativeSplicing.fa, mutanome.fa, "
|
|
123
123
|
"mutatedCanonicalTranscriptome.fa, mutatedAlternativeTranslatome.fa, "
|
|
124
|
-
"
|
|
124
|
+
"and other index files"
|
|
125
125
|
),
|
|
126
126
|
)
|
|
127
127
|
p_run.add_argument(
|
|
@@ -481,7 +481,6 @@ def classify_peptides(reference,
|
|
|
481
481
|
|
|
482
482
|
required_db_files = [
|
|
483
483
|
"alternativeSplicing.fa",
|
|
484
|
-
"mutatedAlternativeORFeome.fa",
|
|
485
484
|
"canonicalProteome.fa",
|
|
486
485
|
"mutatedAlternativeTranslatome.fa",
|
|
487
486
|
"mutanome.fa",
|
|
@@ -1006,31 +1005,169 @@ def classify_peptides(reference,
|
|
|
1006
1005
|
|
|
1007
1006
|
report_step_done() # 5
|
|
1008
1007
|
|
|
1009
|
-
#
|
|
1010
|
-
# NOTE: keep your original SNV application as-is (omitted here for brevity),
|
|
1011
|
-
# but ensure translation uses translate_cds_with_map (ATG filter) and then build index.
|
|
1012
|
-
#
|
|
1013
|
-
# If you already have your SNV block, set:
|
|
1014
|
-
# mutated_canonical_transcripts = load_transcriptome(mutatedCanonicalTranscriptome.fa)
|
|
1015
|
-
# mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(mutated_canonical_transcripts, cds_map)
|
|
1016
|
-
# write mutanome.fa
|
|
1017
|
-
#
|
|
1008
|
+
# ---------- build mutanome.fa + mutatedCanonicalTranscriptome.fa ----------
|
|
1018
1009
|
mutated_canonical_tx_fa = os.path.join(database_dir, "mutatedCanonicalTranscriptome.fa")
|
|
1019
1010
|
mutanome_fa = os.path.join(database_dir, "mutanome.fa")
|
|
1020
1011
|
mutanome_idx_dir = os.path.join(database_dir, "mutanome.idx")
|
|
1021
1012
|
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1013
|
+
def _parse_gff_attributes(attr_str):
    """
    Parse the attribute column (column 9) of a GFF3 or GTF record into a dict.

    Two item syntaxes are accepted, separated by ``;``:

    * GFF3 style ``key=value``
    * GTF style ``key "value"``

    Surrounding whitespace and double quotes are removed from values.
    Items that carry no value (a bare token) are ignored. When a key occurs
    more than once, the last occurrence wins.

    Args:
        attr_str: Raw attribute string of one annotation line.

    Returns:
        dict mapping attribute name to its (unquoted) string value.
    """
    attrs = {}
    for item in attr_str.strip().split(";"):
        item = item.strip()
        if not item:
            continue

        key, sep, value = item.partition("=")
        # Only treat the item as GFF3 ``key=value`` when the text before "="
        # is a single token; otherwise the "=" belongs to a quoted GTF value
        # such as: note "a=b".
        if sep and len(key.split()) == 1:
            attrs[key.strip()] = value.strip().strip('"')
            continue

        # GTF style: split on the first run of whitespace only, so quoted
        # values that contain spaces are kept whole instead of being cut
        # down to their first word.
        parts = item.split(None, 1)
        if len(parts) == 2:
            attrs[parts[0]] = parts[1].strip().strip('"')
    return attrs
|
|
1027
|
+
|
|
1028
|
+
# Build transcript exon model from GFF (exon features only)
|
|
1029
|
+
transcript_exons = defaultdict(list) # tx_id -> list[(start,end)] 1-based closed
|
|
1030
|
+
transcript_strand = {} # tx_id -> '+'/'-'
|
|
1031
|
+
transcript_chrom = {} # tx_id -> chrom (normalized)
|
|
1032
|
+
|
|
1033
|
+
with open(gff_path) as fh:
|
|
1034
|
+
for line in fh:
|
|
1035
|
+
if not line.strip() or line.startswith("#"):
|
|
1036
|
+
continue
|
|
1037
|
+
fields = line.rstrip("\n").split("\t")
|
|
1038
|
+
if len(fields) < 9:
|
|
1039
|
+
continue
|
|
1040
|
+
chrom, source, feature, start, end, score, strand, frame, attrs_str = fields
|
|
1041
|
+
if feature.lower() != "exon":
|
|
1042
|
+
continue
|
|
1043
|
+
try:
|
|
1044
|
+
start_i = int(start)
|
|
1045
|
+
end_i = int(end)
|
|
1046
|
+
except ValueError:
|
|
1047
|
+
continue
|
|
1048
|
+
attrs = _parse_gff_attributes(attrs_str)
|
|
1049
|
+
tx_id = attrs.get("transcript_id") or attrs.get("transcriptId")
|
|
1050
|
+
if tx_id is None:
|
|
1051
|
+
continue
|
|
1052
|
+
tx_id = normalize_gff_tx_id(tx_id)
|
|
1053
|
+
transcript_exons[tx_id].append((start_i, end_i))
|
|
1054
|
+
transcript_strand[tx_id] = strand
|
|
1055
|
+
transcript_chrom[tx_id] = normalize_chrom(chrom)
|
|
1056
|
+
|
|
1057
|
+
# Sort exons ascending by genomic start
|
|
1058
|
+
for tx_id in list(transcript_exons.keys()):
|
|
1059
|
+
transcript_exons[tx_id].sort(key=lambda x: x[0])
|
|
1060
|
+
|
|
1061
|
+
# Exon caches for coordinate mapping
|
|
1062
|
+
exon_order_cache = {} # tx_id -> (exons_sorted, exons_desc)
|
|
1063
|
+
for tx_id, exons_sorted in transcript_exons.items():
|
|
1064
|
+
exon_order_cache[tx_id] = (exons_sorted, list(reversed(exons_sorted)))
|
|
1065
|
+
|
|
1066
|
+
# Index SNVs by chromosome for speed
|
|
1067
|
+
snvs_by_chrom = defaultdict(list) # chrom -> list[(pos, ref, alt)]
|
|
1068
|
+
for chrom, pos, ref, alt in snvs:
|
|
1069
|
+
snvs_by_chrom[chrom].append((int(pos), ref, alt))
|
|
1070
|
+
for chrom in snvs_by_chrom:
|
|
1071
|
+
snvs_by_chrom[chrom].sort(key=lambda x: x[0])
|
|
1072
|
+
|
|
1073
|
+
# Simple base complement
|
|
1074
|
+
# Complement lookup for the four DNA bases, in upper and lower case.
_complement = dict(zip("ATCGatcg", "TAGCtagc"))

def _complement_base(b):
    """Return the complement of base *b*; anything not in the table is returned unchanged."""
    if b in _complement:
        return _complement[b]
    return b
|
|
1077
|
+
|
|
1078
|
+
def _apply_snvs_to_one_transcript(tx_id):
    """
    Apply the SNVs that fall inside this transcript's exons to its sequence.

    Reads (never writes) the enclosing-scope tables ``transcriptome``,
    ``transcript_chrom``, ``transcript_strand``, ``exon_order_cache`` and
    ``snvs_by_chrom``, and calls ``_complement_base`` for minus-strand
    transcripts.

    A genomic position is converted to a 0-based transcript index by walking
    the exons in transcription order (ascending for "+", descending
    otherwise) and summing the lengths of the exons that precede it. An SNV
    is written only when the base currently at that index equals the
    (strand-adjusted) reference allele; a mismatching SNV is skipped.

    Args:
        tx_id: Transcript identifier.

    Returns:
        ``(tx_id, None)`` if the transcript is not in ``transcriptome``.
        ``(tx_id, original_sequence)`` if the transcript has no chromosome,
        its chromosome has no SNVs, or it has no exon model.
        ``(tx_id, mutated_sequence_string)`` otherwise.
    """
    # Local import keeps this helper self-contained.
    from bisect import bisect_left

    if tx_id not in transcriptome:
        return tx_id, None

    seq = transcriptome[tx_id]

    chrom = transcript_chrom.get(tx_id)
    if chrom is None or chrom not in snvs_by_chrom:
        return tx_id, seq

    if tx_id not in exon_order_cache:
        return tx_id, seq

    exons_sorted, exons_desc = exon_order_cache[tx_id]
    strand = transcript_strand.get(tx_id, "+")
    chrom_snvs = snvs_by_chrom[chrom]

    # SNVs outside [lowest exon start, highest exon end] can never land in an
    # exon, so only that window of the position-sorted SNV list is scanned.
    # Probing with 1-tuples compares positions only: (p,) sorts before any
    # (p, ref, alt).
    if exons_sorted:
        span_start = exons_sorted[0][0]
        span_end = max(e for _, e in exons_sorted)
        first = bisect_left(chrom_snvs, (span_start,))
        last = bisect_left(chrom_snvs, (span_end + 1,))
    else:
        first = last = 0

    # Built only after the early returns above, which never needed it.
    seq_list = list(seq)

    for pos, ref, alt in chrom_snvs[first:last]:
        tx_index = None
        offset = 0
        if strand == "+":
            for s, e in exons_sorted:
                if pos < s:
                    # Upstream of this exon: stop looking.
                    break
                if pos <= e:
                    tx_index = offset + (pos - s)
                    break
                offset += (e - s + 1)
        else:
            for s, e in exons_desc:
                if pos > e:
                    continue
                if pos >= s:
                    # Minus strand: index counts down from the exon end.
                    tx_index = offset + (e - pos)
                    break
                offset += (e - s + 1)

        if tx_index is None:
            continue

        expected_ref = ref.upper()
        alt_base = alt.upper()
        if strand != "+":
            # Transcript sequence is the reverse strand, so alleles are complemented.
            expected_ref = _complement_base(expected_ref)
            alt_base = _complement_base(alt_base)

        if 0 <= tx_index < len(seq_list):
            if expected_ref and seq_list[tx_index].upper() != expected_ref:
                continue
            seq_list[tx_index] = alt_base

    return tx_id, "".join(seq_list)
|
|
1141
|
+
|
|
1142
|
+
# Build mutated canonical transcriptome
|
|
1143
|
+
if build_database:
|
|
1144
|
+
canonical_tx_list = [tx for tx in canonical_tx_ids if tx in transcriptome]
|
|
1145
|
+
|
|
1146
|
+
mutated_canonical_tx_dict = {}
|
|
1147
|
+
if num_threads and num_threads > 1:
|
|
1148
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as ex:
|
|
1149
|
+
for tx_id, mutseq in ex.map(_apply_snvs_to_one_transcript, canonical_tx_list, chunksize=50):
|
|
1150
|
+
if mutseq is not None:
|
|
1151
|
+
mutated_canonical_tx_dict[tx_id] = mutseq
|
|
1152
|
+
else:
|
|
1153
|
+
for tx_id in canonical_tx_list:
|
|
1154
|
+
tx_id2, mutseq = _apply_snvs_to_one_transcript(tx_id)
|
|
1155
|
+
if mutseq is not None:
|
|
1156
|
+
mutated_canonical_tx_dict[tx_id2] = mutseq
|
|
1157
|
+
|
|
1158
|
+
with open(mutated_canonical_tx_fa, "w") as out:
|
|
1159
|
+
for tx_id, nt_seq in mutated_canonical_tx_dict.items():
|
|
1160
|
+
out.write(f">{tx_id}\n{nt_seq}\n")
|
|
1161
|
+
|
|
1162
|
+
mutated_canonical_transcripts = mutated_canonical_tx_dict
|
|
1027
1163
|
else:
|
|
1028
|
-
|
|
1029
|
-
mutated_canonical_transcripts = {} # placeholder
|
|
1030
|
-
# ---------- END: your existing SNV block should fill these ----------
|
|
1164
|
+
mutated_canonical_transcripts = load_transcriptome(mutated_canonical_tx_fa)
|
|
1031
1165
|
|
|
1032
|
-
# Translate mutanome
|
|
1033
|
-
mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(
|
|
1166
|
+
# Translate mutanome (CDS only, ATG filtered)
|
|
1167
|
+
mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(
|
|
1168
|
+
mutated_canonical_transcripts,
|
|
1169
|
+
cds_map
|
|
1170
|
+
)
|
|
1034
1171
|
|
|
1035
1172
|
if build_database:
|
|
1036
1173
|
with open(mutanome_fa, "w") as out:
|
|
@@ -1063,8 +1200,7 @@ def classify_peptides(reference,
|
|
|
1063
1200
|
|
|
1064
1201
|
# -------------------------- alternative ORFs (3 frames) ----------------------
|
|
1065
1202
|
alt_orf_translatome_fa = os.path.join(database_dir, "mutatedAlternativeTranslatome.fa")
|
|
1066
|
-
|
|
1067
|
-
alt_orf_idx_dir = os.path.join(database_dir, "mutatedAlternativeORFeome.idx")
|
|
1203
|
+
alt_orf_idx_dir = os.path.join(database_dir, "mutatedAlternativeTranslatome.idx")
|
|
1068
1204
|
|
|
1069
1205
|
if build_database:
|
|
1070
1206
|
alt_orf_records = {}
|
|
@@ -1082,14 +1218,11 @@ def classify_peptides(reference,
|
|
|
1082
1218
|
with open(alt_orf_translatome_fa, "w") as out:
|
|
1083
1219
|
for rid, aa_seq in alt_orf_records.items():
|
|
1084
1220
|
out.write(f">{rid}\n{aa_seq}\n")
|
|
1085
|
-
with open(alt_orf_orfeome_fa, "w") as out:
|
|
1086
|
-
for rid, aa_seq in alt_orf_records.items():
|
|
1087
|
-
out.write(f">{rid}\n{aa_seq}\n")
|
|
1088
1221
|
|
|
1089
|
-
# Build index for
|
|
1222
|
+
# Build index for alt ORF translatome (no aa2nt needed)
|
|
1090
1223
|
if build_fast_index:
|
|
1091
1224
|
build_proteome_index(
|
|
1092
|
-
|
|
1225
|
+
alt_orf_translatome_fa,
|
|
1093
1226
|
alt_orf_idx_dir,
|
|
1094
1227
|
L_min=index_L_min,
|
|
1095
1228
|
L_max=index_L_max,
|
|
@@ -1106,13 +1239,13 @@ def classify_peptides(reference,
|
|
|
1106
1239
|
frame = int(frame_str)
|
|
1107
1240
|
except ValueError:
|
|
1108
1241
|
return None, None, None, None
|
|
1109
|
-
|
|
1242
|
+
|
|
1110
1243
|
nt0 = frame + aa_pos * 3
|
|
1111
1244
|
if tx_id not in transcriptome:
|
|
1112
1245
|
return None, None, None, None
|
|
1113
1246
|
if nt0 < 0 or nt0 >= len(transcriptome[tx_id]):
|
|
1114
1247
|
return None, None, None, None
|
|
1115
|
-
|
|
1248
|
+
|
|
1116
1249
|
if tx_id not in cds_bounds:
|
|
1117
1250
|
region = "lncRNA"
|
|
1118
1251
|
else:
|
|
@@ -1122,7 +1255,6 @@ def classify_peptides(reference,
|
|
|
1122
1255
|
elif nt0 >= cds_end:
|
|
1123
1256
|
region = "dORF"
|
|
1124
1257
|
else:
|
|
1125
|
-
# inside CDS bounds: decide by frame
|
|
1126
1258
|
if ((nt0 - cds_start) % 3) == 0:
|
|
1127
1259
|
region = "CDS"
|
|
1128
1260
|
else:
|
|
@@ -1147,7 +1279,7 @@ def classify_peptides(reference,
|
|
|
1147
1279
|
# -------------------------- amino acid mismatch ------------------------------
|
|
1148
1280
|
mismatch_hit_records, peptides_remaining = classify_with_index(
|
|
1149
1281
|
peptides_remaining,
|
|
1150
|
-
|
|
1282
|
+
alt_orf_translatome_fa,
|
|
1151
1283
|
alt_orf_idx_dir,
|
|
1152
1284
|
coord_resolver=coord_resolver_altorf,
|
|
1153
1285
|
step_label="amino acid mismatch search",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: darkprofiler
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
|
|
5
5
|
Author-email: Hanjun Lee <hanjun@alum.mit.edu>
|
|
6
6
|
License: MIT
|
|
@@ -244,7 +244,6 @@ The database contains translated and derived proteomes as FASTA files:
|
|
|
244
244
|
- `mutanome.fa`
|
|
245
245
|
- `mutatedCanonicalTranscriptome.fa`
|
|
246
246
|
- `mutatedAlternativeTranslatome.fa`
|
|
247
|
-
- `mutatedAlternativeORFeome.fa`
|
|
248
247
|
|
|
249
248
|
DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
|
|
250
249
|
for example:
|
|
@@ -361,7 +360,7 @@ classify_peptides(
|
|
|
361
360
|
- **ORF region labels**
|
|
362
361
|
For alternative ORF hits, DarkProfiler labels the peptide start as:
|
|
363
362
|
- `uORF` (upstream of CDS start)
|
|
364
|
-
- `intORF` (inside annotated CDS span)
|
|
363
|
+
- `intORF` (out-of-frame peptides from inside annotated CDS span)
|
|
365
364
|
- `dORF` (downstream of CDS end)
|
|
366
365
|
- `lncRNA` (no CDS annotation)
|
|
367
366
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|