darkprofiler 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: darkprofiler
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
5
5
  Author-email: Hanjun Lee <hanjun@alum.mit.edu>
6
6
  License: MIT
@@ -244,7 +244,6 @@ The database contains translated and derived proteomes as FASTA files:
244
244
  - `mutanome.fa`
245
245
  - `mutatedCanonicalTranscriptome.fa`
246
246
  - `mutatedAlternativeTranslatome.fa`
247
- - `mutatedAlternativeORFeome.fa`
248
247
 
249
248
  DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
250
249
  for example:
@@ -361,7 +360,7 @@ classify_peptides(
361
360
  - **ORF region labels**
362
361
  For alternative ORF hits, DarkProfiler labels the peptide start as:
363
362
  - `uORF` (upstream of CDS start)
364
- - `intORF` (inside annotated CDS span)
363
+ - `intORF` (out-of-frame peptides from inside annotated CDS span)
365
364
  - `dORF` (downstream of CDS end)
366
365
  - `lncRNA` (no CDS annotation)
367
366
 
@@ -226,7 +226,6 @@ The database contains translated and derived proteomes as FASTA files:
226
226
  - `mutanome.fa`
227
227
  - `mutatedCanonicalTranscriptome.fa`
228
228
  - `mutatedAlternativeTranslatome.fa`
229
- - `mutatedAlternativeORFeome.fa`
230
229
 
231
230
  DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
232
231
  for example:
@@ -343,7 +342,7 @@ classify_peptides(
343
342
  - **ORF region labels**
344
343
  For alternative ORF hits, DarkProfiler labels the peptide start as:
345
344
  - `uORF` (upstream of CDS start)
346
- - `intORF` (inside annotated CDS span)
345
+ - `intORF` (out-of-frame peptides from inside annotated CDS span)
347
346
  - `dORF` (downstream of CDS end)
348
347
  - `lncRNA` (no CDS annotation)
349
348
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "darkprofiler"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.7"
@@ -2,5 +2,5 @@ from .run import classify_peptides
2
2
 
3
3
  __all__ = ["classify_peptides"]
4
4
 
5
- __version__ = "0.2.2"
5
+ __version__ = "0.2.3"
6
6
 
@@ -121,7 +121,7 @@ def build_parser() -> argparse.ArgumentParser:
121
121
  "Optional path to existing database directory containing "
122
122
  "canonicalProteome.fa, alternativeSplicing.fa, mutanome.fa, "
123
123
  "mutatedCanonicalTranscriptome.fa, mutatedAlternativeTranslatome.fa, "
124
- "mutatedAlternativeORFeome.fa."
124
+ "and other index files"
125
125
  ),
126
126
  )
127
127
  p_run.add_argument(
@@ -481,7 +481,6 @@ def classify_peptides(reference,
481
481
 
482
482
  required_db_files = [
483
483
  "alternativeSplicing.fa",
484
- "mutatedAlternativeORFeome.fa",
485
484
  "canonicalProteome.fa",
486
485
  "mutatedAlternativeTranslatome.fa",
487
486
  "mutanome.fa",
@@ -1006,31 +1005,169 @@ def classify_peptides(reference,
1006
1005
 
1007
1006
  report_step_done() # 5
1008
1007
 
1009
- # ----------------------------- mutated antigens (mutanome) -------------------
1010
- # NOTE: keep your original SNV application as-is (omitted here for brevity),
1011
- # but ensure translation uses translate_cds_with_map (ATG filter) and then build index.
1012
- #
1013
- # If you already have your SNV block, set:
1014
- # mutated_canonical_transcripts = load_transcriptome(mutatedCanonicalTranscriptome.fa)
1015
- # mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(mutated_canonical_transcripts, cds_map)
1016
- # write mutanome.fa
1017
- #
1008
+ # ---------- build mutanome.fa + mutatedCanonicalTranscriptome.fa ----------
1018
1009
  mutated_canonical_tx_fa = os.path.join(database_dir, "mutatedCanonicalTranscriptome.fa")
1019
1010
  mutanome_fa = os.path.join(database_dir, "mutanome.fa")
1020
1011
  mutanome_idx_dir = os.path.join(database_dir, "mutanome.idx")
1021
1012
 
1022
- # ---------- BEGIN: your existing SNV block should fill these ----------
1023
- # For runs reusing DB, mutated_canonical_tx_fa and mutanome_fa should already exist.
1024
- mutated_canonical_transcripts = {}
1025
- if os.path.exists(mutated_canonical_tx_fa):
1026
- mutated_canonical_transcripts = load_transcriptome(mutated_canonical_tx_fa)
1013
+ def _parse_gff_attributes(attr_str):
1014
+ attrs = {}
1015
+ for item in attr_str.strip().split(";"):
1016
+ item = item.strip()
1017
+ if not item:
1018
+ continue
1019
+ if "=" in item:
1020
+ k, v = item.split("=", 1)
1021
+ attrs[k] = v.strip().strip('"')
1022
+ else:
1023
+ parts = item.split()
1024
+ if len(parts) >= 2:
1025
+ attrs[parts[0]] = parts[1].strip('"')
1026
+ return attrs
1027
+
1028
+ # Build transcript exon model from GFF (exon features only)
1029
+ transcript_exons = defaultdict(list) # tx_id -> list[(start,end)] 1-based closed
1030
+ transcript_strand = {} # tx_id -> '+'/'-'
1031
+ transcript_chrom = {} # tx_id -> chrom (normalized)
1032
+
1033
+ with open(gff_path) as fh:
1034
+ for line in fh:
1035
+ if not line.strip() or line.startswith("#"):
1036
+ continue
1037
+ fields = line.rstrip("\n").split("\t")
1038
+ if len(fields) < 9:
1039
+ continue
1040
+ chrom, source, feature, start, end, score, strand, frame, attrs_str = fields
1041
+ if feature.lower() != "exon":
1042
+ continue
1043
+ try:
1044
+ start_i = int(start)
1045
+ end_i = int(end)
1046
+ except ValueError:
1047
+ continue
1048
+ attrs = _parse_gff_attributes(attrs_str)
1049
+ tx_id = attrs.get("transcript_id") or attrs.get("transcriptId")
1050
+ if tx_id is None:
1051
+ continue
1052
+ tx_id = normalize_gff_tx_id(tx_id)
1053
+ transcript_exons[tx_id].append((start_i, end_i))
1054
+ transcript_strand[tx_id] = strand
1055
+ transcript_chrom[tx_id] = normalize_chrom(chrom)
1056
+
1057
+ # Sort exons ascending by genomic start
1058
+ for tx_id in list(transcript_exons.keys()):
1059
+ transcript_exons[tx_id].sort(key=lambda x: x[0])
1060
+
1061
+ # Exon caches for coordinate mapping
1062
+ exon_order_cache = {} # tx_id -> (exons_sorted, exons_desc)
1063
+ for tx_id, exons_sorted in transcript_exons.items():
1064
+ exon_order_cache[tx_id] = (exons_sorted, list(reversed(exons_sorted)))
1065
+
1066
+ # Index SNVs by chromosome for speed
1067
+ snvs_by_chrom = defaultdict(list) # chrom -> list[(pos, ref, alt)]
1068
+ for chrom, pos, ref, alt in snvs:
1069
+ snvs_by_chrom[chrom].append((int(pos), ref, alt))
1070
+ for chrom in snvs_by_chrom:
1071
+ snvs_by_chrom[chrom].sort(key=lambda x: x[0])
1072
+
1073
+ # Simple base complement
1074
+ _complement = {"A":"T","T":"A","C":"G","G":"C","a":"t","t":"a","c":"g","g":"c"}
1075
+ def _complement_base(b):
1076
+ return _complement.get(b, b)
1077
+
1078
+ def _apply_snvs_to_one_transcript(tx_id):
1079
+ """
1080
+ Returns: (tx_id, mutated_seq_string) OR (tx_id, None) if transcript not present.
1081
+ If no SNVs (or no mapping), returns original transcript sequence.
1082
+ """
1083
+ if tx_id not in transcriptome:
1084
+ return tx_id, None
1085
+
1086
+ seq = transcriptome[tx_id]
1087
+ seq_list = list(seq)
1088
+
1089
+ chrom = transcript_chrom.get(tx_id)
1090
+ if chrom is None or chrom not in snvs_by_chrom:
1091
+ return tx_id, seq
1092
+
1093
+ if tx_id not in exon_order_cache:
1094
+ return tx_id, seq
1095
+
1096
+ exons_sorted, exons_desc = exon_order_cache[tx_id]
1097
+ strand = transcript_strand.get(tx_id, "+")
1098
+
1099
+ for pos, ref, alt in snvs_by_chrom[chrom]:
1100
+ if strand == "+":
1101
+ offset = 0
1102
+ within = False
1103
+ for s, e in exons_sorted:
1104
+ if pos < s:
1105
+ break
1106
+ if pos > e:
1107
+ offset += (e - s + 1)
1108
+ else:
1109
+ offset += (pos - s)
1110
+ within = True
1111
+ break
1112
+ if not within:
1113
+ continue
1114
+ tx_index = offset
1115
+ expected_ref = ref.upper()
1116
+ alt_base = alt.upper()
1117
+ else:
1118
+ offset = 0
1119
+ within = False
1120
+ for s, e in exons_desc:
1121
+ if pos > e:
1122
+ continue
1123
+ if pos < s:
1124
+ offset += (e - s + 1)
1125
+ else:
1126
+ offset += (e - pos)
1127
+ within = True
1128
+ break
1129
+ if not within:
1130
+ continue
1131
+ tx_index = offset
1132
+ expected_ref = _complement_base(ref.upper())
1133
+ alt_base = _complement_base(alt.upper())
1134
+
1135
+ if 0 <= tx_index < len(seq_list):
1136
+ if expected_ref and seq_list[tx_index].upper() != expected_ref:
1137
+ continue
1138
+ seq_list[tx_index] = alt_base
1139
+
1140
+ return tx_id, "".join(seq_list)
1141
+
1142
+ # Build mutated canonical transcriptome
1143
+ if build_database:
1144
+ canonical_tx_list = [tx for tx in canonical_tx_ids if tx in transcriptome]
1145
+
1146
+ mutated_canonical_tx_dict = {}
1147
+ if num_threads and num_threads > 1:
1148
+ with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as ex:
1149
+ for tx_id, mutseq in ex.map(_apply_snvs_to_one_transcript, canonical_tx_list, chunksize=50):
1150
+ if mutseq is not None:
1151
+ mutated_canonical_tx_dict[tx_id] = mutseq
1152
+ else:
1153
+ for tx_id in canonical_tx_list:
1154
+ tx_id2, mutseq = _apply_snvs_to_one_transcript(tx_id)
1155
+ if mutseq is not None:
1156
+ mutated_canonical_tx_dict[tx_id2] = mutseq
1157
+
1158
+ with open(mutated_canonical_tx_fa, "w") as out:
1159
+ for tx_id, nt_seq in mutated_canonical_tx_dict.items():
1160
+ out.write(f">{tx_id}\n{nt_seq}\n")
1161
+
1162
+ mutated_canonical_transcripts = mutated_canonical_tx_dict
1027
1163
  else:
1028
- # If you keep your original code, it will write this file when build_database=True.
1029
- mutated_canonical_transcripts = {} # placeholder
1030
- # ---------- END: your existing SNV block should fill these ----------
1164
+ mutated_canonical_transcripts = load_transcriptome(mutated_canonical_tx_fa)
1031
1165
 
1032
- # Translate mutanome with ATG filter + mapping (even if file was precomputed)
1033
- mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(mutated_canonical_transcripts, cds_map)
1166
+ # Translate mutanome (CDS only, ATG filtered)
1167
+ mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(
1168
+ mutated_canonical_transcripts,
1169
+ cds_map
1170
+ )
1034
1171
 
1035
1172
  if build_database:
1036
1173
  with open(mutanome_fa, "w") as out:
@@ -1063,8 +1200,7 @@ def classify_peptides(reference,
1063
1200
 
1064
1201
  # -------------------------- alternative ORFs (3 frames) ----------------------
1065
1202
  alt_orf_translatome_fa = os.path.join(database_dir, "mutatedAlternativeTranslatome.fa")
1066
- alt_orf_orfeome_fa = os.path.join(database_dir, "mutatedAlternativeORFeome.fa")
1067
- alt_orf_idx_dir = os.path.join(database_dir, "mutatedAlternativeORFeome.idx")
1203
+ alt_orf_idx_dir = os.path.join(database_dir, "mutatedAlternativeTranslatome.idx")
1068
1204
 
1069
1205
  if build_database:
1070
1206
  alt_orf_records = {}
@@ -1082,14 +1218,11 @@ def classify_peptides(reference,
1082
1218
  with open(alt_orf_translatome_fa, "w") as out:
1083
1219
  for rid, aa_seq in alt_orf_records.items():
1084
1220
  out.write(f">{rid}\n{aa_seq}\n")
1085
- with open(alt_orf_orfeome_fa, "w") as out:
1086
- for rid, aa_seq in alt_orf_records.items():
1087
- out.write(f">{rid}\n{aa_seq}\n")
1088
1221
 
1089
- # Build index for ORFeome (no aa2nt needed)
1222
+ # Build index for alt ORF translatome (no aa2nt needed)
1090
1223
  if build_fast_index:
1091
1224
  build_proteome_index(
1092
- alt_orf_orfeome_fa,
1225
+ alt_orf_translatome_fa,
1093
1226
  alt_orf_idx_dir,
1094
1227
  L_min=index_L_min,
1095
1228
  L_max=index_L_max,
@@ -1106,13 +1239,13 @@ def classify_peptides(reference,
1106
1239
  frame = int(frame_str)
1107
1240
  except ValueError:
1108
1241
  return None, None, None, None
1109
-
1242
+
1110
1243
  nt0 = frame + aa_pos * 3
1111
1244
  if tx_id not in transcriptome:
1112
1245
  return None, None, None, None
1113
1246
  if nt0 < 0 or nt0 >= len(transcriptome[tx_id]):
1114
1247
  return None, None, None, None
1115
-
1248
+
1116
1249
  if tx_id not in cds_bounds:
1117
1250
  region = "lncRNA"
1118
1251
  else:
@@ -1122,7 +1255,6 @@ def classify_peptides(reference,
1122
1255
  elif nt0 >= cds_end:
1123
1256
  region = "dORF"
1124
1257
  else:
1125
- # inside CDS bounds: decide by frame
1126
1258
  if ((nt0 - cds_start) % 3) == 0:
1127
1259
  region = "CDS"
1128
1260
  else:
@@ -1147,7 +1279,7 @@ def classify_peptides(reference,
1147
1279
  # -------------------------- amino acid mismatch ------------------------------
1148
1280
  mismatch_hit_records, peptides_remaining = classify_with_index(
1149
1281
  peptides_remaining,
1150
- alt_orf_orfeome_fa,
1282
+ alt_orf_translatome_fa,
1151
1283
  alt_orf_idx_dir,
1152
1284
  coord_resolver=coord_resolver_altorf,
1153
1285
  step_label="amino acid mismatch search",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: darkprofiler
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
5
5
  Author-email: Hanjun Lee <hanjun@alum.mit.edu>
6
6
  License: MIT
@@ -244,7 +244,6 @@ The database contains translated and derived proteomes as FASTA files:
244
244
  - `mutanome.fa`
245
245
  - `mutatedCanonicalTranscriptome.fa`
246
246
  - `mutatedAlternativeTranslatome.fa`
247
- - `mutatedAlternativeORFeome.fa`
248
247
 
249
248
  DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
250
249
  for example:
@@ -361,7 +360,7 @@ classify_peptides(
361
360
  - **ORF region labels**
362
361
  For alternative ORF hits, DarkProfiler labels the peptide start as:
363
362
  - `uORF` (upstream of CDS start)
364
- - `intORF` (inside annotated CDS span)
363
+ - `intORF` (out-of-frame peptides from inside annotated CDS span)
365
364
  - `dORF` (downstream of CDS end)
366
365
  - `lncRNA` (no CDS annotation)
367
366
 
File without changes
File without changes