darkprofiler 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ from .run import classify_peptides
2
+
3
+ __all__ = ["classify_peptides"]
4
+
5
+ __version__ = "0.1.0"
6
+
darkprofiler/cli.py ADDED
@@ -0,0 +1,142 @@
1
+ import argparse
2
+ import os
3
+ import sys
4
+ import zipfile
5
+ from pathlib import Path
6
+ from urllib.request import urlopen
7
+
8
+ from .run import classify_peptides
9
+
10
+ SUPPORTED_REFERENCES = ("hg19", "hg38", "mm10", "mm39")
11
+
12
+ # Reference Zip Files from Elledge Lab
13
+ URL_PREFIX = "https://elledge.hms.harvard.edu/wp-content/uploads/2025/12/"
14
+
15
+
16
def _get_package_root() -> Path:
    """Return the absolute, symlink-resolved directory containing this module."""
    module_file = Path(__file__)
    return module_file.resolve().parent
19
+
20
def _download_reference(reference: str) -> None:
    """
    Download and unpack the reference bundle for ``reference``.

    The archive ``darkprofiler_<reference>.zip`` is fetched from URL_PREFIX,
    saved as ``genome/<reference>.zip`` inside the package directory and then
    extracted into the package directory itself. Progress messages go to
    stderr.

    Parameters
    ----------
    reference : str
        Reference name. It is lower-cased and must then be one of
        SUPPORTED_REFERENCES.

    Raises
    ------
    SystemExit
        If the reference is unsupported, the genome directory cannot be
        created, or the download or the extraction fails.
    """
    import shutil  # local import: only this function needs it

    reference = reference.lower()
    if reference not in SUPPORTED_REFERENCES:
        raise SystemExit(
            f"Unsupported reference '{reference}'. Must be one of: "
            f"{', '.join(SUPPORTED_REFERENCES)}"
        )

    pkg_root = _get_package_root()
    genome_dir = pkg_root / "genome"
    try:
        genome_dir.mkdir(exist_ok=True)
    except OSError as e:
        # e.g. a read-only installation directory: report it like every other
        # failure in this function instead of surfacing a raw traceback
        raise SystemExit(f"Failed to create {genome_dir}: {e}") from e

    # URL like: {URL_PREFIX}/darkprofiler_hg38.zip
    url = f"{URL_PREFIX.rstrip('/')}/darkprofiler_{reference}.zip"
    zip_path = genome_dir / f"{reference}.zip"

    print(f"[darkprofiler] Downloading {url} ...", file=sys.stderr)
    try:
        with urlopen(url) as resp, open(zip_path, "wb") as out_fh:
            # Stream to disk in 8 KiB chunks rather than reading it all at once
            shutil.copyfileobj(resp, out_fh, 8192)
    except Exception as e:
        # Do not leave a truncated archive behind
        if zip_path.exists():
            zip_path.unlink()
        raise SystemExit(f"Failed to download {url}: {e}") from e

    print(f"[darkprofiler] Extracting to {pkg_root} ...", file=sys.stderr)
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(path=pkg_root)
    except Exception as e:
        raise SystemExit(f"Failed to extract {zip_path}: {e}") from e

    print(
        f"[darkprofiler] Finished. Reference '{reference}' is now available.",
        file=sys.stderr,
    )
62
+
63
def cmd_download(args: argparse.Namespace) -> None:
    """Handle the ``download`` sub-command by fetching ``args.reference``."""
    requested = args.reference
    _download_reference(requested)
65
+
66
+
67
def cmd_run(args: argparse.Namespace) -> None:
    """Handle the ``run`` sub-command by forwarding the parsed options to classify_peptides."""
    options = {
        "reference": args.reference,
        "peptide_fasta": args.peptide_fasta,
        "output_dir": args.output_dir,
        "vcf_path": args.vcf_path,
        "database_path": args.database_path,
        "num_threads": args.num_threads,
    }
    classify_peptides(**options)
76
+
77
+
78
def build_parser() -> argparse.ArgumentParser:
    """
    Build the ``darkprofiler`` command-line parser.

    Two sub-commands are registered, and one of them is required:
      * ``download`` - positional ``reference``; dispatches to cmd_download.
      * ``run`` - positionals ``reference``, ``peptide_fasta``, ``output_dir``
        plus ``--vcf-path``, ``--database-path`` and ``--num-threads``;
        dispatches to cmd_run.

    The handler for the chosen sub-command is stored on the namespace as ``func``.
    """
    description_text = (
        "DarkProfiler: classify peptides into canonical, alternative, "
        "mutant, and dark proteome categories."
    )
    database_help = (
        "Optional path to existing database directory containing "
        "canonicalProteome.fa, alternativeSplicing.fa, mutanome.fa, "
        "mutatedCanonicalTranscriptome.fa, mutatedAlternativeTranslatome.fa, "
        "mutatedAlternativeORFeome.fa."
    )

    root = argparse.ArgumentParser(prog="darkprofiler", description=description_text)
    commands = root.add_subparsers(dest="command", required=True)

    # ---------------- download ----------------
    download_cmd = commands.add_parser(
        "download",
        help="Download a reference genome bundle (hg19/hg38/mm10/mm39).",
    )
    download_cmd.add_argument(
        "reference",
        choices=SUPPORTED_REFERENCES,
        help="Reference assembly version to download.",
    )
    download_cmd.set_defaults(func=cmd_download)

    # ---------------- run ----------------
    run_cmd = commands.add_parser(
        "run",
        help="Run DarkProfiler classification pipeline.",
    )
    run_cmd.add_argument(
        "reference",
        choices=SUPPORTED_REFERENCES,
        help="Reference assembly version to use (must be downloaded first).",
    )
    run_cmd.add_argument("peptide_fasta", help="Path to peptide FASTA file.")
    run_cmd.add_argument("output_dir", help="Output directory.")
    run_cmd.add_argument(
        "--vcf-path",
        default=None,
        help="Optional path to VCF or VCF.GZ file with SNVs.",
    )
    run_cmd.add_argument("--database-path", default=None, help=database_help)
    run_cmd.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Threads for amino acid misincorporation search.",
    )
    run_cmd.set_defaults(func=cmd_run)

    return root
136
+
137
+
138
def main(argv=None) -> None:
    """
    Command-line entry point.

    Parses ``argv`` (argparse falls back to ``sys.argv[1:]`` when it is None)
    and calls the handler that the selected sub-command stored as ``func``.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    handler(namespace)
142
+
darkprofiler/run.py ADDED
@@ -0,0 +1,1057 @@
1
+ import os
2
+ import sys
3
+ import math
4
+ import gzip
5
+ import shutil
6
+ from collections import defaultdict, OrderedDict
7
+
8
+ import concurrent.futures
9
+ from Bio import SeqIO
10
+ from Bio.Seq import Seq
11
+
12
+ import matplotlib
13
+ matplotlib.use("Agg")
14
+ import matplotlib.pyplot as plt
15
+ from matplotlib.patches import Patch
16
+
17
+ def classify_peptides(reference,
18
+ peptide_fasta,
19
+ output_dir,
20
+ vcf_path=None,
21
+ database_path=None,
22
+ num_threads=1):
23
+ """
24
+ Classify peptides into categories based on canonical proteome, alternative splicing,
25
+ mutated antigens (mutanome), alternative ORFs, amino acid misincorporations, and unaligned.
26
+
27
+ Required arguments
28
+ ------------------
29
+ reference : str
30
+ Reference name (hg19, hg38, mm10, mm39). Case-insensitive.
31
+ peptide_fasta : str
32
+ Path to peptide FASTA file.
33
+ output_dir : str
34
+ Output directory path.
35
+
36
+ Optional arguments
37
+ ------------------
38
+ vcf_path : str or None
39
+ Path to VCF or VCF.GZ file. If None or file does not exist, no SNVs are used.
40
+ database_path : str or None
41
+ Path to a directory containing precomputed database FASTA files
42
+ (canonicalProteome.fa, alternativeSplicing.fa, mutanome.fa,
43
+ mutatedCanonicalTranscriptome.fa, mutatedAlternativeTranslatome.fa,
44
+ mutatedAlternativeORFeome.fa). If None, a new database directory
45
+ called "database" will be created under output_dir and populated.
46
+ If the provided directory is invalid or missing any required file,
47
+ it is ignored and a new database is built from scratch.
48
+ num_threads : int
49
+ Number of threads to use for amino acid misincorporation search (step 7).
50
+ If <= 1, runs single-threaded.
51
+ """
52
+
53
+ # -------------------------- simple step progress bar --------------------------
54
+ steps = [
55
+ "Filter VCF to exome",
56
+ "Setup and load transcriptome/CDS/knownCanonical",
57
+ "Build canonical / non-canonical transcript sets",
58
+ "Generate canonical proteome and classify canonical peptides",
59
+ "Generate alternative splicing proteome and classify peptides",
60
+ "Apply SNVs, generate mutanome and classify neoantigens",
61
+ "Generate alternative ORFs and classify peptides",
62
+ "Identify amino acid misincorporations",
63
+ "Write unaligned peptides and pie chart",
64
+ "Finalize",
65
+ ]
66
+ total_steps = len(steps)
67
+ current_step = 0
68
+
69
+ def report_step_done():
70
+ # use nonlocal to modify current_step inside nested function
71
+ nonlocal current_step
72
+ current_step += 1
73
+ bar_len = 40
74
+ filled = int(bar_len * float(current_step) / float(total_steps))
75
+ bar = "#" * filled + "-" * (bar_len - filled)
76
+ msg = "[{bar}] {i}/{total} - {desc}".format(
77
+ bar=bar,
78
+ i=current_step,
79
+ total=total_steps,
80
+ desc=steps[current_step - 1],
81
+ )
82
+ print(msg, file=sys.stderr)
83
+ sys.stderr.flush()
84
+
85
+ # -------------------------- helpers / normalizers ----------------------------
86
+
87
def safe_translate_nt(nt_seq):
    """
    Translate a nucleotide sequence without tripping over partial codons.

    The input is stringified and upper-cased, then any trailing 1 or 2
    nucleotides beyond a multiple of three are dropped. Returns the protein
    as a string; returns "" for None, for empty input, and when fewer than
    three nucleotides remain. Translation does not stop at stop codons
    (``to_stop=False``).
    """
    if nt_seq is None:
        return ""
    bases = str(nt_seq).upper()
    # length of the longest prefix made of whole codons
    usable = len(bases) - len(bases) % 3
    if usable < 3:
        return ""
    protein = Seq(bases[:usable]).translate(to_stop=False)
    return str(protein)
104
+
105
def normalize_chrom(chrom):
    """
    Return the chromosome name as a string without a leading 'chr'.

    Only a lower-case 'chr' prefix is removed, e.g. 'chr1' -> '1' and
    '1' -> '1'. Non-string input is stringified first.
    """
    name = str(chrom)
    return name[3:] if name.startswith("chr") else name
114
+
115
def normalize_gff_tx_id(tx_id):
    """
    Reduce a GFF transcript ID to the form used in the FASTA headers.

    Keeps the first whitespace-separated token and, if it contains a colon,
    only the part after the first colon.
    Example: 'transcript:ENST00000335137.4' -> 'ENST00000335137.4'
    """
    token = tx_id.split()[0]
    before, colon, after = token.partition(":")
    return after if colon else before
124
+
125
+ # ----------------------------- basic paths & setup -----------------------------
126
+ reference = str(reference).lower()
127
+ if reference not in ("hg19", "hg38", "mm10", "mm39"):
128
+ raise ValueError("Unsupported reference: {} (expected hg19/hg38/mm10/mm39)".format(reference))
129
+
130
+ output_dir = os.path.abspath(output_dir)
131
+ if not os.path.exists(output_dir):
132
+ os.makedirs(output_dir)
133
+
134
+ # Decide on database directory and whether to build it
135
+ required_db_files = [
136
+ "alternativeSplicing.fa",
137
+ "mutatedAlternativeORFeome.fa",
138
+ "canonicalProteome.fa",
139
+ "mutatedAlternativeTranslatome.fa",
140
+ "mutanome.fa",
141
+ "mutatedCanonicalTranscriptome.fa",
142
+ ]
143
+
144
+ build_database = True
145
+
146
+ if database_path is not None:
147
+ candidate = os.path.abspath(database_path)
148
+ if os.path.isdir(candidate):
149
+ missing = [f for f in required_db_files if not os.path.exists(os.path.join(candidate, f))]
150
+ if not missing:
151
+ database_dir = candidate
152
+ build_database = False
153
+ else:
154
+ print(
155
+ "[WARN] Provided database_path '{}' is missing required files: {}. "
156
+ "Ignoring it and rebuilding database from scratch.".format(
157
+ candidate, ", ".join(missing)
158
+ ),
159
+ file=sys.stderr,
160
+ )
161
+ database_dir = os.path.join(output_dir, "database")
162
+ os.makedirs(database_dir, exist_ok=True)
163
+ else:
164
+ print(
165
+ "[WARN] Provided database_path '{}' is not an existing directory. "
166
+ "Ignoring it and rebuilding database from scratch.".format(candidate),
167
+ file=sys.stderr,
168
+ )
169
+ database_dir = os.path.join(output_dir, "database")
170
+ os.makedirs(database_dir, exist_ok=True)
171
+ else:
172
+ database_dir = os.path.join(output_dir, "database")
173
+ if not os.path.exists(database_dir):
174
+ os.makedirs(database_dir)
175
+
176
+ # Resolve genome directory relative to this file so installed package works
177
+ here = os.path.dirname(os.path.abspath(__file__))
178
+ genome_root = os.path.join(here, "genome", reference)
179
+
180
+ # helper to find a single file by glob-like pattern
181
def find_single_file(prefix, suffix):
    """
    Return the path of the file in genome_root whose name starts with
    ``prefix`` and ends with ``suffix``.

    Examples:
      prefix="gencode", suffix=".{}.gff".format(reference)
      prefix="knownCanonical", suffix=".{}.list".format(reference)

    When several names qualify, the lexicographically smallest path is
    returned so the choice is deterministic. Raises IOError when nothing
    matches.
    """
    hits = [
        os.path.join(genome_root, entry)
        for entry in os.listdir(genome_root)
        if entry.startswith(prefix) and entry.endswith(suffix)
    ]
    if not hits:
        raise IOError("Could not find file {}*{} in {}".format(prefix, suffix, genome_root))
    # smallest path == first element after sorting
    return min(hits)
198
+
199
+ # Reference files expected inside genome_root
200
+ transcriptome_fa = os.path.join(genome_root, "transcriptome.{}.fa".format(reference))
201
+ cds_bed = os.path.join(genome_root, "transcriptome.{}.cds.bed".format(reference))
202
+ known_canonical_list = os.path.join(genome_root, "knownCanonical.{}.list".format(reference))
203
+ if not os.path.exists(known_canonical_list):
204
+ # allow versioned variants like knownCanonical.vX.hg38.list
205
+ known_canonical_list = find_single_file("knownCanonical", ".{}.list".format(reference))
206
+
207
+ gff_path = find_single_file("gencode", ".{}.gff".format(reference))
208
+
209
+ # ----------------------------- tiny helpers -----------------------------------
210
+
211
def read_canonical_ids(path):
    """
    Return the set of IDs listed in ``path``.

    Each non-blank line contributes its first whitespace-separated token;
    blank and whitespace-only lines are ignored.
    """
    with open(path) as handle:
        tokenized = (row.split() for row in handle)
        return {tokens[0] for tokens in tokenized if tokens}
220
+
221
+ canonical_ids = read_canonical_ids(known_canonical_list)
222
+
223
def fasta_id_core(rec_id):
    """
    Return the unversioned core of a FASTA record ID.

    Takes the first whitespace-separated token and keeps everything before
    its first '.', which drops version suffixes such as '.1' or '.2'.
    """
    first_token = rec_id.split()[0]
    stem, _, _ = first_token.partition(".")
    return stem
229
+
230
def read_peptide_fasta(path):
    """
    Read peptides from a FASTA file into an OrderedDict of record id -> sequence.

    Sequences are stripped and upper-cased; records whose sequence is empty
    are skipped. File order is kept, and a later record with the same id
    replaces the earlier one.
    """
    by_id = OrderedDict()
    for record in SeqIO.parse(path, "fasta"):
        residues = str(record.seq).strip().upper()
        if residues:
            by_id[record.id] = residues
    return by_id
238
+
239
def write_fasta_single_line(records, path):
    """
    Write records to ``path`` as FASTA with each sequence on a single line.

    records may be:
      - an iterable of (id, value) pairs
      - a dict mapping id -> value

    Each value is either a sequence or a 2-tuple (seq, source_id), e.g. a
    sequence plus its transcript/protein ID. For a 2-tuple the header becomes:
      >peptideID | sourceID
    otherwise the header is just the id. Sequences are stringified and stripped.
    """
    with open(path, "w") as out:
        pairs = records.items() if isinstance(records, dict) else records
        for rid, val in pairs:
            has_source = isinstance(val, tuple) and len(val) == 2
            if has_source:
                seq, source_id = val
                header_id = f"{rid} | {source_id}"
            else:
                seq, header_id = val, rid
            out.write(f">{header_id}\n{str(seq).strip()}\n")
263
+
264
def load_transcriptome(path):
    """
    Load a transcript FASTA into a dict.

    Keys are the first whitespace-separated token of each record id; values
    are the upper-cased nucleotide sequences as str. A repeated id keeps the
    last record read.
    """
    return {
        record.id.split()[0]: str(record.seq).upper()
        for record in SeqIO.parse(path, "fasta")
    }
273
+
274
def load_cds_bed(path):
    """
    Read CDS segments from a tab-separated file.

    Column 1 is the transcript id, columns 2 and 3 are start and end.
    Blank lines, lines starting with '#', lines with fewer than three
    columns and lines whose coordinates are not integers are skipped.

    Returns a defaultdict: transcript_id -> list of (start, end) sorted by
    start, where start is the file's start minus 1 and end is taken as-is.
    NOTE(review): subtracting 1 yields 0-based half-open slices only if the
    file's start column is 1-based; confirm against the reference bundle.
    """
    segments = defaultdict(list)
    with open(path) as handle:
        for raw in handle:
            if raw.startswith("#") or not raw.strip():
                continue
            cols = raw.rstrip("\n").split("\t")
            if len(cols) < 3:
                continue
            try:
                begin, finish = int(cols[1]) - 1, int(cols[2])
            except ValueError:
                continue
            segments[cols[0]].append((begin, finish))

    # order each transcript's segments by start coordinate
    for spans in segments.values():
        spans.sort(key=lambda span: span[0])
    return segments
298
+
299
def translate_cds(transcripts, cds_map, tx_ids_subset=None):
    """
    Translate the CDS of each transcript into a protein via safe_translate_nt.

    transcripts: dict transcript_id -> nt sequence
    cds_map: dict transcript_id -> list of (start, end) slice bounds
    tx_ids_subset: optional collection of transcript ids to restrict to

    Segments that fall outside the transcript or are empty/inverted are
    dropped; the remaining slices are concatenated in list order and
    translated. Transcripts without CDS entries, without any valid segment,
    or with an empty translation are left out.

    Returns dict: transcript_id -> aa sequence
    """
    proteins = {}
    for tx_id, nt in transcripts.items():
        selected = tx_ids_subset is None or tx_id in tx_ids_subset
        if not selected or tx_id not in cds_map:
            continue

        tx_len = len(nt)
        pieces = [nt[a:b] for a, b in cds_map[tx_id] if 0 <= a < b <= tx_len]
        if not pieces:
            continue

        aa_seq = safe_translate_nt("".join(pieces))
        if aa_seq:
            proteins[tx_id] = aa_seq
    return proteins
326
+
327
def find_exact_matches(peptides, proteome_fasta_path, step_label=None):
    """
    Split peptides into those occurring verbatim in a proteome and the rest.

    peptides: dict pep_id -> seq
    proteome_fasta_path: FASTA with protein sequences
    step_label: when given, a progress line is printed to stderr after every
        100 peptides

    Return: (matches_dict, remaining_peptides_dict)
      matches_dict: pep_id -> (pep_seq, ref_entry_id), where ref_entry_id is
          the id of the first protein (in file order) containing the peptide
      remaining_peptides_dict: pep_id -> pep_seq
    """
    # (normalized_id, sequence) per protein; normalized_id is the part of the
    # first header token that precedes the first '_'
    entries = [
        (rec.id.split()[0].split("_", 1)[0], str(rec.seq).upper())
        for rec in SeqIO.parse(proteome_fasta_path, "fasta")
    ]

    hits = OrderedDict()
    misses = OrderedDict()
    total = float(len(peptides)) if peptides else 1.0

    for count, (pid, pep_seq) in enumerate(peptides.items(), start=1):
        source_id = next(
            (prot_id for prot_id, prot_seq in entries if pep_seq in prot_seq),
            None,
        )
        if source_id is None:
            misses[pid] = pep_seq
        else:
            hits[pid] = (pep_seq, source_id)

        if step_label is None or count % 100 != 0:
            continue
        frac = count / total
        width = 30
        done_cells = int(width * frac)
        line = "[{bar}] {done}/{tot} peptides - {label}".format(
            bar="#" * done_cells + "-" * (width - done_cells),
            done=count,
            tot=int(total),
            label=step_label,
        )
        print(line, file=sys.stderr)
        sys.stderr.flush()

    return hits, misses
379
+
380
def hamming_distance(s1, s2):
    """
    Count mismatching positions between two equal-length sequences, capped at 2.

    Returns None when the lengths differ. Counting stops as soon as a second
    mismatch is seen, so the result is 0, 1 or 2 (2 meaning "two or more").
    """
    if len(s1) != len(s2):
        return None
    mismatches = 0
    for left, right in zip(s1, s2):
        if left == right:
            continue
        mismatches += 1
        if mismatches > 1:
            # callers only care about <= 1, so stop early
            return mismatches
    return mismatches
390
+
391
def find_hamming_leq1_matches(peptides,
                              proteome_fasta_path,
                              step_label=None,
                              num_threads=1):
    """
    Split peptides into those within Hamming distance <= 1 of some protein
    substring and the rest.

    peptides: dict pep_id -> seq
    proteome_fasta_path: FASTA with protein sequences
    step_label: when given, a progress line is printed to stderr after every
        100 processed peptides
    num_threads: worker threads; None or <= 1 runs in the calling thread

    Return: (matches_dict, remaining_peptides_dict)
      matches_dict: pep_id -> (pep_seq, ref_entry_id) for the first protein
          (in file order) containing the peptide or a one-substitution variant
      remaining_peptides_dict: pep_id -> pep_seq

    Strategy:
      - For each peptide, generate every sequence at Hamming distance <= 1
        (including the peptide itself).
      - Use substring search (``in``) against each protein sequence.
      - Optionally spread peptides over a thread pool; results are collected
        in input order either way.

    NOTE(review): the per-peptide work is Python-level string searching, so
    threads may give little speed-up under the GIL; confirm by measuring.
    """
    # Load all proteins once as (normalized_id, seq); normalized_id is the
    # part of the first header token before the first '_'
    proteins = []
    for rec in SeqIO.parse(proteome_fasta_path, "fasta"):
        full_id = rec.id.split()[0]
        proteins.append((full_id.split("_", 1)[0], str(rec.seq).upper()))

    # Substitution alphabet: the 20 standard residues plus '*'
    AA_ALPHABET = "ACDEFGHIKLMNPQRSTVWY*"

    def generate_hamming_leq1_variants(seq):
        """Return the set of sequences at Hamming distance <= 1 from seq."""
        variants = {seq}
        for i, orig in enumerate(seq):
            for aa in AA_ALPHABET:
                if aa != orig:
                    variants.add(seq[:i] + aa + seq[i + 1:])
        return variants

    matched = OrderedDict()
    remaining = OrderedDict()
    total = float(len(peptides)) if peptides else 1.0

    def worker(item):
        """
        Check one (pid, pep_seq) pair against every protein.

        Returns (pid, pep_seq, found_bool, ref_entry_id_or_None).
        """
        pid, pep_seq = item
        pep_len = len(pep_seq)
        if pep_len == 0:
            return pid, pep_seq, False, None

        variants = generate_hamming_leq1_variants(pep_seq)
        for prot_id, prot_seq in proteins:
            # a protein shorter than the peptide cannot contain any variant
            if pep_len > len(prot_seq):
                continue
            if any(v in prot_seq for v in variants):
                return pid, pep_seq, True, prot_id
        return pid, pep_seq, False, None

    def collect(results):
        """Sort worker results into matched/remaining and report progress."""
        for idx, (pid, pep_seq, found, ref_entry_id) in enumerate(results, start=1):
            if found:
                matched[pid] = (pep_seq, ref_entry_id)
            else:
                remaining[pid] = pep_seq

            if step_label is not None and idx % 100 == 0:
                frac = idx / total
                bar_len = 30
                filled = int(bar_len * frac)
                bar = "#" * filled + "-" * (bar_len - filled)
                msg = "[{bar}] {done}/{tot} peptides - {label}".format(
                    bar=bar,
                    done=idx,
                    tot=int(total),
                    label=step_label,
                )
                print(msg, file=sys.stderr)
                sys.stderr.flush()

    items = list(peptides.items())
    if num_threads is None or num_threads <= 1:
        collect(map(worker, items))
    else:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Consume the results while the pool is still open. Iterating only
            # after the with-block has exited would wait for every task first,
            # so no progress line could appear until all work was finished.
            # (chunksize is omitted: it has no effect for a thread pool.)
            collect(executor.map(worker, items))

    return matched, remaining
494
+
495
+ # Step 1 (VCF exome filter) is reported right after the VCF handling below;
496
+ # step 2 is reported once the transcriptome and CDS have been loaded.
497
+
498
+ # ----------------------------- VCF handling + exome filter -------------------
499
+ # Only needed if we are actually building the database. If we are reusing
500
+ # an existing database, we skip reading the VCF entirely.
501
+ if build_database:
502
+ snvs = [] # list of (chrom, pos_int, ref, alt)
503
+
504
+ exome_bed = os.path.join(genome_root, f"exome.{reference}.bed")
505
+
506
def load_exome_bed(path):
    """
    Load exome intervals into a per-chromosome index for position lookup.

    Lines that are blank, start with '#', have fewer than three tab-separated
    columns, have non-integer coordinates, or have start > end are skipped.
    Chromosome names go through normalize_chrom; start and end are stored as
    read (no offset is applied).

    Overlapping intervals on the same chromosome are merged. A lookup that
    bisects on the start list and tests only the interval found there would
    otherwise miss positions covered by an earlier, longer interval.

    Returns: dict chrom -> (starts_list, ends_list) describing the merged,
    non-overlapping intervals; both lists are in ascending order.
    """
    exome_raw = defaultdict(list)
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split("\t")
            if len(fields) < 3:
                continue
            chrom = normalize_chrom(fields[0])
            try:
                start = int(fields[1])
                end = int(fields[2])
            except ValueError:
                continue
            if start <= end:
                exome_raw[chrom].append((start, end))

    exome_index = {}
    for chrom, intervals in exome_raw.items():
        intervals.sort()
        starts = []
        ends = []
        for start, end in intervals:
            if ends and start <= ends[-1]:
                # overlaps the interval being built: extend it if needed
                if end > ends[-1]:
                    ends[-1] = end
            else:
                starts.append(start)
                ends.append(end)
        exome_index[chrom] = (starts, ends)
    return exome_index
538
+
539
def pos_in_exome(chrom, pos, exome_index):
    """
    Tell whether position ``pos`` on ``chrom`` falls inside an indexed interval.

    exome_index maps chrom -> (starts, ends) with starts sorted ascending.
    A None index means "no exome file", so every position is accepted.
    Only the interval with the greatest start <= pos is examined, and both
    of its bounds count as inside.
    """
    from bisect import bisect_right

    if exome_index is None:
        return True  # no exome file → accept all

    entry = exome_index.get(chrom)
    if entry is None:
        return False

    starts, ends = entry
    # number of intervals starting at or before pos
    slot = bisect_right(starts, pos)
    return slot > 0 and pos <= ends[slot - 1]
553
+
554
+ # Try to load exome index if file exists
555
+ exome_index = None
556
+ if os.path.exists(exome_bed):
557
+ exome_index = load_exome_bed(exome_bed)
558
+
559
def parse_vcf_line(line):
    """
    Extract single-nucleotide variants from one VCF line.

    Returns: list[(chrom, pos_int, ref, alt)], one tuple per ALT allele for
    which both REF and ALT are exactly one character long. The list is empty
    for blank lines, '#' lines, lines with fewer than five tab-separated
    columns, and lines whose POS is not an integer. chrom is passed through
    normalize_chrom; ref and alt are upper-cased.

    NOTE(review): the filter looks at length only, so one-character symbolic
    ALT values such as '.' or '*' are returned as well.
    """
    text = line.strip()
    if not text or text.startswith("#"):
        return []

    cols = text.split("\t")
    if len(cols) < 5:
        return []

    chrom = normalize_chrom(cols[0])
    try:
        pos = int(cols[1])  # VCF POS is 1-based
    except ValueError:
        return []

    ref = cols[3].upper()
    if len(ref) != 1:
        # a multi-base (or empty) REF can never form an SNV
        return []

    alt_alleles = (allele.upper() for allele in cols[4].split(","))
    return [(chrom, pos, ref, alt) for alt in alt_alleles if len(alt) == 1]
583
+
584
def vcf_line_iterator(path):
    """
    Yield the data lines of a VCF file, one at a time.

    A path ending in '.gz' is opened with gzip in text mode, anything else
    as a plain text file. Blank lines and lines starting with '#' are
    skipped; yielded lines keep their trailing newline.
    """
    handle = gzip.open(path, "rt") if path.endswith(".gz") else open(path, "r")
    with handle:
        for raw in handle:
            if raw.strip() and not raw.startswith("#"):
                yield raw
597
+
598
+ if vcf_path is not None and os.path.exists(vcf_path):
599
+ vcf_path_abs = os.path.abspath(vcf_path)
600
+
601
+ # Single-threaded streaming parse + exome filter
602
+ for line in vcf_line_iterator(vcf_path_abs):
603
+ for chrom, pos, ref, alt in parse_vcf_line(line):
604
+ if pos_in_exome(chrom, pos, exome_index):
605
+ snvs.append((chrom, pos, ref, alt))
606
+ else:
607
+ snvs = []
608
+ else:
609
+ # Not building database: we won't use SNVs at all.
610
+ snvs = []
611
+
612
+ report_step_done() # step 1 done
613
+
614
+ # ----------------------------- transcriptome and CDS -------------------------
615
+
616
+ # Load whole transcriptome once
617
+ transcriptome = load_transcriptome(transcriptome_fa)
618
+ cds_map = load_cds_bed(cds_bed)
619
+
620
+ report_step_done() # step 2 done
621
+
622
+ # Build canonical / non-canonical transcript ID sets (full IDs as in FASTA)
623
+ canonical_tx_ids = set()
624
+ noncanonical_tx_ids = set()
625
+ for tx_id in transcriptome.keys():
626
+ core = fasta_id_core(tx_id)
627
+ if core in canonical_ids:
628
+ canonical_tx_ids.add(tx_id)
629
+ else:
630
+ noncanonical_tx_ids.add(tx_id)
631
+
632
+ report_step_done() # step 3 done
633
+
634
+ # ----------------------------- canonical proteome ----------------------------
635
+
636
+ canonical_proteome_fa_tmp = os.path.join(database_dir, "canonicalProteome.fa")
637
+ if build_database:
638
+ canonical_proteins = translate_cds(transcriptome, cds_map, tx_ids_subset=canonical_tx_ids)
639
+ write_fasta_single_line(canonical_proteins, canonical_proteome_fa_tmp)
640
+
641
+ # Load peptides
642
+ peptides_all = read_peptide_fasta(peptide_fasta)
643
+
644
+ # Find canonical matches
645
+ canonical_hits, peptides_remaining = find_exact_matches(
646
+ peptides_all,
647
+ canonical_proteome_fa_tmp,
648
+ step_label="canonical classification"
649
+ )
650
+ canonical_out_fa = os.path.join(output_dir, "canonicalProteome.fa")
651
+ if canonical_hits:
652
+ write_fasta_single_line(canonical_hits, canonical_out_fa)
653
+
654
+ report_step_done() # step 4 done
655
+
656
+ # -------------------------- alternative splicing proteome --------------------
657
+
658
+ alt_splicing_proteome_fa_tmp = os.path.join(database_dir, "alternativeSplicing.fa")
659
+ if build_database:
660
+ alt_splicing_proteins = translate_cds(transcriptome, cds_map, tx_ids_subset=noncanonical_tx_ids)
661
+ write_fasta_single_line(alt_splicing_proteins, alt_splicing_proteome_fa_tmp)
662
+
663
+ alt_splicing_hits, peptides_remaining = find_exact_matches(
664
+ peptides_remaining,
665
+ alt_splicing_proteome_fa_tmp,
666
+ step_label="alternative splicing classification"
667
+ )
668
+ alt_splicing_out_fa = os.path.join(output_dir, "alternativeSplicing.fa")
669
+ if alt_splicing_hits:
670
+ write_fasta_single_line(alt_splicing_hits, alt_splicing_out_fa)
671
+
672
+ report_step_done() # step 5 done
673
+
674
+ # ----------------------------- mutated antigens (mutanome) -------------------
675
+
676
+ mutated_canonical_tx_fa_tmp = os.path.join(database_dir, "mutatedCanonicalTranscriptome.fa")
677
+ mutanome_fa_tmp = os.path.join(database_dir, "mutanome.fa")
678
+
679
+ if build_database:
680
+ # Parse GFF to map genomic SNVs to transcript coordinates
681
+ # We build transcript -> chrom, strand, exon intervals (genomic coordinates, 1-based closed)
682
+ transcript_exons = defaultdict(list)
683
+ transcript_strand = {}
684
+ transcript_chrom = {}
685
+
686
def parse_gff_attributes(attr_str):
    """
    Parse a GFF/GTF attribute column into a dict.

    The string is split on ';'. An item containing '=' is read as
    ``key=value`` (value stripped of whitespace and surrounding double
    quotes); any other item is read as ``key value ...`` using its first two
    whitespace-separated tokens. Items with a single token are ignored, and
    a repeated key keeps its last value.
    """
    parsed = {}
    for chunk in attr_str.strip().split(";"):
        chunk = chunk.strip()
        if not chunk:
            continue
        if "=" in chunk:
            key, _, raw_value = chunk.partition("=")
            parsed[key] = raw_value.strip().strip('"')
            continue
        tokens = chunk.split()
        if len(tokens) >= 2:
            parsed[tokens[0]] = tokens[1].strip('"')
    return parsed
704
+
705
+ with open(gff_path) as fh:
706
+ for line in fh:
707
+ if not line.strip() or line.startswith("#"):
708
+ continue
709
+ fields = line.rstrip("\n").split("\t")
710
+ if len(fields) < 9:
711
+ continue
712
+ chrom, source, feature, start, end, score, strand, frame, attrs_str = fields
713
+
714
+ # Use only exon features to avoid double counting exon+CDS
715
+ if feature.lower() != "exon":
716
+ continue
717
+
718
+ try:
719
+ start = int(start)
720
+ end = int(end)
721
+ except ValueError:
722
+ continue
723
+
724
+ attrs = parse_gff_attributes(attrs_str)
725
+
726
+ # Be strict: use only transcript_id / transcriptId, to match gff_to_bed
727
+ tx_id = attrs.get("transcript_id") or attrs.get("transcriptId")
728
+ if tx_id is None:
729
+ # No transcript-level ID → skip; prevents grouping by exon ID
730
+ continue
731
+
732
+ tx_id = normalize_gff_tx_id(tx_id)
733
+ chrom_norm = normalize_chrom(chrom)
734
+
735
+ transcript_exons[tx_id].append((start, end))
736
+ transcript_strand[tx_id] = strand
737
+ transcript_chrom[tx_id] = chrom_norm
738
+
739
+ # Sort exons
740
+ for tx_id in list(transcript_exons.keys()):
741
+ transcript_exons[tx_id].sort(key=lambda x: x[0])
742
+
743
+ # reverse complement helper (unchanged)
744
# Complement of each DNA base, for upper- and lower-case letters
complement_map = dict(zip("ATCGatcg", "TAGCtagc"))


def complement_base(b):
    """Return the complement of base ``b``; characters not in complement_map are returned unchanged."""
    try:
        return complement_map[b]
    except KeyError:
        return b
757
+
758
+ # Precompute exon ordering & lengths per transcript for transcript coord mapping
759
+ exon_order_cache = {} # tx_id -> (ordered_exons, total_len, ordered_exons_desc)
760
+ for tx_id, exons in transcript_exons.items():
761
+ exons_sorted = sorted(exons, key=lambda x: x[0])
762
+ total_len = 0
763
+ for s, e in exons_sorted:
764
+ total_len += (e - s + 1)
765
+ exons_desc = list(reversed(exons_sorted))
766
+ exon_order_cache[tx_id] = (exons_sorted, total_len, exons_desc)
767
+
768
+ # NEW: index SNVs by chromosome
769
+ snvs_by_chrom = defaultdict(list)
770
+ for chrom, pos, ref, alt in snvs:
771
+ snvs_by_chrom[chrom].append((pos, ref, alt))
772
+ for chrom in snvs_by_chrom:
773
+ snvs_by_chrom[chrom].sort(key=lambda x: x[0])
774
+
775
def apply_snvs_to_transcript(tx_id):
    """
    Apply the SNVs on a transcript's chromosome to its sequence.

    Returns: (tx_id, list_of_chars_sequence)
      - an empty list when tx_id is not in the transcriptome;
      - the unmodified sequence when the transcript has no chromosome, its
        chromosome has no SNVs, or no exon information is cached for it;
      - otherwise the sequence with every applicable SNV substituted.

    Each SNV position is mapped through the transcript's exons to a 0-based
    transcript index. A strand of "+" walks exons in ascending order; any
    other strand walks them in descending order and complements REF and ALT.
    An SNV whose REF does not match the base found at that index is reported
    on stderr and skipped.
    """
    if tx_id not in transcriptome:
        return tx_id, []

    bases = list(transcriptome[tx_id])
    chrom = transcript_chrom.get(tx_id)
    has_variants = chrom is not None and chrom in snvs_by_chrom
    if not has_variants or tx_id not in exon_order_cache:
        return tx_id, bases

    ascending, _total_len, descending = exon_order_cache[tx_id]
    on_plus = transcript_strand.get(tx_id, "+") == "+"

    def locate(pos):
        """Return the 0-based transcript index of genomic pos, or None if it is in no exon."""
        consumed = 0  # exonic bases lying 5' of the current exon
        if on_plus:
            for exon_start, exon_end in ascending:
                if pos < exon_start:
                    return None
                if pos <= exon_end:
                    return consumed + (pos - exon_start)
                consumed += exon_end - exon_start + 1
            return None
        # non-plus strand: transcript 5'->3' runs from high to low coordinates
        for exon_start, exon_end in descending:
            if pos > exon_end:
                continue
            if pos >= exon_start:
                return consumed + (exon_end - pos)
            consumed += exon_end - exon_start + 1
        return None

    for pos, ref, alt in snvs_by_chrom[chrom]:
        tx_index = locate(pos)
        if tx_index is None:
            continue

        if on_plus:
            expected_ref, alt_base = ref, alt
        else:
            expected_ref, alt_base = complement_base(ref), complement_base(alt)

        if not 0 <= tx_index < len(bases):
            continue

        current_ref = bases[tx_index].upper()
        if expected_ref and current_ref != expected_ref:
            # Warn but do not mutate
            print(
                f"[WARN] Ref base mismatch for {tx_id} at transcript index {tx_index}: "
                f"expected {expected_ref}, saw {current_ref} (chrom {chrom}, pos {pos})",
                file=sys.stderr
            )
            continue
        bases[tx_index] = alt_base

    return tx_id, bases
850
+
851
+ # NEW: parallel apply SNVs per transcript (canonical only)
852
+ mutated_transcripts = {}
853
+
854
+ canonical_tx_ids_list = [tx for tx in canonical_tx_ids if tx in transcriptome]
855
+
856
+ if num_threads is not None and num_threads > 1:
857
+ with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
858
+ for tx_id, seq_list in executor.map(apply_snvs_to_transcript, canonical_tx_ids_list, chunksize=50):
859
+ if seq_list:
860
+ mutated_transcripts[tx_id] = seq_list
861
+ else:
862
+ # single-threaded fallback
863
+ for tx_id in canonical_tx_ids_list:
864
+ tx_id_res, seq_list = apply_snvs_to_transcript(tx_id)
865
+ if seq_list:
866
+ mutated_transcripts[tx_id_res] = seq_list
867
+
868
+ # Convert mutated_transcripts back to strings and write mutatedCanonicalTranscriptome.fa
869
+ mutated_canonical_tx_dict = {}
870
+ for tx_id, seq_list in mutated_transcripts.items():
871
+ mutated_canonical_tx_dict[tx_id] = "".join(seq_list)
872
+ write_fasta_single_line(mutated_canonical_tx_dict, mutated_canonical_tx_fa_tmp)
873
+
874
+ # Reload mutated canonical transcriptome into dict for translation
875
+ # IMPORTANT: mutanome = canonicalProteome with SNVs applied,
876
+ # so we must translate ONLY CDS regions, just like canonicalProteome.
877
+ mutated_canonical_transcripts = load_transcriptome(mutated_canonical_tx_fa_tmp)
878
+
879
+ # Translate CDS of mutated canonical transcripts
880
+ # If there are no SNVs, mutated_canonical_transcripts == canonical transcripts,
881
+ # so mutanome_proteins == canonical_proteins (as desired).
882
+ mutanome_proteins = translate_cds(
883
+ mutated_canonical_transcripts,
884
+ cds_map,
885
+ tx_ids_subset=None # translated for all mutated canonical transcripts present in cds_map
886
+ )
887
+
888
+ write_fasta_single_line(mutanome_proteins, mutanome_fa_tmp)
889
+ else:
890
+ # Load precomputed mutated canonical transcriptome from database
891
+ mutated_canonical_transcripts = load_transcriptome(mutated_canonical_tx_fa_tmp)
892
+
893
+ neoantigen_hits, peptides_remaining = find_exact_matches(
894
+ peptides_remaining,
895
+ mutanome_fa_tmp,
896
+ step_label="mutanome classification"
897
+ )
898
+ neoantigen_out_fa = os.path.join(output_dir, "neoantigen.fa")
899
+ if neoantigen_hits:
900
+ write_fasta_single_line(neoantigen_hits, neoantigen_out_fa)
901
+
902
+ report_step_done() # step 6 done
903
+
904
+ # -------------------------- alternative ORFs (3 reading frames) --------------
905
+
906
+ alt_orf_fa_translatome_tmp = os.path.join(database_dir, "mutatedAlternativeTranslatome.fa")
907
+ alt_orf_fa_orfeome_tmp = os.path.join(database_dir, "mutatedAlternativeORFeome.fa")
908
+
909
+ if build_database:
910
+ # Translate all three frames of mutatedCanonicalTranscriptome
911
+ alt_orf_records = {}
912
+ for tx_id, nt_seq in mutated_canonical_transcripts.items():
913
+ for frame in (0, 1, 2):
914
+ sub_seq = nt_seq[frame:]
915
+ aa_seq = safe_translate_nt(sub_seq)
916
+ if not aa_seq:
917
+ continue
918
+ rid = "{}_frame{}".format(tx_id, frame)
919
+ alt_orf_records[rid] = aa_seq
920
+
921
+ write_fasta_single_line(alt_orf_records, alt_orf_fa_translatome_tmp)
922
+ # keep both filenames as per prompt terminology
923
+ write_fasta_single_line(alt_orf_records, alt_orf_fa_orfeome_tmp)
924
+
925
+ alternative_orf_hits, peptides_remaining = find_exact_matches(
926
+ peptides_remaining,
927
+ alt_orf_fa_translatome_tmp,
928
+ step_label="alternative ORF classification"
929
+ )
930
+ alternative_orf_out_fa = os.path.join(output_dir, "alternativeReadingFrame.fa")
931
+ if alternative_orf_hits:
932
+ write_fasta_single_line(alternative_orf_hits, alternative_orf_out_fa)
933
+
934
+ report_step_done() # step 7 done
935
+
936
+ # -------------------------- amino acid misincorporations ----------------------
937
+
938
+ misincorporation_hits, peptides_remaining = find_hamming_leq1_matches(
939
+ peptides_remaining,
940
+ alt_orf_fa_orfeome_tmp,
941
+ step_label="amino acid misincorporation search",
942
+ num_threads=num_threads,
943
+ )
944
+ misincorporation_out_fa = os.path.join(output_dir, "aminoAcidMisincorporation.fa")
945
+ if misincorporation_hits:
946
+ write_fasta_single_line(misincorporation_hits, misincorporation_out_fa)
947
+
948
+ report_step_done() # step 8 done
949
+
950
+ # -------------------------- unaligned peptides --------------------------------
951
+
952
+ unaligned_out_fa = os.path.join(output_dir, "unknown.fa")
953
+ if peptides_remaining:
954
+ write_fasta_single_line(peptides_remaining, unaligned_out_fa)
955
+
956
+ # -------------------------- pie chart of category counts ----------------------
957
+
958
+ counts = OrderedDict()
959
+ counts["canonical"] = len(canonical_hits)
960
+ counts["alternativeSplicing"] = len(alt_splicing_hits)
961
+ counts["neoantigen"] = len(neoantigen_hits)
962
+ counts["alternativeReadingFrame"] = len(alternative_orf_hits)
963
+ counts["aminoAcidMisincorporation"] = len(misincorporation_hits)
964
+ counts["unknown"] = len(peptides_remaining)
965
+
966
+ # -------------------------- write pieChart.tsv ----------------------
967
+ tsv_path = os.path.join(output_dir, "pieChart.tsv")
968
+ with open(tsv_path, "w") as tsv:
969
+ tsv.write("Category\tCount\n")
970
+ for cat, cnt in counts.items():
971
+ tsv.write(f"{cat}\t{cnt}\n")
972
+
973
+ # -------------------------- produce pie chart (hex colors) --------------------
974
+ category_keys = [
975
+ "canonical",
976
+ "alternativeSplicing",
977
+ "neoantigen",
978
+ "alternativeReadingFrame",
979
+ "aminoAcidMisincorporation",
980
+ "unknown",
981
+ ]
982
+
983
+ legend_labels = [
984
+ "canonical proteome",
985
+ "alternative splicing",
986
+ "neoantigen",
987
+ "alternative reading frame",
988
+ "amino acid misincorporation",
989
+ "unknown",
990
+ ]
991
+
992
+ # Hexadecimal colors
993
+ colors = [
994
+ "#263b81",
995
+ "#0578a6",
996
+ "#64cdf6",
997
+ "#d71f26",
998
+ "#f493a9",
999
+ "#e5e5e5",
1000
+ ]
1001
+
1002
+ # Sizes in fixed order
1003
+ sizes_all = [counts[k] for k in category_keys]
1004
+
1005
+ nonzero_indices = [i for i, s in enumerate(sizes_all) if s > 0]
1006
+
1007
+ if nonzero_indices:
1008
+ pie_sizes = [sizes_all[i] for i in nonzero_indices]
1009
+ pie_colors = [colors[i] for i in nonzero_indices]
1010
+
1011
+ matplotlib.rcParams['font.family'] = 'Arial'
1012
+ matplotlib.rcParams['font.size'] = 14
1013
+
1014
+ max_label_len = max(len(lbl) for lbl in legend_labels)
1015
+ legend_width = max(2.0, 0.10 * max_label_len)
1016
+
1017
+ pie_width = 4.0
1018
+ pie_height = 4.0
1019
+ fig_width = pie_width + legend_width
1020
+ fig_height = pie_height
1021
+
1022
+ fig = plt.figure(figsize=(fig_width, fig_height))
1023
+ pie_ax_width_fraction = pie_width / fig_width
1024
+ ax = fig.add_axes([0.0, 0.0, pie_ax_width_fraction, 1.0])
1025
+
1026
+ wedges, _ = ax.pie(
1027
+ pie_sizes,
1028
+ colors=pie_colors,
1029
+ startangle=90,
1030
+ counterclock=False
1031
+ )
1032
+ ax.axis('equal')
1033
+
1034
+ legend_handles = [
1035
+ Patch(facecolor=colors[i], edgecolor='none')
1036
+ for i in range(len(category_keys))
1037
+ ]
1038
+
1039
+ ax.legend(
1040
+ legend_handles,
1041
+ legend_labels,
1042
+ loc='center left',
1043
+ bbox_to_anchor=(1.02, 0.5),
1044
+ fontsize=14,
1045
+ frameon=False
1046
+ )
1047
+
1048
+ pie_path = os.path.join(output_dir, "pieChart.pdf")
1049
+ fig.savefig(pie_path, format="pdf", dpi=1200, bbox_inches='tight')
1050
+ plt.close(fig)
1051
+
1052
+ report_step_done() # step 9 done
1053
+
1054
+ # -------------------------- finalize -----------------------------------------
1055
+
1056
+ report_step_done() # step 10 done
1057
+
@@ -0,0 +1,17 @@
1
+ MIT License
2
+ Copyright (c) 2025 Hanjun Lee, Stephen J. Elledge
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+ The above copyright notice and this permission notice shall be included in all
10
+ copies or substantial portions of the Software.
11
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17
+ SOFTWARE.
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.1
2
+ Name: darkprofiler
3
+ Version: 0.1.0
4
+ Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
5
+ Author-email: Hanjun Lee <hanjun@alum.mit.edu>
6
+ License: MIT
7
+ Keywords: proteomics,immunopeptidomics,neoantigen,bioinformatics
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
13
+ Requires-Python: >=3.7
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE.txt
16
+ Requires-Dist: biopython>=1.78
17
+ Requires-Dist: matplotlib>=3.3
18
+
19
+ # DarkProfiler
20
+
21
+ **DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments**
22
+
23
+ DarkProfiler takes peptide sequences (e.g. from de novo sequencing) and classifies them into:
24
+
25
+ - **Canonical proteome**
26
+ - **Alternative splicing**
27
+ - **Neoantigens (SNV-derived mutanome)**
28
+ - **Alternative reading frame peptides**
29
+ - **Amino acid misincorporations**
30
+ - **Unknown / unaligned**
31
+
32
+ It supports human and mouse references: `hg19`, `hg38`, `mm10`, `mm39`.
33
+
34
+ ---
35
+
36
+ ## Installation
37
+
38
+ ### Install with pip (PyPI)
39
+
40
+ ```bash
41
+ pip install darkprofiler
42
+ ```
43
+
44
+ ### Install with conda (bioconda)
45
+
46
+ ```bash
47
+ conda install bioconda::darkprofiler
48
+ ```
49
+
50
+ ---
51
+
52
+ ## Reference genome
53
+
54
+ DarkProfiler supports human and mouse reference genomes.
55
+
56
+ Supported genome assemblies are:
57
+
58
+ ```
59
+ hg19 (GENCODE release 19)
60
+ hg38 (GENCODE release 37)
61
+ mm10 (GENCODE release M19)
62
+ mm39 (GENCODE release M37)
63
+ ```
64
+
65
+ ---
66
+
67
+ ## Command-line usage
68
+
69
+ ### Download reference data
70
+
71
+ ```bash
72
+ darkprofiler download hg38
73
+ ```
74
+
75
+ ### Run classification
76
+
77
+ ```bash
78
+ darkprofiler run hg38 peptides.fa output_dir
79
+ ```
80
+
81
+ Optional flags:
82
+
83
+ ```
84
+ --vcf-path FILE
85
+ --database-path DIR
86
+ --num-threads N
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Python API
92
+
93
+ ```python
94
+ from darkprofiler.run import classify_peptides
95
+
96
+ classify_peptides(
97
+ reference="hg38",
98
+ peptide_fasta="peptides.fa",
99
+ output_dir="output",
100
+ vcf_path=None,
101
+ database_path=None,
102
+ num_threads=4
103
+ )
104
+ ```
105
+
106
+ ---
107
+
108
+ ## Outputs
109
+
110
+ - canonicalProteome.fa
111
+ - alternativeSplicing.fa
112
+ - neoantigen.fa
113
+ - alternativeReadingFrame.fa
114
+ - aminoAcidMisincorporation.fa
115
+ - unknown.fa
116
+ - pieChart.tsv
117
+ - pieChart.pdf
118
+
119
+ ---
120
+
121
+ ## License
122
+
123
+ MIT License
124
+ Copyright (c) 2025 Hanjun Lee, Stephen J. Elledge
@@ -0,0 +1,9 @@
1
+ darkprofiler/__init__.py,sha256=1cepXQEj2nBiT5EFnxwIaChLEz2hmL_U5k38Q4osdMM,92
2
+ darkprofiler/cli.py,sha256=kg3p8OpVPltMug6GiSizfwnZI1oj_ROMDbe9upDex3Y,4335
3
+ darkprofiler/run.py,sha256=swq6oMkcSYf0TA7QvVwz0tEZBQb_CruVOV80PHCynlg,39611
4
+ darkprofiler-0.1.0.dist-info/LICENSE.txt,sha256=N-0qqbgqr55PEF6BoI4x2DNBbqtOAADq5MbnqjRg-gc,1083
5
+ darkprofiler-0.1.0.dist-info/METADATA,sha256=pBY-1Y4QjrOne9Dxd4x4-f1r4Etxn1F6fgGB-bBZV1I,2391
6
+ darkprofiler-0.1.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
7
+ darkprofiler-0.1.0.dist-info/entry_points.txt,sha256=sn6AordIuksBoZLxFb3fRZAPZyhD_mSDUa7iADeDSKE,55
8
+ darkprofiler-0.1.0.dist-info/top_level.txt,sha256=wAKKCkthuvFrGWA1QsY_wBSrbP5sfBrCyDATLhgUer4,13
9
+ darkprofiler-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.3.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ darkprofiler = darkprofiler.cli:main
@@ -0,0 +1 @@
1
+ darkprofiler