PyPI - darkprofiler - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

darkprofiler 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

{darkprofiler-0.2.2/src/darkprofiler.egg-info → darkprofiler-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: darkprofiler
-Version: 0.2.2
+Version: 0.2.3
 Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
 Author-email: Hanjun Lee <hanjun@alum.mit.edu>
 License: MIT
@@ -244,7 +244,6 @@ The database contains translated and derived proteomes as FASTA files:
 - `mutanome.fa`
 - `mutatedCanonicalTranscriptome.fa`
 - `mutatedAlternativeTranslatome.fa`
-- `mutatedAlternativeORFeome.fa`
 DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
 for example:
@@ -361,7 +360,7 @@ classify_peptides(
 - **ORF region labels**
   For alternative ORF hits, DarkProfiler labels the peptide start as:
   - `uORF` (upstream of CDS start)
-  - `intORF` (inside annotated CDS span)
+  - `intORF` (out-of-frame peptides from inside annotated CDS span)
   - `dORF` (downstream of CDS end)
   - `lncRNA` (no CDS annotation)

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/README.md RENAMED Viewed

@@ -226,7 +226,6 @@ The database contains translated and derived proteomes as FASTA files:
 - `mutanome.fa`
 - `mutatedCanonicalTranscriptome.fa`
 - `mutatedAlternativeTranslatome.fa`
-- `mutatedAlternativeORFeome.fa`
 DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
 for example:
@@ -343,7 +342,7 @@ classify_peptides(
 - **ORF region labels**
   For alternative ORF hits, DarkProfiler labels the peptide start as:
   - `uORF` (upstream of CDS start)
-  - `intORF` (inside annotated CDS span)
+  - `intORF` (out-of-frame peptides from inside annotated CDS span)
   - `dORF` (downstream of CDS end)
   - `lncRNA` (no CDS annotation)

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "darkprofiler"
-version = "0.2.2"
+version = "0.2.3"
 description = "DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments."
 readme = "README.md"
 requires-python = ">=3.7"

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler/__init__.py RENAMED Viewed

@@ -2,5 +2,5 @@ from .run import classify_peptides
 __all__ = ["classify_peptides"]
-__version__ = "0.2.2"
+__version__ = "0.2.3"

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler/cli.py RENAMED Viewed

@@ -121,7 +121,7 @@ def build_parser() -> argparse.ArgumentParser:
             "Optional path to existing database directory containing "
             "canonicalProteome.fa, alternativeSplicing.fa, mutanome.fa, "
             "mutatedCanonicalTranscriptome.fa, mutatedAlternativeTranslatome.fa, "
-            "mutatedAlternativeORFeome.fa."
+            "and other index files"
         ),
     )
     p_run.add_argument(

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler/run.py RENAMED Viewed

@@ -481,7 +481,6 @@ def classify_peptides(reference,
     required_db_files = [
         "alternativeSplicing.fa",
-        "mutatedAlternativeORFeome.fa",
         "canonicalProteome.fa",
         "mutatedAlternativeTranslatome.fa",
         "mutanome.fa",
@@ -1006,31 +1005,169 @@ def classify_peptides(reference,
     report_step_done()  # 5
-    # ----------------------------- mutated antigens (mutanome) -------------------
-    # NOTE: keep your original SNV application as-is (omitted here for brevity),
-    # but ensure translation uses translate_cds_with_map (ATG filter) and then build index.
-    #
-    # If you already have your SNV block, set:
-    #   mutated_canonical_transcripts = load_transcriptome(mutatedCanonicalTranscriptome.fa)
-    #   mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(mutated_canonical_transcripts, cds_map)
-    #   write mutanome.fa
-    #
+    # ---------- build mutanome.fa + mutatedCanonicalTranscriptome.fa ----------
     mutated_canonical_tx_fa = os.path.join(database_dir, "mutatedCanonicalTranscriptome.fa")
     mutanome_fa = os.path.join(database_dir, "mutanome.fa")
     mutanome_idx_dir = os.path.join(database_dir, "mutanome.idx")
-    # ---------- BEGIN: your existing SNV block should fill these ----------
-    # For runs reusing DB, mutated_canonical_tx_fa and mutanome_fa should already exist.
-    mutated_canonical_transcripts = {}
-    if os.path.exists(mutated_canonical_tx_fa):
-        mutated_canonical_transcripts = load_transcriptome(mutated_canonical_tx_fa)
+    def _parse_gff_attributes(attr_str):
+        attrs = {}
+        for item in attr_str.strip().split(";"):
+            item = item.strip()
+            if not item:
+                continue
+            if "=" in item:
+                k, v = item.split("=", 1)
+                attrs[k] = v.strip().strip('"')
+            else:
+                parts = item.split()
+                if len(parts) >= 2:
+                    attrs[parts[0]] = parts[1].strip('"')
+        return attrs
+    # Build transcript exon model from GFF (exon features only)
+    transcript_exons = defaultdict(list)    # tx_id -> list[(start,end)] 1-based closed
+    transcript_strand = {}                 # tx_id -> '+'/'-'
+    transcript_chrom = {}                  # tx_id -> chrom (normalized)
+    with open(gff_path) as fh:
+        for line in fh:
+            if not line.strip() or line.startswith("#"):
+                continue
+            fields = line.rstrip("\n").split("\t")
+            if len(fields) < 9:
+                continue
+            chrom, source, feature, start, end, score, strand, frame, attrs_str = fields
+            if feature.lower() != "exon":
+                continue
+            try:
+                start_i = int(start)
+                end_i = int(end)
+            except ValueError:
+                continue
+            attrs = _parse_gff_attributes(attrs_str)
+            tx_id = attrs.get("transcript_id") or attrs.get("transcriptId")
+            if tx_id is None:
+                continue
+            tx_id = normalize_gff_tx_id(tx_id)
+            transcript_exons[tx_id].append((start_i, end_i))
+            transcript_strand[tx_id] = strand
+            transcript_chrom[tx_id] = normalize_chrom(chrom)
+    # Sort exons ascending by genomic start
+    for tx_id in list(transcript_exons.keys()):
+        transcript_exons[tx_id].sort(key=lambda x: x[0])
+    # Exon caches for coordinate mapping
+    exon_order_cache = {}  # tx_id -> (exons_sorted, exons_desc)
+    for tx_id, exons_sorted in transcript_exons.items():
+        exon_order_cache[tx_id] = (exons_sorted, list(reversed(exons_sorted)))
+    # Index SNVs by chromosome for speed
+    snvs_by_chrom = defaultdict(list)  # chrom -> list[(pos, ref, alt)]
+    for chrom, pos, ref, alt in snvs:
+        snvs_by_chrom[chrom].append((int(pos), ref, alt))
+    for chrom in snvs_by_chrom:
+        snvs_by_chrom[chrom].sort(key=lambda x: x[0])
+    # Simple base complement
+    _complement = {"A":"T","T":"A","C":"G","G":"C","a":"t","t":"a","c":"g","g":"c"}
+    def _complement_base(b):
+        return _complement.get(b, b)
+    def _apply_snvs_to_one_transcript(tx_id):
+        """
+        Returns: (tx_id, mutated_seq_string) OR (tx_id, None) if transcript not present.
+        If no SNVs (or no mapping), returns original transcript sequence.
+        """
+        if tx_id not in transcriptome:
+            return tx_id, None
+        seq = transcriptome[tx_id]
+        seq_list = list(seq)
+        chrom = transcript_chrom.get(tx_id)
+        if chrom is None or chrom not in snvs_by_chrom:
+            return tx_id, seq
+        if tx_id not in exon_order_cache:
+            return tx_id, seq
+        exons_sorted, exons_desc = exon_order_cache[tx_id]
+        strand = transcript_strand.get(tx_id, "+")
+        for pos, ref, alt in snvs_by_chrom[chrom]:
+            if strand == "+":
+                offset = 0
+                within = False
+                for s, e in exons_sorted:
+                    if pos < s:
+                        break
+                    if pos > e:
+                        offset += (e - s + 1)
+                    else:
+                        offset += (pos - s)
+                        within = True
+                        break
+                if not within:
+                    continue
+                tx_index = offset
+                expected_ref = ref.upper()
+                alt_base = alt.upper()
+            else:
+                offset = 0
+                within = False
+                for s, e in exons_desc:
+                    if pos > e:
+                        continue
+                    if pos < s:
+                        offset += (e - s + 1)
+                    else:
+                        offset += (e - pos)
+                        within = True
+                        break
+                if not within:
+                    continue
+                tx_index = offset
+                expected_ref = _complement_base(ref.upper())
+                alt_base = _complement_base(alt.upper())
+            if 0 <= tx_index < len(seq_list):
+                if expected_ref and seq_list[tx_index].upper() != expected_ref:
+                    continue
+                seq_list[tx_index] = alt_base
+        return tx_id, "".join(seq_list)
+    # Build mutated canonical transcriptome
+    if build_database:
+        canonical_tx_list = [tx for tx in canonical_tx_ids if tx in transcriptome]
+        mutated_canonical_tx_dict = {}
+        if num_threads and num_threads > 1:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as ex:
+                for tx_id, mutseq in ex.map(_apply_snvs_to_one_transcript, canonical_tx_list, chunksize=50):
+                    if mutseq is not None:
+                        mutated_canonical_tx_dict[tx_id] = mutseq
+        else:
+            for tx_id in canonical_tx_list:
+                tx_id2, mutseq = _apply_snvs_to_one_transcript(tx_id)
+                if mutseq is not None:
+                    mutated_canonical_tx_dict[tx_id2] = mutseq
+        with open(mutated_canonical_tx_fa, "w") as out:
+            for tx_id, nt_seq in mutated_canonical_tx_dict.items():
+                out.write(f">{tx_id}\n{nt_seq}\n")
+        mutated_canonical_transcripts = mutated_canonical_tx_dict
     else:
-        # If you keep your original code, it will write this file when build_database=True.
-        mutated_canonical_transcripts = {}  # placeholder
-    # ---------- END: your existing SNV block should fill these ----------
+        mutated_canonical_transcripts = load_transcriptome(mutated_canonical_tx_fa)
-    # Translate mutanome with ATG filter + mapping (even if file was precomputed)
-    mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(mutated_canonical_transcripts, cds_map)
+    # Translate mutanome (CDS only, ATG filtered)
+    mutanome_proteins, mutanome_aa2nt = translate_cds_with_map(
+        mutated_canonical_transcripts,
+        cds_map
+    )
     if build_database:
         with open(mutanome_fa, "w") as out:
@@ -1063,8 +1200,7 @@ def classify_peptides(reference,
     # -------------------------- alternative ORFs (3 frames) ----------------------
     alt_orf_translatome_fa = os.path.join(database_dir, "mutatedAlternativeTranslatome.fa")
-    alt_orf_orfeome_fa = os.path.join(database_dir, "mutatedAlternativeORFeome.fa")
-    alt_orf_idx_dir = os.path.join(database_dir, "mutatedAlternativeORFeome.idx")
+    alt_orf_idx_dir = os.path.join(database_dir, "mutatedAlternativeTranslatome.idx")
     if build_database:
         alt_orf_records = {}
@@ -1082,14 +1218,11 @@ def classify_peptides(reference,
         with open(alt_orf_translatome_fa, "w") as out:
             for rid, aa_seq in alt_orf_records.items():
                 out.write(f">{rid}\n{aa_seq}\n")
-        with open(alt_orf_orfeome_fa, "w") as out:
-            for rid, aa_seq in alt_orf_records.items():
-                out.write(f">{rid}\n{aa_seq}\n")
-    # Build index for ORFeome (no aa2nt needed)
+    # Build index for alt ORF translatome (no aa2nt needed)
     if build_fast_index:
         build_proteome_index(
-            alt_orf_orfeome_fa,
+            alt_orf_translatome_fa,
             alt_orf_idx_dir,
             L_min=index_L_min,
             L_max=index_L_max,
@@ -1106,13 +1239,13 @@ def classify_peptides(reference,
             frame = int(frame_str)
         except ValueError:
             return None, None, None, None
         nt0 = frame + aa_pos * 3
         if tx_id not in transcriptome:
             return None, None, None, None
         if nt0 < 0 or nt0 >= len(transcriptome[tx_id]):
             return None, None, None, None
         if tx_id not in cds_bounds:
             region = "lncRNA"
         else:
@@ -1122,7 +1255,6 @@ def classify_peptides(reference,
             elif nt0 >= cds_end:
                 region = "dORF"
             else:
-                # inside CDS bounds: decide by frame
                 if ((nt0 - cds_start) % 3) == 0:
                     region = "CDS"
                 else:
@@ -1147,7 +1279,7 @@ def classify_peptides(reference,
     # -------------------------- amino acid mismatch ------------------------------
     mismatch_hit_records, peptides_remaining = classify_with_index(
         peptides_remaining,
-        alt_orf_orfeome_fa,
+        alt_orf_translatome_fa,
         alt_orf_idx_dir,
         coord_resolver=coord_resolver_altorf,
         step_label="amino acid mismatch search",

{darkprofiler-0.2.2 → darkprofiler-0.2.3/src/darkprofiler.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: darkprofiler
-Version: 0.2.2
+Version: 0.2.3
 Summary: DarkProfiler: Alignment and Classification of Peptides from Reference-Independent De Novo Peptide Sequencing Experiments.
 Author-email: Hanjun Lee <hanjun@alum.mit.edu>
 License: MIT
@@ -244,7 +244,6 @@ The database contains translated and derived proteomes as FASTA files:
 - `mutanome.fa`
 - `mutatedCanonicalTranscriptome.fa`
 - `mutatedAlternativeTranslatome.fa`
-- `mutatedAlternativeORFeome.fa`
 DarkProfiler also creates **persistent fast indices** under the same database directory to accelerate peptide search with Hamming distance:
 for example:
@@ -361,7 +360,7 @@ classify_peptides(
 - **ORF region labels**
   For alternative ORF hits, DarkProfiler labels the peptide start as:
   - `uORF` (upstream of CDS start)
-  - `intORF` (inside annotated CDS span)
+  - `intORF` (out-of-frame peptides from inside annotated CDS span)
   - `dORF` (downstream of CDS end)
   - `lncRNA` (no CDS annotation)

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/LICENSE.txt RENAMED Viewed

File without changes

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/setup.cfg RENAMED Viewed

File without changes

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/entry_points.txt RENAMED Viewed

File without changes

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/requires.txt RENAMED Viewed

File without changes

{darkprofiler-0.2.2 → darkprofiler-0.2.3}/src/darkprofiler.egg-info/top_level.txt RENAMED Viewed

File without changes

darkprofiler 0.2.2__tar.gz → 0.2.3__tar.gz

darkprofiler 0.2.2__tar.gz → 0.2.3__tar.gz