PyPI - debase - Versions diffs - 0.4.4__tar.gz → 0.5.0__tar.gz - Mend

debase 0.4.4.tar.gz → 0.5.0.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{debase-0.4.4/src/debase.egg-info → debase-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.4
+Version: 0.5.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.4.4 → debase-0.5.0}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.4.4"
+__version__ = "0.5.0"

{debase-0.4.4 → debase-0.5.0}/src/debase/cleanup_sequence.py RENAMED Viewed

@@ -30,6 +30,27 @@ except ImportError:  # pragma: no cover
 # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
 VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codons
+VALID_DNA_BASES = set("ACGT")
+# Genetic code table for DNA to amino acid translation
+GENETIC_CODE = {
+    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
+    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
+    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
+}
 # Gemini API configuration
 GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
@@ -182,6 +203,44 @@ class SequenceManipulator:
         """Validate that a sequence contains only valid amino acids."""
         return all(aa in VALID_AMINO_ACIDS for aa in seq.upper())
+    @staticmethod
+    def is_dna_sequence(seq: str) -> bool:
+        """Check if a sequence is DNA (contains only ACGT)."""
+        seq_upper = seq.upper().replace(" ", "").replace("\n", "")
+        return all(base in VALID_DNA_BASES for base in seq_upper) and len(seq_upper) > 0
+    @staticmethod
+    def translate_dna_to_protein(dna_seq: str) -> str:
+        """Translate DNA sequence to protein sequence.
+        Args:
+            dna_seq: DNA sequence string
+        Returns:
+            Protein sequence string
+        """
+        # Clean the DNA sequence
+        dna_seq = dna_seq.upper().replace(" ", "").replace("\n", "")
+        # Check if sequence length is multiple of 3
+        if len(dna_seq) % 3 != 0:
+            log.warning(f"DNA sequence length ({len(dna_seq)}) is not a multiple of 3. Truncating to nearest codon.")
+            dna_seq = dna_seq[:-(len(dna_seq) % 3)]
+        protein_seq = []
+        for i in range(0, len(dna_seq), 3):
+            codon = dna_seq[i:i+3]
+            if len(codon) == 3:
+                # Handle unknown codons (with N or other non-standard bases)
+                if codon in GENETIC_CODE:
+                    protein_seq.append(GENETIC_CODE[codon])
+                else:
+                    # If codon contains non-standard bases, add 'X' for unknown amino acid
+                    protein_seq.append('X')
+                    log.debug(f"Unknown codon '{codon}' at position {i}, using 'X' for unknown amino acid")
+        return ''.join(protein_seq)
     @staticmethod
     def determine_indexing(parent_seq: str, mutations: List[Mutation]) -> int:
         """Determine whether mutations use 0-based or 1-based indexing."""
@@ -1141,6 +1200,9 @@ class SequenceProcessor:
         # Detect and handle column format automatically
         self._normalize_columns()
+        # Translate DNA sequences to protein sequences if needed
+        self._translate_dna_sequences()
         log.info(
             f"Loaded {len(self.df)} rows, "
             f"{sum(self.df['protein_sequence'].str.strip() == '')} empty sequences"
@@ -1153,6 +1215,67 @@ class SequenceProcessor:
         # Initialize generator
         self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
+    def _translate_dna_sequences(self) -> None:
+        """Translate DNA sequences to protein sequences if no amino acid sequences exist."""
+        manipulator = SequenceManipulator()
+        # First check if ANY sequences are amino acid sequences
+        has_amino_acid = False
+        for idx, row in self.df.iterrows():
+            seq = str(row.get("protein_sequence", "")).strip()
+            if seq and seq.lower() not in ["nan", "none", ""]:
+                if not manipulator.is_dna_sequence(seq):
+                    has_amino_acid = True
+                    break
+        # If we found amino acid sequences, don't translate anything
+        if has_amino_acid:
+            log.info("Found amino acid sequences in data, skipping DNA translation")
+            return
+        # No amino acid sequences found, check for DNA sequences in dna_seq column
+        if "dna_seq" in self.df.columns:
+            dna_count = 0
+            for idx, row in self.df.iterrows():
+                protein_seq = str(row.get("protein_sequence", "")).strip()
+                dna_seq = str(row.get("dna_seq", "")).strip()
+                # If protein_sequence is empty but dna_seq has content, translate it
+                if (not protein_seq or protein_seq.lower() in ["nan", "none", ""]) and \
+                   (dna_seq and dna_seq.lower() not in ["nan", "none", ""]):
+                    if manipulator.is_dna_sequence(dna_seq):
+                        # Translate DNA to protein
+                        translated_seq = manipulator.translate_dna_to_protein(dna_seq)
+                        self.df.at[idx, "protein_sequence"] = translated_seq
+                        # Add flag to indicate this was translated from DNA
+                        if "flag" not in self.df.columns:
+                            self.df["flag"] = ""
+                        existing_flag = str(self.df.at[idx, "flag"]).strip()
+                        self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
+                        dna_count += 1
+            if dna_count > 0:
+                log.info(f"Translated {dna_count} DNA sequences from dna_seq column to protein sequences")
+        # Also check if DNA sequences are mistakenly in protein_sequence column
+        dna_count = 0
+        for idx, row in self.df.iterrows():
+            seq = str(row.get("protein_sequence", "")).strip()
+            if seq and seq.lower() not in ["nan", "none", ""]:
+                if manipulator.is_dna_sequence(seq):
+                    # Translate DNA to protein
+                    protein_seq = manipulator.translate_dna_to_protein(seq)
+                    self.df.at[idx, "protein_sequence"] = protein_seq
+                    # Add flag to indicate this was translated from DNA
+                    existing_flag = str(self.df.at[idx, "flag"]).strip()
+                    self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
+                    dna_count += 1
+        if dna_count > 0:
+            log.info(f"Translated {dna_count} DNA sequences to protein sequences")
     def _normalize_columns(self) -> None:
         """Automatically detect and normalize column names from different formats."""
         # Check if this is enzyme_lineage_extractor format

debase 0.4.4.tar.gz → 0.5.0.tar.gz

debase 0.4.4.tar.gz → 0.5.0.tar.gz