debase 0.4.4__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {debase-0.4.4/src/debase.egg-info → debase-0.5.0}/PKG-INFO +1 -1
  2. {debase-0.4.4 → debase-0.5.0}/src/debase/_version.py +1 -1
  3. {debase-0.4.4 → debase-0.5.0}/src/debase/cleanup_sequence.py +123 -0
  4. {debase-0.4.4 → debase-0.5.0}/src/debase/enzyme_lineage_extractor.py +254 -315
  5. {debase-0.4.4 → debase-0.5.0}/src/debase/lineage_format.py +22 -18
  6. {debase-0.4.4 → debase-0.5.0}/src/debase/reaction_info_extractor.py +180 -62
  7. {debase-0.4.4 → debase-0.5.0}/src/debase/substrate_scope_extractor.py +3 -2
  8. {debase-0.4.4 → debase-0.5.0/src/debase.egg-info}/PKG-INFO +1 -1
  9. {debase-0.4.4 → debase-0.5.0}/.gitignore +0 -0
  10. {debase-0.4.4 → debase-0.5.0}/LICENSE +0 -0
  11. {debase-0.4.4 → debase-0.5.0}/MANIFEST.in +0 -0
  12. {debase-0.4.4 → debase-0.5.0}/README.md +0 -0
  13. {debase-0.4.4 → debase-0.5.0}/environment.yml +0 -0
  14. {debase-0.4.4 → debase-0.5.0}/pyproject.toml +0 -0
  15. {debase-0.4.4 → debase-0.5.0}/setup.cfg +0 -0
  16. {debase-0.4.4 → debase-0.5.0}/setup.py +0 -0
  17. {debase-0.4.4 → debase-0.5.0}/src/__init__.py +0 -0
  18. {debase-0.4.4 → debase-0.5.0}/src/debase/__init__.py +0 -0
  19. {debase-0.4.4 → debase-0.5.0}/src/debase/__main__.py +0 -0
  20. {debase-0.4.4 → debase-0.5.0}/src/debase/build_db.py +0 -0
  21. {debase-0.4.4 → debase-0.5.0}/src/debase/wrapper.py +0 -0
  22. {debase-0.4.4 → debase-0.5.0}/src/debase.egg-info/SOURCES.txt +0 -0
  23. {debase-0.4.4 → debase-0.5.0}/src/debase.egg-info/dependency_links.txt +0 -0
  24. {debase-0.4.4 → debase-0.5.0}/src/debase.egg-info/entry_points.txt +0 -0
  25. {debase-0.4.4 → debase-0.5.0}/src/debase.egg-info/requires.txt +0 -0
  26. {debase-0.4.4 → debase-0.5.0}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.4.4
3
+ Version: 0.5.0
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.4.4"
3
+ __version__ = "0.5.0"
@@ -30,6 +30,27 @@ except ImportError: # pragma: no cover
30
30
  # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
31
31
 
32
32
  VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*") # Include * for stop codons
33
+ VALID_DNA_BASES = set("ACGT")
34
+
35
+ # Genetic code table for DNA to amino acid translation
36
+ GENETIC_CODE = {
37
+ 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
38
+ 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
39
+ 'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
40
+ 'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
41
+ 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
42
+ 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
43
+ 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
44
+ 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
45
+ 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
46
+ 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
47
+ 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
48
+ 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
49
+ 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
50
+ 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
51
+ 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
52
+ 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
53
+ }
33
54
 
34
55
  # Gemini API configuration
35
56
  GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
@@ -182,6 +203,44 @@ class SequenceManipulator:
182
203
  """Validate that a sequence contains only valid amino acids."""
183
204
  return all(aa in VALID_AMINO_ACIDS for aa in seq.upper())
184
205
 
206
+ @staticmethod
207
+ def is_dna_sequence(seq: str) -> bool:
208
+ """Check if a sequence is DNA (contains only ACGT)."""
209
+ seq_upper = seq.upper().replace(" ", "").replace("\n", "")
210
+ return all(base in VALID_DNA_BASES for base in seq_upper) and len(seq_upper) > 0
211
+
212
+ @staticmethod
213
+ def translate_dna_to_protein(dna_seq: str) -> str:
214
+ """Translate DNA sequence to protein sequence.
215
+
216
+ Args:
217
+ dna_seq: DNA sequence string
218
+
219
+ Returns:
220
+ Protein sequence string
221
+ """
222
+ # Clean the DNA sequence
223
+ dna_seq = dna_seq.upper().replace(" ", "").replace("\n", "")
224
+
225
+ # Check if sequence length is multiple of 3
226
+ if len(dna_seq) % 3 != 0:
227
+ log.warning(f"DNA sequence length ({len(dna_seq)}) is not a multiple of 3. Truncating to nearest codon.")
228
+ dna_seq = dna_seq[:-(len(dna_seq) % 3)]
229
+
230
+ protein_seq = []
231
+ for i in range(0, len(dna_seq), 3):
232
+ codon = dna_seq[i:i+3]
233
+ if len(codon) == 3:
234
+ # Handle unknown codons (with N or other non-standard bases)
235
+ if codon in GENETIC_CODE:
236
+ protein_seq.append(GENETIC_CODE[codon])
237
+ else:
238
+ # If codon contains non-standard bases, add 'X' for unknown amino acid
239
+ protein_seq.append('X')
240
+ log.debug(f"Unknown codon '{codon}' at position {i}, using 'X' for unknown amino acid")
241
+
242
+ return ''.join(protein_seq)
243
+
185
244
  @staticmethod
186
245
  def determine_indexing(parent_seq: str, mutations: List[Mutation]) -> int:
187
246
  """Determine whether mutations use 0-based or 1-based indexing."""
@@ -1141,6 +1200,9 @@ class SequenceProcessor:
1141
1200
  # Detect and handle column format automatically
1142
1201
  self._normalize_columns()
1143
1202
 
1203
+ # Translate DNA sequences to protein sequences if needed
1204
+ self._translate_dna_sequences()
1205
+
1144
1206
  log.info(
1145
1207
  f"Loaded {len(self.df)} rows, "
1146
1208
  f"{sum(self.df['protein_sequence'].str.strip() == '')} empty sequences"
@@ -1153,6 +1215,67 @@ class SequenceProcessor:
1153
1215
  # Initialize generator
1154
1216
  self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
1155
1217
 
1218
+ def _translate_dna_sequences(self) -> None:
1219
+ """Translate DNA sequences to protein sequences if no amino acid sequences exist."""
1220
+ manipulator = SequenceManipulator()
1221
+
1222
+ # First check if ANY sequences are amino acid sequences
1223
+ has_amino_acid = False
1224
+ for idx, row in self.df.iterrows():
1225
+ seq = str(row.get("protein_sequence", "")).strip()
1226
+ if seq and seq.lower() not in ["nan", "none", ""]:
1227
+ if not manipulator.is_dna_sequence(seq):
1228
+ has_amino_acid = True
1229
+ break
1230
+
1231
+ # If we found amino acid sequences, don't translate anything
1232
+ if has_amino_acid:
1233
+ log.info("Found amino acid sequences in data, skipping DNA translation")
1234
+ return
1235
+
1236
+ # No amino acid sequences found, check for DNA sequences in dna_seq column
1237
+ if "dna_seq" in self.df.columns:
1238
+ dna_count = 0
1239
+ for idx, row in self.df.iterrows():
1240
+ protein_seq = str(row.get("protein_sequence", "")).strip()
1241
+ dna_seq = str(row.get("dna_seq", "")).strip()
1242
+
1243
+ # If protein_sequence is empty but dna_seq has content, translate it
1244
+ if (not protein_seq or protein_seq.lower() in ["nan", "none", ""]) and \
1245
+ (dna_seq and dna_seq.lower() not in ["nan", "none", ""]):
1246
+ if manipulator.is_dna_sequence(dna_seq):
1247
+ # Translate DNA to protein
1248
+ translated_seq = manipulator.translate_dna_to_protein(dna_seq)
1249
+ self.df.at[idx, "protein_sequence"] = translated_seq
1250
+
1251
+ # Add flag to indicate this was translated from DNA
1252
+ if "flag" not in self.df.columns:
1253
+ self.df["flag"] = ""
1254
+ existing_flag = str(self.df.at[idx, "flag"]).strip()
1255
+ self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
1256
+ dna_count += 1
1257
+
1258
+ if dna_count > 0:
1259
+ log.info(f"Translated {dna_count} DNA sequences from dna_seq column to protein sequences")
1260
+
1261
+ # Also check if DNA sequences are mistakenly in protein_sequence column
1262
+ dna_count = 0
1263
+ for idx, row in self.df.iterrows():
1264
+ seq = str(row.get("protein_sequence", "")).strip()
1265
+ if seq and seq.lower() not in ["nan", "none", ""]:
1266
+ if manipulator.is_dna_sequence(seq):
1267
+ # Translate DNA to protein
1268
+ protein_seq = manipulator.translate_dna_to_protein(seq)
1269
+ self.df.at[idx, "protein_sequence"] = protein_seq
1270
+
1271
+ # Add flag to indicate this was translated from DNA
1272
+ existing_flag = str(self.df.at[idx, "flag"]).strip()
1273
+ self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
1274
+ dna_count += 1
1275
+
1276
+ if dna_count > 0:
1277
+ log.info(f"Translated {dna_count} DNA sequences to protein sequences")
1278
+
1156
1279
  def _normalize_columns(self) -> None:
1157
1280
  """Automatically detect and normalize column names from different formats."""
1158
1281
  # Check if this is enzyme_lineage_extractor format