debase 0.4.5__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {debase-0.4.5/src/debase.egg-info → debase-0.5.1}/PKG-INFO +1 -1
- {debase-0.4.5 → debase-0.5.1}/src/debase/_version.py +1 -1
- {debase-0.4.5 → debase-0.5.1}/src/debase/cleanup_sequence.py +123 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase/enzyme_lineage_extractor.py +243 -309
- {debase-0.4.5 → debase-0.5.1}/src/debase/reaction_info_extractor.py +192 -68
- {debase-0.4.5 → debase-0.5.1/src/debase.egg-info}/PKG-INFO +1 -1
- {debase-0.4.5 → debase-0.5.1}/.gitignore +0 -0
- {debase-0.4.5 → debase-0.5.1}/LICENSE +0 -0
- {debase-0.4.5 → debase-0.5.1}/MANIFEST.in +0 -0
- {debase-0.4.5 → debase-0.5.1}/README.md +0 -0
- {debase-0.4.5 → debase-0.5.1}/environment.yml +0 -0
- {debase-0.4.5 → debase-0.5.1}/pyproject.toml +0 -0
- {debase-0.4.5 → debase-0.5.1}/setup.cfg +0 -0
- {debase-0.4.5 → debase-0.5.1}/setup.py +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/__init__.py +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase/__init__.py +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase/__main__.py +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase/build_db.py +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase/lineage_format.py +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase/substrate_scope_extractor.py +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase/wrapper.py +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase.egg-info/SOURCES.txt +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.4.5 → debase-0.5.1}/src/debase.egg-info/top_level.txt +0 -0
@@ -30,6 +30,27 @@ except ImportError: # pragma: no cover
|
|
30
30
|
# === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
|
31
31
|
|
32
32
|
# One-letter residue codes accepted in protein sequences; "*" marks a stop codon.
VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY") | {"*"}
# Nucleotide letters accepted in DNA sequences.
VALID_DNA_BASES = {"A", "C", "G", "T"}

# Genetic code table for DNA to amino acid translation ("*" = stop codon).
# Codons are enumerated with T, C, A, G at each position (third base varying
# fastest) and paired with the amino-acid string written in the same order,
# one 16-letter group per first base.
GENETIC_CODE = dict(
    zip(
        (a + b + c for a in "TCAG" for b in "TCAG" for c in "TCAG"),
        "FFLLSSSSYY**CC*W"    # first base T
        "LLLLPPPPHHQQRRRR"    # first base C
        "IIIMTTTTNNKKSSRR"    # first base A
        "VVVVAAAADDEEGGGG",   # first base G
    )
)
|
33
54
|
|
34
55
|
# Gemini API configuration
|
35
56
|
GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
|
@@ -182,6 +203,44 @@ class SequenceManipulator:
|
|
182
203
|
"""Validate that a sequence contains only valid amino acids."""
|
183
204
|
return all(aa in VALID_AMINO_ACIDS for aa in seq.upper())
|
184
205
|
|
206
|
+
@staticmethod
def is_dna_sequence(seq: str) -> bool:
    """Report whether *seq* consists solely of the letters A, C, G and T.

    Case is ignored, and spaces and newlines are removed before the check.
    A sequence that is empty after that clean-up is not considered DNA.
    """
    cleaned = seq.upper().replace(" ", "").replace("\n", "")
    if not cleaned:
        return False
    # Every character must be one of the allowed bases.
    return set(cleaned) <= VALID_DNA_BASES
|
211
|
+
|
212
|
+
@staticmethod
def translate_dna_to_protein(dna_seq: str) -> str:
    """Translate DNA sequence to protein sequence.

    The input is upper-cased and all whitespace (spaces, tabs, CR/LF) is
    removed before translation. Trailing bases that do not form a complete
    codon are dropped and a warning is logged. Codons that are not present
    in GENETIC_CODE (for example ones containing ``N``) are translated to
    ``X``. Stop codons are emitted as ``*`` and translation continues past
    them.

    Args:
        dna_seq: DNA sequence string

    Returns:
        Protein sequence string
    """
    # str.split() with no separator splits on any whitespace run, so wrapped
    # or Windows-formatted input cannot leave stray characters that would
    # shift the reading frame.
    dna_seq = "".join(dna_seq.upper().split())

    # Only whole codons can be translated; drop the incomplete tail.
    remainder = len(dna_seq) % 3
    if remainder:
        log.warning(f"DNA sequence length ({len(dna_seq)}) is not a multiple of 3. Truncating to nearest codon.")
        dna_seq = dna_seq[:-remainder]

    protein_seq = []
    for i in range(0, len(dna_seq), 3):
        codon = dna_seq[i:i + 3]
        amino_acid = GENETIC_CODE.get(codon)
        if amino_acid is None:
            # Codon contains non-standard bases: use 'X' for unknown amino acid.
            log.debug(f"Unknown codon '{codon}' at position {i}, using 'X' for unknown amino acid")
            amino_acid = 'X'
        protein_seq.append(amino_acid)

    return ''.join(protein_seq)
|
243
|
+
|
185
244
|
@staticmethod
|
186
245
|
def determine_indexing(parent_seq: str, mutations: List[Mutation]) -> int:
|
187
246
|
"""Determine whether mutations use 0-based or 1-based indexing."""
|
@@ -1141,6 +1200,9 @@ class SequenceProcessor:
|
|
1141
1200
|
# Detect and handle column format automatically
|
1142
1201
|
self._normalize_columns()
|
1143
1202
|
|
1203
|
+
# Translate DNA sequences to protein sequences if needed
|
1204
|
+
self._translate_dna_sequences()
|
1205
|
+
|
1144
1206
|
log.info(
|
1145
1207
|
f"Loaded {len(self.df)} rows, "
|
1146
1208
|
f"{sum(self.df['protein_sequence'].str.strip() == '')} empty sequences"
|
@@ -1153,6 +1215,67 @@ class SequenceProcessor:
|
|
1153
1215
|
# Initialize generator
|
1154
1216
|
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
1155
1217
|
|
1218
|
+
def _translate_dna_sequences(self) -> None:
    """Translate DNA sequences to protein sequences if no amino acid sequences exist.

    Nothing is changed when any non-empty ``protein_sequence`` value is not
    pure DNA. Otherwise two passes are made over ``self.df``:

    1. If a ``dna_seq`` column exists, rows with an empty
       ``protein_sequence`` and DNA in ``dna_seq`` get the translation
       written to ``protein_sequence``.
    2. Rows whose ``protein_sequence`` itself holds DNA are translated in
       place. Rows already filled by pass 1 are skipped so a translated
       protein is never translated a second time.

    Every translated row gets ``dna_translated`` appended to its ``flag``
    cell; the ``flag`` column is created when it does not exist yet.
    """
    manipulator = SequenceManipulator()

    def cell_text(row, column: str) -> str:
        """Return the cell as stripped text, with nan/none markers mapped to ''."""
        text = str(row.get(column, "")).strip()
        return "" if text.lower() in ("nan", "none") else text

    # If ANY sequence is an amino acid sequence, don't translate anything.
    for _, row in self.df.iterrows():
        seq = cell_text(row, "protein_sequence")
        if seq and not manipulator.is_dna_sequence(seq):
            log.info("Found amino acid sequences in data, skipping DNA translation")
            return

    # Index labels translated in pass 1; pass 2 must leave these rows alone.
    translated_rows = set()

    # Pass 1: fill empty protein_sequence cells from the dna_seq column.
    if "dna_seq" in self.df.columns:
        # Collect first and write afterwards so the frame is not modified
        # while iterrows() is still running over it.
        pending = []
        for idx, row in self.df.iterrows():
            if cell_text(row, "protein_sequence"):
                continue
            dna_seq = cell_text(row, "dna_seq")
            if dna_seq and manipulator.is_dna_sequence(dna_seq):
                pending.append((idx, manipulator.translate_dna_to_protein(dna_seq)))

        for idx, translated_seq in pending:
            self.df.at[idx, "protein_sequence"] = translated_seq
            self._flag_dna_translated(idx)
            translated_rows.add(idx)

        if pending:
            log.info(f"Translated {len(pending)} DNA sequences from dna_seq column to protein sequences")

    # Pass 2: DNA sequences mistakenly stored in the protein_sequence column.
    pending = []
    for idx, row in self.df.iterrows():
        if idx in translated_rows:
            continue
        seq = cell_text(row, "protein_sequence")
        if seq and manipulator.is_dna_sequence(seq):
            pending.append((idx, manipulator.translate_dna_to_protein(seq)))

    for idx, translated_seq in pending:
        self.df.at[idx, "protein_sequence"] = translated_seq
        self._flag_dna_translated(idx)

    if pending:
        log.info(f"Translated {len(pending)} DNA sequences to protein sequences")

def _flag_dna_translated(self, idx) -> None:
    """Append the ``dna_translated`` marker to the ``flag`` cell of row *idx*."""
    # The column may be absent when the input never carried flags.
    if "flag" not in self.df.columns:
        self.df["flag"] = ""
    existing_flag = str(self.df.at[idx, "flag"]).strip()
    # A missing value stringifies to "nan"/"None"; don't leak that into the flag.
    if existing_flag.lower() in ("nan", "none"):
        existing_flag = ""
    self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
|
1278
|
+
|
1156
1279
|
def _normalize_columns(self) -> None:
|
1157
1280
|
"""Automatically detect and normalize column names from different formats."""
|
1158
1281
|
# Check if this is enzyme_lineage_extractor format
|