debase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/PIPELINE_FLOW.md +100 -0
- debase/__init__.py +18 -0
- debase/__main__.py +9 -0
- debase/_version.py +3 -0
- debase/build_db.py +190 -0
- debase/cleanup_sequence.py +905 -0
- debase/enzyme_lineage_extractor.py +2169 -0
- debase/lineage_format.py +808 -0
- debase/reaction_info_extractor.py +2331 -0
- debase/substrate_scope_extractor.py +2039 -0
- debase/wrapper.py +303 -0
- debase-0.1.0.dist-info/METADATA +299 -0
- debase-0.1.0.dist-info/RECORD +17 -0
- debase-0.1.0.dist-info/WHEEL +5 -0
- debase-0.1.0.dist-info/entry_points.txt +2 -0
- debase-0.1.0.dist-info/licenses/LICENSE +21 -0
- debase-0.1.0.dist-info/top_level.txt +1 -0
debase/lineage_format.py
ADDED
@@ -0,0 +1,808 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
lineage_flattener.py
|
4
|
+
====================
|
5
|
+
A **complete rewrite** of the original `lineage_format.py`, structured in the
|
6
|
+
same sectioned style as `enzyme_lineage_extractor.py`, but **without any non-ASCII
|
7
|
+
characters**. All input and output column names are declared once as top-level
|
8
|
+
constants to prevent accidental drift.
|
9
|
+
|
10
|
+
The tool reads an annotated CSV containing enzyme variant information (lineage,
|
11
|
+
sequences, reaction data, fitness, etc.) and produces a flat reaction table
|
12
|
+
(one row per product) suitable for robotic plate builders or downstream ML.
|
13
|
+
|
14
|
+
-------------------------------------------------------------------------------
|
15
|
+
SECTION GUIDE (grep-able):
|
16
|
+
# === 1. CONFIG & CONSTANTS ===
|
17
|
+
# === 2. DOMAIN MODELS ===
|
18
|
+
# === 3. LOGGING HELPERS ===
|
19
|
+
# === 4. CACHE & DB HELPERS ===
|
20
|
+
# === 5. SEQUENCE / MUTATION HELPERS ===
|
21
|
+
# === 6. SMILES CONVERSION HELPERS ===
|
22
|
+
# === 7. FLATTENING CORE ===
|
23
|
+
# === 8. PIPELINE ORCHESTRATOR ===
|
24
|
+
# === 9. CLI ENTRYPOINT ===
|
25
|
+
-------------------------------------------------------------------------------
|
26
|
+
"""
|
27
|
+
|
28
|
+
# === 1. CONFIG & CONSTANTS ===================================================
|
29
|
+
from __future__ import annotations
|
30
|
+
|
31
|
+
import argparse
|
32
|
+
import csv
|
33
|
+
import json
|
34
|
+
import logging
|
35
|
+
import os
|
36
|
+
import pickle
|
37
|
+
import re
|
38
|
+
import sqlite3
|
39
|
+
import sys
|
40
|
+
import time
|
41
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
42
|
+
from dataclasses import dataclass, field
|
43
|
+
from pathlib import Path
|
44
|
+
from typing import Dict, List, Optional, Sequence, Tuple, Union
|
45
|
+
|
46
|
+
import pandas as pd
|
47
|
+
from tqdm import tqdm
|
48
|
+
|
49
|
+
try:
|
50
|
+
from rdkit import Chem # type: ignore
|
51
|
+
RDKIT_OK = True
|
52
|
+
except ImportError: # pragma: no cover
|
53
|
+
RDKIT_OK = False
|
54
|
+
|
55
|
+
# Input columns that MUST be present ------------------------------------------------
|
56
|
+
INPUT_REQUIRED: Tuple[str, ...] = (
|
57
|
+
"enzyme_id",
|
58
|
+
"substrate_iupac_list", # preferred source for SMILES lookup
|
59
|
+
"product_iupac_list", # preferred source for SMILES lookup
|
60
|
+
)
|
61
|
+
|
62
|
+
# Alternative column names that can be used instead
|
63
|
+
COLUMN_ALIASES: Dict[str, str] = {
|
64
|
+
"enzyme": "enzyme_id", # Handle 'enzyme' as an alias for 'enzyme_id'
|
65
|
+
}
|
66
|
+
|
67
|
+
# Optional but recognized input fields ----------------------------------------------
|
68
|
+
OPTIONAL_INPUT: Tuple[str, ...] = (
|
69
|
+
"parent_enzyme_id",
|
70
|
+
"generation",
|
71
|
+
"protein_sequence",
|
72
|
+
"aa_sequence",
|
73
|
+
"nucleotide_sequence",
|
74
|
+
"nt_sequence",
|
75
|
+
"ttn",
|
76
|
+
"yield",
|
77
|
+
"reaction_temperature",
|
78
|
+
"reaction_ph",
|
79
|
+
"reaction_other_conditions",
|
80
|
+
"reaction_substrate_concentration",
|
81
|
+
"cofactor_iupac_list",
|
82
|
+
"cofactor_list",
|
83
|
+
"ee",
|
84
|
+
"data_type", # either "lineage" or "substrate_scope"
|
85
|
+
"substrate", # fallback names
|
86
|
+
"substrate_name",
|
87
|
+
"compound",
|
88
|
+
"product",
|
89
|
+
"product_name",
|
90
|
+
)
|
91
|
+
|
92
|
+
# Output columns --------------------------------------------------------------------
|
93
|
+
OUTPUT_COLUMNS: Tuple[str, ...] = (
|
94
|
+
"id",
|
95
|
+
"barcode_plate",
|
96
|
+
"plate",
|
97
|
+
"well",
|
98
|
+
"smiles_string",
|
99
|
+
"smiles_reaction",
|
100
|
+
"alignment_count",
|
101
|
+
"alignment_probability",
|
102
|
+
"nucleotide_mutation",
|
103
|
+
"amino_acid_substitutions",
|
104
|
+
"nt_sequence",
|
105
|
+
"aa_sequence",
|
106
|
+
"x_coordinate",
|
107
|
+
"y_coordinate",
|
108
|
+
"fitness_value",
|
109
|
+
"cofactor",
|
110
|
+
"reaction_condition",
|
111
|
+
"ee",
|
112
|
+
"additional_information",
|
113
|
+
)
|
114
|
+
|
115
|
+
# Plate layout constants -------------------------------------------------------------
|
116
|
+
PLATE_SIZE: int = 96
|
117
|
+
BARCODE_START: int = 1
|
118
|
+
|
119
|
+
# Batch / parallelism ----------------------------------------------------------------
|
120
|
+
MAX_WORKERS: int = min(32, (os.cpu_count() or 4) * 2)
|
121
|
+
BATCH_SIZE: int = 50
|
122
|
+
|
123
|
+
# Cache files ------------------------------------------------------------------------
|
124
|
+
CACHE_DIR: Path = Path(os.environ.get("LINEAGE_CACHE_DIR", "./.cache"))
|
125
|
+
SMILES_CACHE_FILE: Path = CACHE_DIR / "smiles_cache.pkl"
|
126
|
+
SUBSTRATE_CACHE_FILE: Path = CACHE_DIR / "substrate_smiles_cache.pkl"
|
127
|
+
CANONICAL_CACHE_FILE: Path = CACHE_DIR / "canonical_smiles_cache.pkl"
|
128
|
+
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
129
|
+
|
130
|
+
# Local PubChem DB (optional) --------------------------------------------------------
|
131
|
+
PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"
|
132
|
+
|
133
|
+
# Miscellaneous ----------------------------------------------------------------------
|
134
|
+
WELL_ROWS: str = "ABCDEFGH" # 8 rows, 12 cols => 96 wells
|
135
|
+
|
136
|
+
|
137
|
+
# === 2. DOMAIN MODELS ===============================================================
|
138
|
+
@dataclass
class VariantRecord:
    """One enzyme-variant row from the input CSV, with convenience accessors.

    ``row`` is the raw column -> value mapping.  On construction, every alias
    in ``COLUMN_ALIASES`` is copied onto its canonical name (when the
    canonical name is absent) and ``ValueError`` is raised if any column in
    ``INPUT_REQUIRED`` is missing.

    Missing cells (``None``, float NaN, or the literal text "nan") are treated
    as empty by the text accessors, so a NaN in the first candidate column no
    longer hides a real value in the fallback column.
    """

    row: Dict[str, str]

    def __post_init__(self) -> None:
        # Apply column aliases
        for alias, canonical in COLUMN_ALIASES.items():
            if alias in self.row and canonical not in self.row:
                self.row[canonical] = self.row[alias]

        missing = [c for c in INPUT_REQUIRED if c not in self.row]
        if missing:
            raise ValueError(f"Missing required columns: {', '.join(missing)}")

    def _text(self, *columns: str) -> str:
        """Return the first non-missing value among *columns*, stripped."""
        for col in columns:
            val = self.row.get(col)
            if val is None:
                continue
            text = str(val).strip()
            # str(float("nan")) == "nan": that is a missing cell, not data.
            if text and text.lower() != "nan":
                return text
        return ""

    # Convenience accessors ---------------------------------------------------------
    @property
    def eid(self) -> str:
        """Enzyme identifier, stripped of surrounding whitespace."""
        return str(self.row["enzyme_id"]).strip()

    @property
    def parent_id(self) -> str:
        """Parent enzyme identifier, or "" when absent/missing."""
        return self._text("parent_enzyme_id")

    @property
    def generation(self) -> str:
        """Generation label as text, or "" when absent/missing."""
        return self._text("generation")

    @property
    def aa_seq(self) -> str:
        """Protein sequence from ``protein_sequence`` or ``aa_sequence``."""
        return self._text("protein_sequence", "aa_sequence")

    @property
    def nt_seq(self) -> str:
        """Nucleotide sequence; reverse-translated from ``aa_seq`` if absent."""
        nt = self._text("nucleotide_sequence", "nt_sequence")
        if not nt:
            aa = self.aa_seq
            if aa:
                nt = _rev_translate(aa)
        return nt

    # Reaction-related -------------------------------------------------------------
    def substrate_iupac(self) -> List[str]:
        """Return the ';'-separated entries of ``substrate_iupac_list``."""
        raw = str(self.row.get("substrate_iupac_list", "")).strip()
        return _split_list(raw)

    def product_iupac(self) -> List[str]:
        """Return the ';'-separated entries of ``product_iupac_list``."""
        raw = str(self.row.get("product_iupac_list", "")).strip()
        return _split_list(raw)

    def ttn_or_yield(self) -> Optional[float]:
        """Return ``ttn`` as a float, else ``yield`` as a float, else None."""
        for col in ("ttn", "yield"):
            val = self.row.get(col)
            if val is None or pd.isna(val):
                continue
            try:
                number = float(val)
            except (ValueError, TypeError):
                continue
            if number != number:
                # The text "nan" parses to NaN; treat it as missing too.
                continue
            return number
        return None
|
207
|
+
|
208
|
+
|
209
|
+
@dataclass
class FlatRow:
    """A single record of the flat output table.

    The first six fields are mandatory; the remaining ones default to
    neutral values.
    """

    id: str
    barcode_plate: int
    plate: str
    well: str
    smiles_string: str
    smiles_reaction: str
    alignment_count: int = 1
    alignment_probability: float = 1.0
    nucleotide_mutation: str = ""
    amino_acid_substitutions: str = ""
    nt_sequence: str = ""
    aa_sequence: str = ""
    x_coordinate: str = ""
    y_coordinate: str = ""
    fitness_value: Optional[float] = None
    cofactor: str = ""
    reaction_condition: str = ""
    ee: str = ""
    additional_information: str = ""

    def as_dict(self) -> Dict[str, str]:
        """Return the fields in declaration order, with None replaced by ""."""
        record = {}
        # __dataclass_fields__ preserves declaration order, which is the
        # order the output columns are listed in.
        for field_name in self.__dataclass_fields__:
            value = getattr(self, field_name)
            record[field_name] = "" if value is None else value
        return record
|
257
|
+
|
258
|
+
|
259
|
+
# === 3. LOGGING HELPERS =============================================================
|
260
|
+
|
261
|
+
def get_logger(name: str = __name__) -> logging.Logger:
    """Return the logger called *name*, configuring it on first use.

    A logger that already has handlers is returned untouched.  Otherwise a
    stream handler with a timestamped format is attached and the level is
    set to INFO.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        return logger

    formatter = logging.Formatter(
        fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    stream = logging.StreamHandler()
    stream.setFormatter(formatter)
    logger.addHandler(stream)
    logger.setLevel(logging.INFO)
    return logger
|
270
|
+
|
271
|
+
log = get_logger(__name__)
|
272
|
+
|
273
|
+
|
274
|
+
# === 4. CACHE & DB HELPERS ==========================================================
|
275
|
+
|
276
|
+
def _load_pickle(path: Path) -> Dict[str, str]:
    """Load a cache dictionary from *path*.

    Returns an empty dict when the file does not exist, cannot be read, or
    does not contain a dict (callers index and update the result as a
    mapping, so any other payload is discarded with a warning).

    NOTE(security): unpickling can execute arbitrary code.  Only point the
    cache directory at files this tool wrote itself.
    """
    if not path.exists():
        return {}
    try:
        with path.open("rb") as fh:
            data = pickle.load(fh)
    except Exception as exc:  # pragma: no cover
        log.warning("Could not read cache %s: %s", path, exc)
        return {}
    if not isinstance(data, dict):
        log.warning(
            "Ignoring cache %s: expected a dict, got %s", path, type(data).__name__
        )
        return {}
    return data
|
284
|
+
|
285
|
+
|
286
|
+
def _save_pickle(obj: Dict[str, str], path: Path) -> None:
    """Pickle *obj* to *path*; failures are logged, never raised.

    The data is written to a sibling ``<name>.tmp`` file and then renamed
    over *path*, so an interrupted write cannot leave a truncated cache
    file behind.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with tmp_path.open("wb") as fh:
            pickle.dump(obj, fh)
        tmp_path.replace(path)
    except Exception as exc:  # pragma: no cover
        log.warning("Could not write cache %s: %s", path, exc)
        try:
            tmp_path.unlink()
        except OSError:
            # Nothing to clean up (the temp file was never created).
            pass
|
292
|
+
|
293
|
+
|
294
|
+
SMILES_CACHE: Dict[str, str] = _load_pickle(SMILES_CACHE_FILE)
|
295
|
+
SUBSTRATE_CACHE: Dict[str, str] = _load_pickle(SUBSTRATE_CACHE_FILE)
|
296
|
+
CANONICAL_CACHE: Dict[str, str] = _load_pickle(CANONICAL_CACHE_FILE)
|
297
|
+
|
298
|
+
|
299
|
+
# --- Database lookup ---------------------------------------------------------------
|
300
|
+
class PubChemDB:
    """Very thin wrapper around a local SQLite mapping IUPAC -> SMILES."""

    def __init__(self, path: Union[Path, str]) -> None:
        """Remember *path*; warn (but do not fail) if the file is absent."""
        self.path = Path(path)
        self._conn: Optional[sqlite3.Connection] = None
        if not self.path.exists():
            log.warning("Local PubChem DB not found at %s", self.path)

    def _connect(self) -> sqlite3.Connection:
        """Return a lazily created connection cached on the instance.

        ``lookup`` does not use this; it opens its own connection per call.
        """
        if self._conn is None:
            self._conn = sqlite3.connect(str(self.path))
        return self._conn

    def lookup(self, name: str) -> Optional[str]:
        """Return the SMILES stored for *name* (lower-cased), or None.

        None is returned when the database file is missing, when no row
        matches, or when the query fails for any reason.
        """
        if not self.path.exists():
            return None
        sql = "SELECT smiles FROM x WHERE name = ? LIMIT 1"
        try:
            # Create a new connection for thread safety
            conn = sqlite3.connect(str(self.path))
            try:
                row = conn.execute(sql, (name.lower(),)).fetchone()
            finally:
                # Close even when the query raises, so no handle is leaked.
                conn.close()
        except Exception as exc:  # pragma: no cover
            log.debug("PubChem DB lookup failed for %r: %s", name, exc)
            return None
        return row[0] if row else None
|
327
|
+
|
328
|
+
|
329
|
+
PC_DB = PubChemDB(PUBCHEM_DB_PATH)
|
330
|
+
|
331
|
+
|
332
|
+
# === 5. SEQUENCE / MUTATION HELPERS ================================================
|
333
|
+
|
334
|
+
# Genetic code for naive reverse translation --------------------------------
|
335
|
+
CODON: Dict[str, str] = {
    # One representative codon per amino acid (simplified)
    "A": "GCT", "R": "CGT", "N": "AAT", "D": "GAT", "C": "TGT", "Q": "CAA",
    "E": "GAA", "G": "GGT", "H": "CAT", "I": "ATT", "L": "CTT", "K": "AAA",
    "M": "ATG", "F": "TTT", "P": "CCT", "S": "TCT", "T": "ACT", "W": "TGG",
    "Y": "TAT", "V": "GTT", "*": "TAA",
}


def _rev_translate(aa: str) -> str:
    """Rudimentary AA -> DNA translation using the one-letter table above.

    Residues are matched case-insensitively; any symbol without an entry in
    ``CODON`` is emitted as "NNN".
    """
    return "".join(CODON.get(res, "NNN") for res in aa.upper())
|
347
|
+
|
348
|
+
|
349
|
+
def _aa_mut(parent: str, child: str) -> str:
    """Describe substitutions between two AA sequences, e.g. ``K2R_V3A``.

    Positions are 1-based.  Only the overlapping prefix of the two
    sequences is compared.
    """
    aligned = enumerate(zip(parent, child), start=1)
    changes = [f"{ref}{pos}{alt}" for pos, (ref, alt) in aligned if ref != alt]
    return "_".join(changes)
|
356
|
+
|
357
|
+
|
358
|
+
def _nt_mut(parent_aa: str, child_aa: str, parent_nt: str = "", child_nt: str = "") -> str:
    """Describe nucleotide substitutions between parent and child.

    When both nucleotide sequences are given they are compared directly;
    otherwise both sides are reverse-translated from the protein sequences.
    Positions are 1-based and only the overlapping prefix is compared.
    """
    if parent_nt and child_nt:
        # Use actual nucleotide sequences if both are available
        reference, variant = parent_nt, child_nt
    else:
        # Fall back to reverse translation from protein sequences
        reference = _rev_translate(parent_aa) if parent_aa else ""
        variant = _rev_translate(child_aa) if child_aa else ""

    return "_".join(
        f"{ref}{pos}{alt}"
        for pos, (ref, alt) in enumerate(zip(reference, variant), start=1)
        if ref != alt
    )
|
376
|
+
|
377
|
+
|
378
|
+
# === 6. SMILES CONVERSION HELPERS ==================================================
|
379
|
+
|
380
|
+
def _split_list(raw: str) -> List[str]:
    """Split a ';'-separated string into stripped, non-empty entries.

    An empty input, an input equal to "nan" (any case), and individual
    entries equal to "nan" (any case) all count as missing.
    """
    if not raw or str(raw).lower() == 'nan':
        return []
    entries: List[str] = []
    for piece in raw.split(";"):
        token = piece.strip()
        if token and token.lower() != 'nan':
            entries.append(token)
    return entries
|
384
|
+
|
385
|
+
|
386
|
+
def _canonical_smiles(smiles: str) -> str:
    """Return the RDKit canonical form of *smiles*, memoised in CANONICAL_CACHE.

    The input is returned unchanged when it is empty, when RDKit is not
    available, when RDKit cannot parse it, or when RDKit raises.
    """
    if not smiles or not RDKIT_OK:
        return smiles

    try:
        return CANONICAL_CACHE[smiles]
    except KeyError:
        pass  # not canonicalised yet

    try:
        mol = Chem.MolFromSmiles(smiles)  # type: ignore[attr-defined]
        if mol:
            canonical = Chem.MolToSmiles(mol, canonical=True)  # type: ignore[attr-defined]
            CANONICAL_CACHE[smiles] = canonical
            return canonical
    except Exception:  # pragma: no cover
        pass
    return smiles
|
400
|
+
|
401
|
+
|
402
|
+
def _name_to_smiles(name: str, is_substrate: bool) -> str:
    """Convert IUPAC (preferred) or plain name to SMILES with multi-tier lookup.

    Lookup order: local PubChem SQLite DB, the ``opsin`` command-line tool,
    then the PubChem PUG REST service.  Returns "" when the name is a
    placeholder (empty, "nan", "none", "null", "n/a", "na") or when every
    tier fails.

    ``is_substrate`` is accepted for interface compatibility; it does not
    influence the lookup.
    """
    # NO CACHING - Always try fresh conversion
    import subprocess

    # Strip first so padded placeholders such as " n/a " are filtered too.
    name = (name or "").strip()
    if name.lower() in ('nan', 'none', 'null', 'n/a', 'na', ''):
        return ""

    # 1. Local DB (fast, offline)
    db_smiles = PC_DB.lookup(name)
    if db_smiles:
        return db_smiles

    # 2. OPSIN (if installed) ---------------------------------------------------
    try:
        # Use stdin to avoid shell interpretation issues with special characters.
        # The timeout keeps one stuck OPSIN process from blocking a worker forever.
        result = subprocess.run(
            ["opsin", "-osmi"],
            input=name,
            capture_output=True,
            text=True,
            check=False,
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError):
        # OPSIN not installed, not executable, or timed out: try the next tier.
        result = None
    if result is not None and result.returncode == 0:
        # OPSIN output may include a header line, so get the last non-empty line
        lines = [line.strip() for line in result.stdout.split("\n") if line.strip()]
        if lines:
            return lines[-1]

    # 3. PubChem PUG REST (online) ---------------------------------------------
    try:
        import requests

        url = (
            f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{requests.utils.quote(name)}/property/IsomericSMILES/TXT"
        )
        resp = requests.get(url, timeout=10)
        if resp.ok:
            pug_smiles = resp.text.strip().split("\n")[0]
            return pug_smiles
    except Exception:  # pragma: no cover
        pass

    # Return empty string if all methods fail
    return ""
|
448
|
+
|
449
|
+
|
450
|
+
def _batch_convert(names: Sequence[str], is_substrate: bool) -> Dict[str, str]:
    """Resolve *names* to SMILES concurrently.

    Returns a mapping containing only the names whose conversion produced a
    non-empty SMILES; failures and exceptions are logged at DEBUG level.
    """
    converted: Dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        pending = {}
        for candidate in names:
            pending[executor.submit(_name_to_smiles, candidate, is_substrate)] = candidate

        for finished in tqdm(as_completed(pending), total=len(pending), desc="SMILES"):
            label = pending[finished]
            try:
                smiles = finished.result()
            except Exception as exc:  # pragma: no cover
                log.debug("SMILES conversion exception for %s: %s", label, exc)
                continue
            # Only store successful conversions
            if smiles:
                converted[label] = smiles
            else:
                log.debug("SMILES conversion failed for %s", label)
    return converted
|
467
|
+
|
468
|
+
|
469
|
+
# === 7. FLATTENING CORE ============================================================
|
470
|
+
|
471
|
+
def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
    """Copy known sequences onto rows of the same enzyme that lack one.

    A row "has" a protein sequence when ``protein_sequence`` or
    ``aa_sequence`` holds a value other than None/NaN/"nan".  For every
    enzyme_id the last such row provides the reference sequences; rows of
    that enzyme without a protein sequence receive them in both AA columns
    and, when a nucleotide sequence is known, in both NT columns.

    The frame is modified in place and also returned.
    """
    aa_columns = ("protein_sequence", "aa_sequence")
    nt_columns = ("nucleotide_sequence", "nt_sequence")

    def _first_value(row, columns) -> str:
        # str(NaN) is the truthy text "nan", so test each column explicitly
        # instead of chaining str(...) or str(...).
        for col in columns:
            val = row.get(col)
            if val is None:
                continue
            text = str(val)
            stripped = text.strip()
            if stripped and stripped.lower() != "nan":
                return text
        return ""

    # First pass: collect all available sequences by enzyme_id
    seq_lookup = {}
    for _, row in df.iterrows():
        aa_seq = _first_value(row, aa_columns)
        if aa_seq:
            seq_lookup[str(row["enzyme_id"])] = {
                "aa_sequence": aa_seq,
                "nt_sequence": _first_value(row, nt_columns),
            }

    # Second pass: decide which rows need filling (applied after the loop so
    # the frame is not modified while it is being iterated)
    pending = []
    for idx, row in df.iterrows():
        if _first_value(row, aa_columns):
            continue
        known = seq_lookup.get(str(row["enzyme_id"]))
        if known is None:
            continue
        updates = {col: known["aa_sequence"] for col in aa_columns}
        if known["nt_sequence"]:
            updates.update({col: known["nt_sequence"] for col in nt_columns})
        pending.append((idx, updates))

    for idx, updates in pending:
        for col, value in updates.items():
            if col not in df.columns:
                df[col] = pd.Series(float("nan"), index=df.index, dtype=object)
            elif df[col].dtype.kind in "fiub":
                # An all-NaN column is read as float; make it able to hold text.
                df[col] = df[col].astype(object)
            df.at[idx, col] = value

    filled_count = len(pending)
    if filled_count > 0:
        log.info(f"Filled sequences for {filled_count} entries")

    return df
|
507
|
+
|
508
|
+
def _plate_and_well(index: int) -> Tuple[int, str, str]:
    """Return (barcode_plate, plate_name, well) for the given running index.

    Wells are filled row by row.  The number of columns per row is derived
    from ``PLATE_SIZE`` and ``WELL_ROWS`` (96 / 8 = 12 with the current
    constants) rather than hard-coded, so the layout follows the constants.
    """
    cols_per_row = PLATE_SIZE // len(WELL_ROWS)
    plate_offset, idx_in_plate = divmod(index, PLATE_SIZE)
    row_idx, col_idx = divmod(idx_in_plate, cols_per_row)

    plate_number = plate_offset + BARCODE_START
    well = f"{WELL_ROWS[row_idx]}{col_idx + 1:02d}"
    plate_name = f"Plate_{plate_number}"
    return plate_number, plate_name, well
|
517
|
+
|
518
|
+
|
519
|
+
def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
    """Get root enzyme id, falling back to generation 0 ancestor or self.

    ``lineage_roots`` is consulted first.  Otherwise the parent chain is
    followed through ``idmap`` until a generation-0 entry is found, or an
    entry has no usable parent (missing, NaN, or not present in ``idmap``);
    that entry is the root.  A cycle in the chain yields *eid* itself.
    """
    if eid in lineage_roots:
        return lineage_roots[eid]

    def _is_generation_zero(value) -> bool:
        # Accept "0" as well as 0 / 0.0 (a column containing NaN is read as float).
        try:
            return float(str(value).strip()) == 0.0
        except ValueError:
            return False

    cur = eid
    seen: set = set()
    while cur and cur not in seen:
        seen.add(cur)
        row = idmap.get(cur, {})
        if _is_generation_zero(row.get("generation", "")):
            return cur

        raw_parent = row.get("parent_enzyme_id", "")
        parent = "" if raw_parent is None else str(raw_parent)
        if parent not in idmap:
            parent = parent.strip()
        # A NaN cell stringifies to "nan"; that and unknown ids end the walk
        # here, so the returned root is always an id the caller can look up.
        if parent.lower() in ("", "nan") or parent not in idmap:
            return cur
        cur = parent
    return eid
|
537
|
+
|
538
|
+
|
539
|
+
def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
    """Infer lineage roots using generation numbers and simple sequence similarity.

    Returns a mapping ``str(enzyme_id) -> str(root enzyme_id)``.

    Founders are the rows with generation 0 (or generation 1 when no row has
    generation 0); numeric and textual generations are both accepted.  Each
    other enzyme is mapped to the most distant founder on its parent chain.
    If no founder is on the chain, it is mapped to the founder with the most
    similar protein sequence, or to itself when nothing matches.
    """

    def _text(value) -> str:
        # None, NaN and the literal "nan" all count as "no value".
        if value is None:
            return ""
        text = str(value).strip()
        return "" if text.lower() == "nan" else text

    def _generation(value) -> Optional[float]:
        try:
            return float(_text(value))
        except ValueError:
            return None

    def _aa(row) -> str:
        return _text(row.get("protein_sequence")) or _text(row.get("aa_sequence"))

    def _seq_sim(a: str, b: str) -> float:
        # Fraction of identical positions, relative to the longer sequence.
        if not a or not b:
            return 0.0
        matches = sum(1 for x, y in zip(a, b) if x == y)
        return matches / max(len(a), len(b))

    # Key everything by str(enzyme_id) so lookups agree with the keys.
    rows = [(str(r["enzyme_id"]), r) for _, r in df.iterrows()]
    idmap = dict(rows)

    founders = {eid for eid, r in rows if _generation(r.get("generation")) == 0}
    # If no gen0 found, fall back to gen1
    if not founders:
        founders = {eid for eid, r in rows if _generation(r.get("generation")) == 1}

    roots: Dict[str, str] = {}
    for eid, row in rows:
        if eid in founders:
            roots[eid] = eid
            continue

        # Walk up the parent chain, guarding against cycles.
        lineage_path: List[str] = []
        visited = set()
        cur = eid
        while cur and cur not in visited:
            visited.add(cur)
            lineage_path.append(cur)
            cur_row = idmap.get(cur)
            if cur_row is None:
                break
            raw_parent = cur_row.get("parent_enzyme_id")
            if not _text(raw_parent):
                break
            parent = str(raw_parent)
            cur = parent if parent in idmap else parent.strip()

        # If we found a founder in the path, use the one furthest up.
        root = next((anc for anc in reversed(lineage_path) if anc in founders), None)

        if root is None:
            # Fall back to closest by sequence similarity among founders.
            # Sorted so that ties are resolved the same way on every run.
            own_seq = _aa(row)
            best_sim = 0.0
            for candidate in sorted(founders):
                sim = _seq_sim(own_seq, _aa(idmap[candidate]))
                if sim > best_sim:
                    best_sim, root = sim, candidate

        roots[eid] = root if root else eid
    return roots
|
592
|
+
|
593
|
+
|
594
|
+
def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Main public API: returns a DataFrame in the flat output format.

    Steps: apply column aliases, fill missing sequences, infer lineage
    roots, convert all substrate/product names to SMILES in bulk, then emit
    one output row per input row that has reaction data (rows whose
    ``data_type`` is "lineage" are kept with empty placeholders).

    Missing cells (None, NaN, "nan") are treated as empty everywhere, so
    they no longer show up as the text "nan" in ``reaction_condition``,
    ``cofactor``, ``ee`` or the sequence comparison.

    Raises:
        ValueError: if a row lacks one of the required input columns.
    """

    def _clean(value) -> str:
        if value is None:
            return ""
        text = str(value).strip()
        return "" if text.lower() == "nan" else text

    def _first(record, *columns: str) -> str:
        for col in columns:
            text = _clean(record.get(col))
            if text:
                return text
        return ""

    # Apply column aliases to the dataframe
    for alias, canonical in COLUMN_ALIASES.items():
        if alias in df.columns and canonical not in df.columns:
            df = df.rename(columns={alias: canonical})

    # Fill missing sequences in substrate scope entries from lineage data
    df = _fill_missing_sequences(df)

    # 1. Generate lineage roots once -----------------------------------------
    lineage_roots = _generate_lineage_roots(df)

    # 2. Precompute SMILES in bulk -------------------------------------------
    records = [VariantRecord(r.to_dict()) for _, r in df.iterrows()]
    all_products = set()
    all_subs = set()
    for rec in records:
        all_products.update(rec.product_iupac())
        all_subs.update(rec.substrate_iupac())
    prod_cache = _batch_convert(list(all_products), is_substrate=False)
    sub_cache = _batch_convert(list(all_subs), is_substrate=True)

    # 3. Flatten rows ---------------------------------------------------------
    idmap = {str(rec.row["enzyme_id"]): rec.row for rec in records}
    known_columns = set(INPUT_REQUIRED) | set(OPTIONAL_INPUT)
    condition_fields = (
        "reaction_temperature",
        "reaction_ph",
        "reaction_other_conditions",
        "reaction_substrate_concentration",
    )
    output_rows: List[Dict[str, str]] = []
    for idx, rec in enumerate(records):
        row = rec.row
        eid = rec.eid

        # Reaction data -------------------------------------------------------
        subs = rec.substrate_iupac()
        prods = rec.product_iupac()
        if not subs or not prods:
            # Skip entries without reaction info unless it's marked as lineage only
            if row.get("data_type", "") == "lineage":
                subs, prods = [""], [""]  # placeholders
            else:
                log.debug("Skipping %s due to missing reaction data", eid)
                continue

        sub_smiles = [sub_cache.get(s, "") for s in subs]
        prod_smiles = [prod_cache.get(p, "") for p in prods]
        smiles_reaction = ".".join(sub_smiles) + " >> " + ".".join(prod_smiles)
        smiles_string = _canonical_smiles(".".join(prod_smiles))

        # Mutations -----------------------------------------------------------
        root_id = _root_enzyme_id(eid, idmap, lineage_roots)
        # Fall back to the row itself if the root id is not a known enzyme.
        root_row = idmap.get(str(root_id), row)
        root_aa = _first(root_row, "protein_sequence", "aa_sequence")
        root_nt = _first(root_row, "nucleotide_sequence", "nt_sequence")
        # If root doesn't have NT sequence but has AA sequence, reverse translate
        if not root_nt and root_aa:
            root_nt = _rev_translate(root_aa)

        child_aa = _first(row, "protein_sequence", "aa_sequence")
        child_nt = _first(row, "nucleotide_sequence", "nt_sequence")
        if not child_nt and child_aa:
            child_nt = _rev_translate(child_aa)

        aa_muts = _aa_mut(root_aa, child_aa) if child_aa and root_aa else ""
        nt_muts = _nt_mut(root_aa, child_aa, root_nt, child_nt) if root_aa or root_nt else ""

        # Plate / well --------------------------------------------------------
        # NOTE(review): idx counts input rows, so skipped rows leave unused
        # wells -- confirm whether wells should be numbered by emitted rows.
        barcode_plate, plate_name, well = _plate_and_well(idx)

        # Reaction conditions -------------------------------------------------
        cond_parts = []
        for fld in condition_fields:
            value = _clean(row.get(fld))
            if value:
                cond_parts.append(f"{fld}:{value}")
        reaction_condition = ";".join(cond_parts)

        # Cofactor (IUPAC list preferred, fallback plain list) ---------------
        cofactor = _first(row, "cofactor_iupac_list", "cofactor_list")

        # Fitness: first of ttn / yield that parses as a number ---------------
        fitness_value: Optional[float] = None
        fitness_type = ""
        for col in ("ttn", "yield"):
            val = row.get(col)
            if val is None or pd.isna(val):
                continue
            try:
                fitness_value = float(val)
            except (ValueError, TypeError):
                continue
            fitness_type = col
            break

        # Additional info -----------------------------------------------------
        extra: Dict[str, str] = {
            k: str(v) for k, v in row.items() if k not in known_columns
        }
        if fitness_value is not None:
            # Record the column the value actually came from.
            extra["fitness_type"] = fitness_type
        additional_information = json.dumps(extra, separators=(",", ":")) if extra else ""

        flat = FlatRow(
            id=eid,
            barcode_plate=barcode_plate,
            plate=plate_name,
            well=well,
            smiles_string=smiles_string,
            smiles_reaction=smiles_reaction,
            nucleotide_mutation=nt_muts,
            amino_acid_substitutions=aa_muts,
            nt_sequence=child_nt,
            aa_sequence=child_aa,
            fitness_value=fitness_value,
            cofactor=cofactor,
            reaction_condition=reaction_condition,
            ee=_clean(row.get("ee")),
            additional_information=additional_information,
        )
        output_rows.append(flat.as_dict())

    out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
    return out_df
|
718
|
+
|
719
|
+
|
720
|
+
# === 8. PIPELINE ORCHESTRATOR ======================================================
|
721
|
+
|
722
|
+
def run_pipeline(reaction_csv: Optional[Union[str, Path]] = None,
                 substrate_scope_csv: Optional[Union[str, Path]] = None,
                 output_csv: Optional[Union[str, Path]] = None) -> pd.DataFrame:
    """Run the pipeline on reaction and/or substrate scope CSV files.

    Column aliases (for example ``enzyme`` -> ``enzyme_id``) are applied to
    each input file before the files are combined, so rows from both files
    end up with a populated canonical column.

    Args:
        reaction_csv: Path to reaction/lineage data CSV (optional)
        substrate_scope_csv: Path to substrate scope data CSV (optional)
        output_csv: Path to write the formatted output CSV

    Returns:
        DataFrame with flattened lineage data

    Raises:
        ValueError: if neither input CSV is given.
    """
    t0 = time.perf_counter()

    def _load(path, data_type: str) -> pd.DataFrame:
        frame = pd.read_csv(path)
        frame['data_type'] = data_type
        # Copy alias columns onto their canonical names; the alias column is kept.
        for alias, canonical in COLUMN_ALIASES.items():
            if alias in frame.columns and canonical not in frame.columns:
                frame[canonical] = frame[alias]
        return frame

    dfs = []

    # Load reaction data if provided
    if reaction_csv:
        df_reaction = _load(reaction_csv, 'lineage')
        log.info("Loaded %d reaction entries from %s", len(df_reaction), reaction_csv)
        dfs.append(df_reaction)

    # Load substrate scope data if provided
    if substrate_scope_csv:
        df_substrate = _load(substrate_scope_csv, 'substrate_scope')
        log.info("Loaded %d substrate scope entries from %s", len(df_substrate), substrate_scope_csv)
        dfs.append(df_substrate)

    if not dfs:
        raise ValueError("At least one input CSV must be provided")

    # Combine dataframes
    if len(dfs) > 1:
        df_in = pd.concat(dfs, ignore_index=True)
        log.info("Combined data: %d total entries", len(df_in))
    else:
        df_in = dfs[0]

    df_out = flatten_dataframe(df_in)
    log.info("Flattened to %d rows", len(df_out))

    if output_csv:
        df_out.to_csv(output_csv, index=False)
        log.info("Wrote output CSV to %s (%.1f kB)", output_csv, Path(output_csv).stat().st_size / 1024)

    log.info("Pipeline finished in %.2f s", time.perf_counter() - t0)
    return df_out
|
775
|
+
|
776
|
+
|
777
|
+
# === 9. CLI ENTRYPOINT =============================================================
|
778
|
+
|
779
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser.

    Options: -r/--reaction, -s/--substrate-scope, -o/--output and a
    repeatable -v/--verbose counter.
    """
    parser = argparse.ArgumentParser(
        prog="lineage_flattener",
        description="Flatten enzyme lineage CSV into reaction table for automation",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    value_options = (
        ("-r", "--reaction", "Reaction/lineage data CSV file"),
        ("-s", "--substrate-scope", "Substrate scope data CSV file"),
        ("-o", "--output", "Path to write flattened CSV"),
    )
    for short_flag, long_flag, help_text in value_options:
        parser.add_argument(short_flag, long_flag, help=help_text)
    parser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Increase verbosity (-v, -vv)",
    )
    return parser
|
790
|
+
|
791
|
+
|
792
|
+
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    Parses *argv* (``sys.argv[1:]`` when None), maps the -v count to a log
    level (none: WARNING, -v: INFO, -vv or more: DEBUG) and runs the
    pipeline.  Exits with status 1 when no input file is given.
    """
    args = _build_arg_parser().parse_args(argv)
    if args.verbose > 1:
        level = logging.DEBUG
    elif args.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level, format="%(levelname)s: %(message)s")

    # The module logger carries its own handler and a fixed INFO level, so
    # the requested verbosity has to be applied to it directly; propagation
    # is switched off so each record is not printed a second time by the
    # root handler installed above.
    log.setLevel(level)
    log.propagate = False

    if not args.reaction and not args.substrate_scope:
        log.error("At least one input file must be provided (--reaction or --substrate-scope)")
        sys.exit(1)

    run_pipeline(args.reaction, args.substrate_scope, args.output)
|
802
|
+
|
803
|
+
|
804
|
+
if __name__ == "__main__":
|
805
|
+
main()
|
806
|
+
|
807
|
+
# --------------------------------------------------------------------------- END ---
|
808
|
+
|