debase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,808 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ lineage_flattener.py
4
+ ====================
5
+ A **complete rewrite** of the original `lineage_format.py`, structured in the
6
+ same sectioned style as `enzyme_lineage_extractor.py`, but **without any non-ASCII
7
+ characters**. All input and output column names are declared once as top-level
8
+ constants to prevent accidental drift.
9
+
10
+ The tool reads an annotated CSV containing enzyme variant information (lineage,
11
+ sequences, reaction data, fitness, etc.) and produces a flat reaction table
12
+ (one row per product) suitable for robotic plate builders or downstream ML.
13
+
14
+ -------------------------------------------------------------------------------
15
+ SECTION GUIDE (grep-able):
16
+ # === 1. CONFIG & CONSTANTS ===
17
+ # === 2. DOMAIN MODELS ===
18
+ # === 3. LOGGING HELPERS ===
19
+ # === 4. CACHE & DB HELPERS ===
20
+ # === 5. SEQUENCE / MUTATION HELPERS ===
21
+ # === 6. SMILES CONVERSION HELPERS ===
22
+ # === 7. FLATTENING CORE ===
23
+ # === 8. PIPELINE ORCHESTRATOR ===
24
+ # === 9. CLI ENTRYPOINT ===
25
+ -------------------------------------------------------------------------------
26
+ """
27
+
28
+ # === 1. CONFIG & CONSTANTS ===================================================
29
+ from __future__ import annotations
30
+
31
+ import argparse
32
+ import csv
33
+ import json
34
+ import logging
35
+ import os
36
+ import pickle
37
+ import re
38
+ import sqlite3
39
+ import sys
40
+ import time
41
+ from concurrent.futures import ThreadPoolExecutor, as_completed
42
+ from dataclasses import dataclass, field
43
+ from pathlib import Path
44
+ from typing import Dict, List, Optional, Sequence, Tuple, Union
45
+
46
+ import pandas as pd
47
+ from tqdm import tqdm
48
+
49
+ try:
50
+ from rdkit import Chem # type: ignore
51
+ RDKIT_OK = True
52
+ except ImportError: # pragma: no cover
53
+ RDKIT_OK = False
54
+
55
# Input columns that MUST be present ------------------------------------------------
# VariantRecord.__post_init__ raises ValueError when a row lacks any of these keys.
INPUT_REQUIRED: Tuple[str, ...] = (
    "enzyme_id",
    "substrate_iupac_list",  # preferred source for SMILES lookup
    "product_iupac_list",  # preferred source for SMILES lookup
)

# Alternative column names that can be used instead
# Maps alias -> canonical name; the alias is applied only when the canonical
# column is absent.
COLUMN_ALIASES: Dict[str, str] = {
    "enzyme": "enzyme_id",  # Handle 'enzyme' as an alias for 'enzyme_id'
}

# Optional but recognized input fields ----------------------------------------------
# Columns listed here (or in INPUT_REQUIRED) are left out of the JSON blob that
# flatten_dataframe stores in "additional_information"; every other input
# column is copied into that blob.
OPTIONAL_INPUT: Tuple[str, ...] = (
    "parent_enzyme_id",
    "generation",
    "protein_sequence",
    "aa_sequence",
    "nucleotide_sequence",
    "nt_sequence",
    "ttn",
    "yield",
    "reaction_temperature",
    "reaction_ph",
    "reaction_other_conditions",
    "reaction_substrate_concentration",
    "cofactor_iupac_list",
    "cofactor_list",
    "ee",
    "data_type",  # either "lineage" or "substrate_scope"
    "substrate",  # fallback names
    "substrate_name",
    "compound",
    "product",
    "product_name",
)
91
+
92
# Output columns --------------------------------------------------------------------
# Column order of the DataFrame returned by flatten_dataframe. The names match
# the fields of FlatRow one-to-one.
OUTPUT_COLUMNS: Tuple[str, ...] = (
    "id",
    "barcode_plate",
    "plate",
    "well",
    "smiles_string",
    "smiles_reaction",
    "alignment_count",
    "alignment_probability",
    "nucleotide_mutation",
    "amino_acid_substitutions",
    "nt_sequence",
    "aa_sequence",
    "x_coordinate",
    "y_coordinate",
    "fitness_value",
    "cofactor",
    "reaction_condition",
    "ee",
    "additional_information",
)
114
+
115
# Plate layout constants -------------------------------------------------------------
# _plate_and_well starts a new plate every PLATE_SIZE rows and numbers plates
# from BARCODE_START.
PLATE_SIZE: int = 96
BARCODE_START: int = 1

# Batch / parallelism ----------------------------------------------------------------
# MAX_WORKERS sizes the thread pool in _batch_convert (twice the CPU count,
# capped at 32; 4 CPUs are assumed when os.cpu_count() returns None).
MAX_WORKERS: int = min(32, (os.cpu_count() or 4) * 2)
# NOTE(review): BATCH_SIZE is not referenced anywhere in the visible source.
BATCH_SIZE: int = 50

# Cache files ------------------------------------------------------------------------
# The directory can be overridden with the LINEAGE_CACHE_DIR environment variable.
CACHE_DIR: Path = Path(os.environ.get("LINEAGE_CACHE_DIR", "./.cache"))
SMILES_CACHE_FILE: Path = CACHE_DIR / "smiles_cache.pkl"
SUBSTRATE_CACHE_FILE: Path = CACHE_DIR / "substrate_smiles_cache.pkl"
CANONICAL_CACHE_FILE: Path = CACHE_DIR / "canonical_smiles_cache.pkl"
# NOTE: executed at import time, so merely importing this module creates the
# cache directory (relative to the current working directory by default).
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Local PubChem DB (optional) --------------------------------------------------------
# Resolved relative to this file: two levels above the directory containing it,
# then data/iupac2smiles.db.
PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"

# Miscellaneous ----------------------------------------------------------------------
WELL_ROWS: str = "ABCDEFGH"  # 8 rows, 12 cols => 96 wells
135
+
136
+
137
+ # === 2. DOMAIN MODELS ===============================================================
138
@dataclass
class VariantRecord:
    """Minimal representation of an enzyme variant row from the input CSV.

    The record wraps one row dict and exposes normalised accessors. Missing
    cells (``None``, NaN, or the literal text ``"nan"``) are reported as an
    empty string so that fallbacks between alternative columns actually work.

    Attributes are documented inline below.

    Raises:
        ValueError: from ``__post_init__`` when a required column is absent.
    """

    # Raw row mapping (column name -> cell value); aliases are added in place.
    row: Dict[str, str]

    def __post_init__(self) -> None:
        # Apply column aliases
        for alias, canonical in COLUMN_ALIASES.items():
            if alias in self.row and canonical not in self.row:
                self.row[canonical] = self.row[alias]

        missing = [c for c in INPUT_REQUIRED if c not in self.row]
        if missing:
            raise ValueError(f"Missing required columns: {', '.join(missing)}")

    # Normalisation helpers ---------------------------------------------------------
    @staticmethod
    def _clean(value: object) -> str:
        """Return *value* as stripped text, or "" when the cell is missing."""
        if value is None:
            return ""
        try:
            if pd.isna(value):
                return ""
        except (TypeError, ValueError):
            # Non-scalar values cannot be tested for NaN; fall through to str().
            pass
        text = str(value).strip()
        return "" if text.lower() == "nan" else text

    def _first_present(self, *columns: str) -> str:
        """Return the first non-missing value among *columns*, else ""."""
        for column in columns:
            text = self._clean(self.row.get(column))
            if text:
                return text
        return ""

    # Convenience accessors ---------------------------------------------------------
    @property
    def eid(self) -> str:
        return str(self.row["enzyme_id"]).strip()

    @property
    def parent_id(self) -> str:
        return self._clean(self.row.get("parent_enzyme_id"))

    @property
    def generation(self) -> str:
        return self._clean(self.row.get("generation"))

    @property
    def aa_seq(self) -> str:
        # str(NaN) is the truthy string "nan", so a plain `or` chain would
        # never reach the second column; test each cell for missingness.
        return self._first_present("protein_sequence", "aa_sequence")

    @property
    def nt_seq(self) -> str:
        # First try to get actual NT sequence
        nt = self._first_present("nucleotide_sequence", "nt_sequence")
        if nt:
            return nt

        # If no NT sequence but we have AA sequence, reverse translate
        aa = self.aa_seq
        return _rev_translate(aa) if aa else ""

    # Reaction-related -------------------------------------------------------------
    def substrate_iupac(self) -> List[str]:
        raw = str(self.row.get("substrate_iupac_list", "")).strip()
        return _split_list(raw)

    def product_iupac(self) -> List[str]:
        raw = str(self.row.get("product_iupac_list", "")).strip()
        return _split_list(raw)

    def ttn_or_yield(self) -> Optional[float]:
        """Return the first usable number from ``ttn`` then ``yield``, else None."""
        for col in ("ttn", "yield"):
            val = self.row.get(col)
            if val is None or pd.isna(val):
                continue
            try:
                number = float(val)
            except (ValueError, TypeError):
                continue
            # float("nan") parses successfully but is not a usable fitness.
            if number == number:
                return number
        return None
207
+
208
+
209
@dataclass
class FlatRow:
    """One record of the flat output table.

    The first six fields are mandatory; the rest default to neutral values.
    No validation is performed when an instance is created.
    """

    id: str
    barcode_plate: int
    plate: str
    well: str
    smiles_string: str
    smiles_reaction: str
    alignment_count: int = 1
    alignment_probability: float = 1.0
    nucleotide_mutation: str = ""
    amino_acid_substitutions: str = ""
    nt_sequence: str = ""
    aa_sequence: str = ""
    x_coordinate: str = ""
    y_coordinate: str = ""
    fitness_value: Optional[float] = None
    cofactor: str = ""
    reaction_condition: str = ""
    ee: str = ""
    additional_information: str = ""

    def as_dict(self) -> Dict[str, str]:
        """Return the fields as a dict in declaration order, with None -> ""."""
        exported = {}
        # __dataclass_fields__ preserves declaration order, which is the
        # order of the output columns.
        for field_name in self.__dataclass_fields__:
            value = getattr(self, field_name)
            # Empty string instead of None keeps the CSV free of "None" text.
            exported[field_name] = "" if value is None else value
        return exported
257
+
258
+
259
+ # === 3. LOGGING HELPERS =============================================================
260
+
261
def get_logger(name: str = __name__) -> logging.Logger:
    """Return the logger called *name*, configuring it on first use.

    A logger that already has handlers is returned untouched. Otherwise a
    stream handler with a timestamped format is attached and the level is
    set to INFO.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured; avoid stacking a second handler.
        return logger

    formatter = logging.Formatter(
        fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    stream = logging.StreamHandler()
    stream.setFormatter(formatter)
    logger.addHandler(stream)
    logger.setLevel(logging.INFO)
    return logger
270
+
271
# Module-wide logger used by every helper below.
log = get_logger(__name__)
272
+
273
+
274
+ # === 4. CACHE & DB HELPERS ==========================================================
275
+
276
+ def _load_pickle(path: Path) -> Dict[str, str]:
277
+ if path.exists():
278
+ try:
279
+ with path.open("rb") as fh:
280
+ return pickle.load(fh)
281
+ except Exception as exc: # pragma: no cover
282
+ log.warning("Could not read cache %s: %s", path, exc)
283
+ return {}
284
+
285
+
286
+ def _save_pickle(obj: Dict[str, str], path: Path) -> None:
287
+ try:
288
+ with path.open("wb") as fh:
289
+ pickle.dump(obj, fh)
290
+ except Exception as exc: # pragma: no cover
291
+ log.warning("Could not write cache %s: %s", path, exc)
292
+
293
+
294
# In-memory caches, loaded from disk at import time.
# Within the visible code only CANONICAL_CACHE is read and written (by
# _canonical_smiles); the calls that would update and persist the other two
# are commented out in flatten_dataframe.
SMILES_CACHE: Dict[str, str] = _load_pickle(SMILES_CACHE_FILE)
SUBSTRATE_CACHE: Dict[str, str] = _load_pickle(SUBSTRATE_CACHE_FILE)
CANONICAL_CACHE: Dict[str, str] = _load_pickle(CANONICAL_CACHE_FILE)
297
+
298
+
299
+ # --- Database lookup ---------------------------------------------------------------
300
+ class PubChemDB:
301
+ """Very thin wrapper around a local SQLite mapping IUPAC -> SMILES."""
302
+
303
+ def __init__(self, path: Path | str) -> None:
304
+ self.path = Path(path)
305
+ self._conn: Optional[sqlite3.Connection] = None
306
+ if not self.path.exists():
307
+ log.warning("Local PubChem DB not found at %s", self.path)
308
+
309
+ def _connect(self) -> sqlite3.Connection:
310
+ if self._conn is None:
311
+ self._conn = sqlite3.connect(str(self.path))
312
+ return self._conn
313
+
314
+ def lookup(self, name: str) -> Optional[str]:
315
+ if not self.path.exists():
316
+ return None
317
+ sql = "SELECT smiles FROM x WHERE name = ? LIMIT 1"
318
+ try:
319
+ # Create a new connection for thread safety
320
+ conn = sqlite3.connect(str(self.path))
321
+ cur = conn.execute(sql, (name.lower(),))
322
+ row = cur.fetchone()
323
+ conn.close()
324
+ return row[0] if row else None
325
+ except Exception: # pragma: no cover
326
+ return None
327
+
328
+
329
# Shared lookup object used by _name_to_smiles; constructing it logs a warning
# when the database file is missing.
PC_DB = PubChemDB(PUBCHEM_DB_PATH)
330
+
331
+
332
+ # === 5. SEQUENCE / MUTATION HELPERS ================================================
333
+
334
# Genetic code for naive reverse translation --------------------------------
# Maps a one-letter amino-acid code (or "*" for stop) to a single DNA codon.
# Residues missing from this table are rendered as "NNN" by _rev_translate.
CODON: Dict[str, str] = {
    # One representative codon per amino acid (simplified)
    "A": "GCT", "R": "CGT", "N": "AAT", "D": "GAT", "C": "TGT", "Q": "CAA",
    "E": "GAA", "G": "GGT", "H": "CAT", "I": "ATT", "L": "CTT", "K": "AAA",
    "M": "ATG", "F": "TTT", "P": "CCT", "S": "TCT", "T": "ACT", "W": "TGG",
    "Y": "TAT", "V": "GTT", "*": "TAA",
}
342
+
343
+
344
+ def _rev_translate(aa: str) -> str:
345
+ """Rudimentary AA -> DNA translation (three-letter codon table above)."""
346
+ return "".join(CODON.get(res, "NNN") for res in aa)
347
+
348
+
349
+ def _aa_mut(parent: str, child: str) -> str:
350
+ """Return simple mutation descriptor P12V_P34L ... comparing AA sequences."""
351
+ mutations = []
352
+ for idx, (p, c) in enumerate(zip(parent, child), start=1):
353
+ if p != c:
354
+ mutations.append(f"{p}{idx}{c}")
355
+ return "_".join(mutations)
356
+
357
+
358
+ def _nt_mut(parent_aa: str, child_aa: str, parent_nt: str = "", child_nt: str = "") -> str:
359
+ """Return mutations at nucleotide level (uses reverse translation if needed)."""
360
+ if parent_nt and child_nt and len(parent_nt) > 0 and len(child_nt) > 0:
361
+ # Use actual nucleotide sequences if both are available
362
+ muts = []
363
+ for idx, (p, c) in enumerate(zip(parent_nt, child_nt), start=1):
364
+ if p != c:
365
+ muts.append(f"{p}{idx}{c}")
366
+ return "_".join(muts)
367
+ else:
368
+ # Fall back to reverse translation from protein sequences
369
+ p_seq = _rev_translate(parent_aa) if parent_aa else ""
370
+ c_seq = _rev_translate(child_aa) if child_aa else ""
371
+ muts = []
372
+ for idx, (p, c) in enumerate(zip(p_seq, c_seq), start=1):
373
+ if p != c:
374
+ muts.append(f"{p}{idx}{c}")
375
+ return "_".join(muts)
376
+
377
+
378
+ # === 6. SMILES CONVERSION HELPERS ==================================================
379
+
380
+ def _split_list(raw: str) -> List[str]:
381
+ if not raw or str(raw).lower() == 'nan':
382
+ return []
383
+ return [s.strip() for s in raw.split(";") if s.strip() and s.strip().lower() != 'nan']
384
+
385
+
386
+ def _canonical_smiles(smiles: str) -> str:
387
+ if not smiles or not RDKIT_OK:
388
+ return smiles
389
+ if smiles in CANONICAL_CACHE:
390
+ return CANONICAL_CACHE[smiles]
391
+ try:
392
+ mol = Chem.MolFromSmiles(smiles) # type: ignore[attr-defined]
393
+ if mol:
394
+ canon = Chem.MolToSmiles(mol, canonical=True) # type: ignore[attr-defined]
395
+ CANONICAL_CACHE[smiles] = canon
396
+ return canon
397
+ except Exception: # pragma: no cover
398
+ pass
399
+ return smiles
400
+
401
+
402
+ def _name_to_smiles(name: str, is_substrate: bool) -> str:
403
+ """Convert IUPAC (preferred) or plain name to SMILES with multi-tier lookup."""
404
+ # NO CACHING - Always try fresh conversion
405
+
406
+ # Filter out invalid values that shouldn't be converted
407
+ if not name or name.lower() in ['nan', 'none', 'null', 'n/a', 'na', '']:
408
+ return ""
409
+
410
+ # 1. Local DB (fast, offline)
411
+ db_smiles = PC_DB.lookup(name)
412
+ if db_smiles:
413
+ return db_smiles
414
+
415
+ # 2. OPSIN (if installed) ---------------------------------------------------
416
+ try:
417
+ import subprocess
418
+
419
+ # Use stdin to avoid shell interpretation issues with special characters
420
+ result = subprocess.run(
421
+ ["opsin", "-osmi"], input=name, capture_output=True, text=True, check=False
422
+ )
423
+ if result.returncode == 0 and result.stdout.strip():
424
+ # OPSIN output may include a header line, so get the last non-empty line
425
+ lines = [line.strip() for line in result.stdout.strip().split("\n") if line.strip()]
426
+ if lines:
427
+ opsin_smiles = lines[-1]
428
+ return opsin_smiles
429
+ except FileNotFoundError:
430
+ pass # OPSIN not installed
431
+
432
+ # 3. PubChem PUG REST (online) ---------------------------------------------
433
+ try:
434
+ import requests
435
+
436
+ url = (
437
+ f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{requests.utils.quote(name)}/property/IsomericSMILES/TXT"
438
+ )
439
+ resp = requests.get(url, timeout=10)
440
+ if resp.ok:
441
+ pug_smiles = resp.text.strip().split("\n")[0]
442
+ return pug_smiles
443
+ except Exception: # pragma: no cover
444
+ pass
445
+
446
+ # Return empty string if all methods fail
447
+ return ""
448
+
449
+
450
def _batch_convert(names: Sequence[str], is_substrate: bool) -> Dict[str, str]:
    """Resolve many names to SMILES concurrently.

    Each name is handed to ``_name_to_smiles`` on a thread pool of
    ``MAX_WORKERS`` threads while a progress bar tracks completion. Names
    whose conversion yields nothing or raises are left out of the result and
    noted at debug level.
    """
    converted: Dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        pending = {}
        for candidate in names:
            pending[executor.submit(_name_to_smiles, candidate, is_substrate)] = candidate

        progress = tqdm(as_completed(pending), total=len(pending), desc="SMILES")
        for finished in progress:
            label = pending[finished]
            try:
                smiles = finished.result()
            except Exception as exc:  # pragma: no cover
                log.debug("SMILES conversion exception for %s: %s", label, exc)
                continue
            # Only store successful conversions
            if smiles:
                converted[label] = smiles
            else:
                log.debug("SMILES conversion failed for %s", label)
    return converted
467
+
468
+
469
+ # === 7. FLATTENING CORE ============================================================
470
+
471
+ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
472
+ """Fill missing sequences in substrate scope entries from lineage entries."""
473
+ # Create lookup for sequences by enzyme_id
474
+ seq_lookup = {}
475
+
476
+ # First pass: collect all available sequences from lineage entries
477
+ for _, row in df.iterrows():
478
+ if row.get("data_type") == "lineage" or pd.notna(row.get("protein_sequence")) or pd.notna(row.get("aa_sequence")):
479
+ eid = str(row["enzyme_id"])
480
+ aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
481
+ nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", ""))
482
+ if aa_seq and aa_seq != "nan":
483
+ seq_lookup[eid] = {
484
+ "aa_sequence": aa_seq,
485
+ "nt_sequence": nt_seq if nt_seq != "nan" else ""
486
+ }
487
+
488
+ # Second pass: fill missing sequences in substrate scope entries
489
+ filled_count = 0
490
+ for idx, row in df.iterrows():
491
+ eid = str(row["enzyme_id"])
492
+
493
+ # Check if this row needs sequence filling
494
+ aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
495
+ if (not aa_seq or aa_seq == "nan") and eid in seq_lookup:
496
+ df.at[idx, "protein_sequence"] = seq_lookup[eid]["aa_sequence"]
497
+ df.at[idx, "aa_sequence"] = seq_lookup[eid]["aa_sequence"]
498
+ if seq_lookup[eid]["nt_sequence"]:
499
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
500
+ df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
501
+ filled_count += 1
502
+
503
+ if filled_count > 0:
504
+ log.info(f"Filled sequences for {filled_count} entries")
505
+
506
+ return df
507
+
508
def _plate_and_well(index: int) -> Tuple[int, str, str]:
    """Map a running row index to its plate number, plate name and well label.

    Plates hold ``PLATE_SIZE`` wells and are numbered from ``BARCODE_START``.
    Wells are assigned row by row, 12 per row, using the letters in
    ``WELL_ROWS`` and a two-digit column number (``A01`` ... ``H12``).
    """
    plates_before, slot = divmod(index, PLATE_SIZE)
    row_index, col_index = divmod(slot, 12)
    barcode = plates_before + BARCODE_START
    well_label = f"{WELL_ROWS[row_index]}{col_index + 1:02d}"
    return barcode, f"Plate_{barcode}", well_label
517
+
518
+
519
+ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
520
+ """Get root enzyme id, falling back to generation 0 ancestor or self."""
521
+ if eid in lineage_roots:
522
+ return lineage_roots[eid]
523
+ cur = eid
524
+ seen: set[str] = set()
525
+ while cur and cur not in seen:
526
+ seen.add(cur)
527
+ row = idmap.get(cur, {})
528
+ # Look for generation 0 as the root
529
+ if str(row.get("generation", "")).strip() == "0":
530
+ return cur
531
+ parent = row.get("parent_enzyme_id", "")
532
+ if not parent:
533
+ # If no parent, this is the root
534
+ return cur
535
+ cur = parent
536
+ return eid
537
+
538
+
539
+ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
540
+ """Infer lineage roots using generation numbers and simple sequence similarity."""
541
+ idmap: Dict[str, Dict[str, str]] = {str(r["enzyme_id"]): r for _, r in df.iterrows()}
542
+ roots: Dict[str, str] = {}
543
+ # Look for generation 0 as the root
544
+ gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "0"}
545
+ # If no gen0 found, fall back to gen1
546
+ if not gen0:
547
+ gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "1"}
548
+
549
+ def _seq_sim(a: str, b: str) -> float:
550
+ if not a or not b:
551
+ return 0.0
552
+ matches = sum(1 for x, y in zip(a, b) if x == y)
553
+ return matches / max(len(a), len(b))
554
+
555
+ for _, row in df.iterrows():
556
+ eid = row["enzyme_id"]
557
+ if eid in gen0:
558
+ roots[eid] = eid
559
+ continue
560
+ cur = eid
561
+ lineage_path: List[str] = []
562
+ while cur and cur not in lineage_path:
563
+ lineage_path.append(cur)
564
+ cur_row = idmap.get(cur, {})
565
+ parent = cur_row.get("parent_enzyme_id", "")
566
+ if not parent:
567
+ break
568
+ cur = parent
569
+ # If we found a gen0 ancestor in the path, use it
570
+ for anc in reversed(lineage_path):
571
+ if anc in gen0:
572
+ roots[eid] = anc
573
+ break
574
+ else:
575
+ # Fall back to closest by sequence similarity among gen0
576
+ aa_seq = (
577
+ str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
578
+ )
579
+ best_match = None
580
+ best_sim = 0.0
581
+ for g0 in gen0:
582
+ g0_row = idmap[g0]
583
+ g0_seq = (
584
+ str(g0_row.get("protein_sequence", ""))
585
+ or str(g0_row.get("aa_sequence", ""))
586
+ )
587
+ sim = _seq_sim(aa_seq, g0_seq)
588
+ if sim > best_sim:
589
+ best_sim, best_match = sim, g0
590
+ roots[eid] = best_match if best_match else eid
591
+ return roots
592
+
593
+
594
def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Main public API: returns a DataFrame in the flat output format.

    Steps: apply column aliases, fill missing sequences, infer lineage roots,
    convert all substrate/product names to SMILES in bulk, then emit one
    output row per input row that has reaction data (rows whose ``data_type``
    is ``"lineage"`` are kept with empty reaction fields).

    Missing cells (None/NaN/"nan") are written as empty strings rather than
    the text "nan".

    Args:
        df: Annotated input table containing the ``INPUT_REQUIRED`` columns
            (``enzyme`` is accepted in place of ``enzyme_id``).

    Returns:
        A new frame with the ``OUTPUT_COLUMNS`` columns.

    Raises:
        ValueError: when a row lacks a required column.
    """

    def _text(value: object) -> str:
        """Return *value* as stripped text, or "" for None/NaN/"nan"."""
        if value is None:
            return ""
        try:
            if pd.isna(value):
                return ""
        except (TypeError, ValueError):
            pass
        text = str(value).strip()
        return "" if text.lower() == "nan" else text

    def _first_text(record: Dict[str, str], *columns: str) -> str:
        """Return the first non-missing cell among *columns*, else ""."""
        for column in columns:
            text = _text(record.get(column))
            if text:
                return text
        return ""

    # Apply column aliases to the dataframe
    for alias, canonical in COLUMN_ALIASES.items():
        if alias in df.columns and canonical not in df.columns:
            df = df.rename(columns={alias: canonical})

    # Fill missing sequences in substrate scope entries from lineage data
    df = _fill_missing_sequences(df)

    # 1. Generate lineage roots once -----------------------------------------
    lineage_roots = _generate_lineage_roots(df)

    # Materialise the rows once; they are reused by every later step.
    records = [VariantRecord(r.to_dict()) for _, r in df.iterrows()]

    # 2. Precompute SMILES in bulk -------------------------------------------
    all_products: List[str] = []
    all_subs: List[str] = []
    for rec in records:
        all_products.extend(rec.product_iupac())
        all_subs.extend(rec.substrate_iupac())
    # Conversion results are intentionally not persisted to the on-disk caches.
    prod_cache = _batch_convert(list(set(all_products)), is_substrate=False)
    sub_cache = _batch_convert(list(set(all_subs)), is_substrate=True)

    # 3. Flatten rows ---------------------------------------------------------
    idmap = {str(rec.row["enzyme_id"]): rec.row for rec in records}
    known_columns = frozenset(INPUT_REQUIRED + OPTIONAL_INPUT)
    condition_fields = (
        "reaction_temperature",
        "reaction_ph",
        "reaction_other_conditions",
        "reaction_substrate_concentration",
    )
    output_rows: List[Dict[str, str]] = []
    # idx counts input rows, so a skipped row leaves its well unassigned.
    for idx, rec in enumerate(records):
        row = rec.row
        eid = rec.eid

        # Reaction data -------------------------------------------------------
        subs = rec.substrate_iupac()
        prods = rec.product_iupac()
        data_type = row.get("data_type", "")

        if not subs or not prods:
            # Skip entries without reaction info unless it's marked as lineage only
            if data_type == "lineage":
                subs, prods = [""], [""]  # placeholders
            else:
                log.debug("Skipping %s due to missing reaction data", eid)
                continue

        sub_smiles = [sub_cache.get(s, "") for s in subs]
        prod_smiles = [prod_cache.get(p, "") for p in prods]

        smiles_reaction = ".".join(sub_smiles) + " >> " + ".".join(prod_smiles)
        smiles_string = _canonical_smiles(".".join(prod_smiles))

        # Sequences -----------------------------------------------------------
        child_aa = _first_text(row, "protein_sequence", "aa_sequence")
        child_nt = _first_text(row, "nucleotide_sequence", "nt_sequence")
        if not child_nt and child_aa:
            child_nt = _rev_translate(child_aa)

        # Mutations -----------------------------------------------------------
        root_id = _root_enzyme_id(eid, idmap, lineage_roots)
        root_row = idmap.get(root_id)
        if root_row is None:
            root_row = idmap.get(str(root_id))
        if root_row is None:
            # Unknown root: compare the variant with itself (no mutations).
            log.debug("Root %r of %s not found; using the row itself", root_id, eid)
            root_row = row
        root_aa = _first_text(root_row, "protein_sequence", "aa_sequence")
        root_nt = _first_text(root_row, "nucleotide_sequence", "nt_sequence")
        # If root doesn't have NT sequence but has AA sequence, reverse translate
        if not root_nt and root_aa:
            root_nt = _rev_translate(root_aa)

        aa_muts = _aa_mut(root_aa, child_aa) if child_aa and root_aa else ""
        nt_muts = _nt_mut(root_aa, child_aa, root_nt, child_nt) if root_aa or root_nt else ""

        # Plate / well --------------------------------------------------------
        barcode_plate, plate_name, well = _plate_and_well(idx)

        # Reaction conditions -------------------------------------------------
        # Test for a real value rather than truthiness: NaN is truthy and a
        # legitimate 0 is falsy.
        reaction_condition = ";".join(
            f"{fld}:{row[fld]}" for fld in condition_fields if _text(row.get(fld))
        )

        # Cofactor (IUPAC list preferred, fallback plain list) ---------------
        cofactor = _first_text(row, "cofactor_iupac_list", "cofactor_list")

        # Additional info -----------------------------------------------------
        extra: Dict[str, str] = {
            k: str(v) for k, v in row.items() if k not in known_columns
        }
        fitness = rec.ttn_or_yield()
        if fitness is not None:
            ttn_val = row.get("ttn")
            extra["fitness_type"] = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
        additional_information = json.dumps(extra, separators=(",", ":")) if extra else ""

        flat = FlatRow(
            id=eid,
            barcode_plate=barcode_plate,
            plate=plate_name,
            well=well,
            smiles_string=smiles_string,
            smiles_reaction=smiles_reaction,
            nucleotide_mutation=nt_muts,
            amino_acid_substitutions=aa_muts,
            nt_sequence=child_nt,
            aa_sequence=child_aa,
            fitness_value=fitness,
            cofactor=cofactor,
            reaction_condition=reaction_condition,
            ee=_text(row.get("ee")),
            additional_information=additional_information,
        )
        output_rows.append(flat.as_dict())

    out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
    return out_df
718
+
719
+
720
+ # === 8. PIPELINE ORCHESTRATOR ======================================================
721
+
722
+ def run_pipeline(reaction_csv: str | Path | None = None,
723
+ substrate_scope_csv: str | Path | None = None,
724
+ output_csv: str | Path | None = None) -> pd.DataFrame:
725
+ """Run the pipeline on reaction and/or substrate scope CSV files.
726
+
727
+ Args:
728
+ reaction_csv: Path to reaction/lineage data CSV (optional)
729
+ substrate_scope_csv: Path to substrate scope data CSV (optional)
730
+ output_csv: Path to write the formatted output CSV
731
+
732
+ Returns:
733
+ DataFrame with flattened lineage data
734
+ """
735
+ t0 = time.perf_counter()
736
+
737
+ dfs = []
738
+
739
+ # Load reaction data if provided
740
+ if reaction_csv:
741
+ df_reaction = pd.read_csv(reaction_csv)
742
+ df_reaction['data_type'] = 'lineage'
743
+ # Handle column aliasing for reaction data
744
+ if 'enzyme' in df_reaction.columns and 'enzyme_id' not in df_reaction.columns:
745
+ df_reaction['enzyme_id'] = df_reaction['enzyme']
746
+ log.info("Loaded %d reaction entries from %s", len(df_reaction), reaction_csv)
747
+ dfs.append(df_reaction)
748
+
749
+ # Load substrate scope data if provided
750
+ if substrate_scope_csv:
751
+ df_substrate = pd.read_csv(substrate_scope_csv)
752
+ df_substrate['data_type'] = 'substrate_scope'
753
+ log.info("Loaded %d substrate scope entries from %s", len(df_substrate), substrate_scope_csv)
754
+ dfs.append(df_substrate)
755
+
756
+ if not dfs:
757
+ raise ValueError("At least one input CSV must be provided")
758
+
759
+ # Combine dataframes
760
+ if len(dfs) > 1:
761
+ df_in = pd.concat(dfs, ignore_index=True)
762
+ log.info("Combined data: %d total entries", len(df_in))
763
+ else:
764
+ df_in = dfs[0]
765
+
766
+ df_out = flatten_dataframe(df_in)
767
+ log.info("Flattened to %d rows", len(df_out))
768
+
769
+ if output_csv:
770
+ df_out.to_csv(output_csv, index=False)
771
+ log.info("Wrote output CSV to %s (%.1f kB)", output_csv, Path(output_csv).stat().st_size / 1024)
772
+
773
+ log.info("Pipeline finished in %.2f s", time.perf_counter() - t0)
774
+ return df_out
775
+
776
+
777
+ # === 9. CLI ENTRYPOINT =============================================================
778
+
779
+ def _build_arg_parser() -> argparse.ArgumentParser:
780
+ p = argparse.ArgumentParser(
781
+ prog="lineage_flattener",
782
+ description="Flatten enzyme lineage CSV into reaction table for automation",
783
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
784
+ )
785
+ p.add_argument("-r", "--reaction", help="Reaction/lineage data CSV file")
786
+ p.add_argument("-s", "--substrate-scope", help="Substrate scope data CSV file")
787
+ p.add_argument("-o", "--output", help="Path to write flattened CSV")
788
+ p.add_argument("-v", "--verbose", action="count", default=0, help="Increase verbosity (-v, -vv)")
789
+ return p
790
+
791
+
792
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Exits with status 1 when neither ``--reaction`` nor ``--substrate-scope``
    is given.
    """
    args = _build_arg_parser().parse_args(argv)
    if args.verbose > 1:
        level = logging.DEBUG
    elif args.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level, format="%(levelname)s: %(message)s")
    # get_logger pins the module logger to INFO with a handler of its own, so
    # the root configuration above never reaches it: apply the requested level
    # directly and stop propagation so each message is printed only once.
    log.setLevel(level)
    log.propagate = False

    if not args.reaction and not args.substrate_scope:
        log.error("At least one input file must be provided (--reaction or --substrate-scope)")
        sys.exit(1)

    run_pipeline(args.reaction, args.substrate_scope, args.output)
802
+
803
+
804
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
806
+
807
+ # --------------------------------------------------------------------------- END ---
808
+