debase 0.1.16__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/PIPELINE_FLOW.md ADDED
@@ -0,0 +1,100 @@
+ # DEBase Pipeline Flow
+
+ ## Overview
+ The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
+
+ ## Pipeline Architecture
+
+ ```
+ ┌─────────────────────┐     ┌─────────────────────┐
+ │   Manuscript PDF    │     │       SI PDF        │
+ └──────────┬──────────┘     └──────────┬──────────┘
+            │                           │
+            └──────────────┬────────────┘
+                           │
+                           ▼
+             ┌─────────────────────────────┐
+             │ 1. enzyme_lineage_extractor │
+             │  - Extract enzyme variants  │
+             │  - Parse mutations          │
+             │  - Get basic metadata       │
+             └─────────────┬───────────────┘
+                           │
+                           ▼
+             ┌─────────────────────────────┐
+             │ 2. cleanup_sequence         │
+             │  - Validate sequences       │
+             │  - Fix formatting issues    │
+             │  - Generate full sequences  │
+             └─────────────┬───────────────┘
+                           │
+             ┌─────────────┴──────────────┐
+             │                            │
+             ▼                            ▼
+ ┌─────────────────────────┐  ┌─────────────────────────┐
+ │ 3a. reaction_info       │  │ 3b. substrate_scope     │
+ │     _extractor          │  │     _extractor          │
+ │  - Performance metrics  │  │  - Substrate variations │
+ │  - Model reaction       │  │  - Additional variants  │
+ │  - Conditions           │  │  - Scope data           │
+ └───────────┬─────────────┘  └───────────┬─────────────┘
+             │                            │
+             └─────────────┬──────────────┘
+                           │
+                           ▼
+             ┌─────────────────────────────┐
+             │ 4. lineage_format_o3        │
+             │  - Merge all data           │
+             │  - Fill missing sequences   │
+             │  - Format final output      │
+             └─────────────┬───────────────┘
+                           │
+                           ▼
+                    ┌─────────────┐
+                    │  Final CSV  │
+                    └─────────────┘
+ ```
+
+ ## Module Details
+
+ ### 1. enzyme_lineage_extractor.py
+ - **Input**: Manuscript PDF, SI PDF
+ - **Output**: CSV with enzyme variants and mutations
+ - **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
+
+ ### 2. cleanup_sequence.py
+ - **Input**: Enzyme lineage CSV
+ - **Output**: CSV with validated sequences
+ - **Function**: Validates protein sequences, generates full sequences from mutations
+
+ ### 3a. reaction_info_extractor.py
+ - **Input**: PDFs + cleaned enzyme CSV
+ - **Output**: CSV with reaction performance data
+ - **Function**: Extracts yield, TTN, selectivity, and reaction conditions
+
+ ### 3b. substrate_scope_extractor.py
+ - **Input**: PDFs + cleaned enzyme CSV
+ - **Output**: CSV with substrate scope entries
+ - **Function**: Extracts substrate variations tested with different enzymes
+
+ ### 4. lineage_format_o3.py
+ - **Input**: Reaction CSV + substrate scope CSV
+ - **Output**: Final formatted CSV
+ - **Function**: Merges data, fills missing sequences, applies consistent formatting
+
+ ## Key Features
+
+ 1. **Modular Design**: Each step can be run independently
+ 2. **Parallel Extraction**: Steps 3a and 3b run independently
+ 3. **Error Recovery**: Pipeline can resume from any step
+ 4. **Clean Interfaces**: Each module has well-defined inputs/outputs
+
+ ## Usage
+
+ ```bash
+ # Full pipeline
+ python -m debase.wrapper_clean manuscript.pdf --si si.pdf --output results.csv
+
+ # With intermediate files kept for debugging
+ python -m debase.wrapper_clean manuscript.pdf --si si.pdf --keep-intermediates
+ ```
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
  """Version information."""
 
- __version__ = "0.1.16"
+ __version__ = "0.1.17"
debase/enzyme_lineage_extractor.py CHANGED
@@ -823,7 +823,14 @@ def identify_evolution_locations(
  def _parse_variants(data: Dict[str, Any], campaign_id: Optional[str] = None) -> List[Variant]:
      """Convert raw JSON to a list[Variant] with basic validation."""
-     variants_json = data.get("variants", []) if isinstance(data, dict) else []
+     if isinstance(data, list):
+         # Direct array of variants
+         variants_json = data
+     elif isinstance(data, dict):
+         # Object with "variants" key
+         variants_json = data.get("variants", [])
+     else:
+         variants_json = []
      parsed: List[Variant] = []
      for item in variants_json:
          try:
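The new branching makes `_parse_variants` tolerant of the model returning either a bare JSON array or a `{"variants": [...]}` wrapper. The normalization in isolation, as a standalone sketch (not the packaged function):

```python
from typing import Any, List

def _variants_json(data: Any) -> List[Any]:
    if isinstance(data, list):          # model returned a bare array
        return data
    if isinstance(data, dict):          # model wrapped it in {"variants": [...]}
        return data.get("variants", [])
    return []                           # anything else (None, str, ...) -> empty

payload_a = [{"variant_id": "WT"}, {"variant_id": "A1"}]
payload_b = {"variants": payload_a}
assert _variants_json(payload_a) == _variants_json(payload_b)
assert _variants_json("garbage") == []
```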
@@ -1283,13 +1290,40 @@ def get_lineage(
          log.info(f"Identified {len(campaigns)} distinct campaigns")
          for camp in campaigns:
              log.info(f"  - {camp.campaign_name}: {camp.description}")
+     else:
+         log.warning("No campaigns identified, creating default campaign for enzyme characterization")
+         # Create a default campaign when none are found
+         default_campaign = Campaign(
+             campaign_id="default_characterization",
+             campaign_name="Enzyme Characterization Study",
+             description="Default campaign for papers that characterize existing enzyme variants without describing new directed evolution",
+             model_substrate="Unknown",
+             model_product="Unknown",
+             data_locations=["Full manuscript text"]
+         )
+         campaigns = [default_campaign]
+         log.info(f"Created default campaign: {default_campaign.campaign_name}")
 
      # Use captions for identification - they're concise and focused
      locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
 
      all_variants = []
 
-     if locations and campaigns:
+     if campaigns:
+         # If we have campaigns but no specific locations, use general extraction
+         if not locations:
+             log.info("No specific lineage locations found, extracting from full text with campaign context")
+             # Extract lineage for each campaign using full text
+             for campaign in campaigns:
+                 log.info(f"Processing campaign: {campaign.campaign_id}")
+                 campaign_variants = extract_campaign_lineage(
+                     full_text, model, campaign_id=campaign.campaign_id,
+                     debug_dir=debug_dir, pdf_paths=pdf_paths,
+                     campaign_info=campaign
+                 )
+                 all_variants.extend(campaign_variants)
+             return all_variants, campaigns
+         # Original logic for when we have both locations and campaigns
          # Log location information
          location_summary = []
          for loc in locations[:5]:
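After this hunk, `get_lineage` always proceeds with at least one campaign, so downstream code can rely on a non-empty campaign list. A sketch of the fallback object it synthesizes, assuming `Campaign` is a plain dataclass with the fields used above (the real definition lives elsewhere in `enzyme_lineage_extractor.py`):

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Campaign:  # assumed shape; mirrors the constructor call in the hunk
    campaign_id: str
    campaign_name: str
    description: str
    model_substrate: str
    model_product: str
    data_locations: List[str] = field(default_factory=list)

default_campaign = Campaign(
    campaign_id="default_characterization",
    campaign_name="Enzyme Characterization Study",
    description="Fallback for papers without a directed-evolution campaign",
    model_substrate="Unknown",
    model_product="Unknown",
    data_locations=["Full manuscript text"],
)
campaigns = [default_campaign]  # invariant: never empty past this point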
@@ -1939,6 +1973,173 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
          log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
      return {}
 
+ def extract_enzyme_info_with_gemini(
+     text: str,
+     variants: List[Variant],
+     model,
+ ) -> Dict[str, str]:
+     """Use Gemini to extract enzyme names or sequences when PDB IDs are not available.
+
+     Returns:
+         Dict mapping variant IDs to sequences
+     """
+     # Build variant info for context
+     variant_info = []
+     for v in variants[:10]:  # Limit to first 10 variants for context
+         info = {
+             "id": v.variant_id,
+             "mutations": v.mutations[:5] if v.mutations else [],  # Limit mutations shown
+             "parent": v.parent_id,
+             "generation": v.generation
+         }
+         variant_info.append(info)
+
+     prompt = f"""You are analyzing a scientific paper about enzyme engineering. No PDB IDs were found in the paper, and I need to obtain protein sequences for the enzyme variants described.
+
+ Here are the variants found in the paper:
+ {json.dumps(variant_info, indent=2)}
+
+ Please analyze the paper text and:
+ 1. Identify the common name of the enzyme being studied (e.g., "P450 BM3", "cytochrome P450 BM3", "CYP102A1")
+ 2. If possible, extract or find the wild-type sequence
+ 3. Provide any UniProt IDs or accession numbers mentioned
+
+ Paper text (first 5000 characters):
+ {text[:5000]}
+
+ Return your response as a JSON object with this structure:
+ {{
+     "enzyme_name": "common name of the enzyme",
+     "systematic_name": "systematic name if applicable (e.g., CYP102A1)",
+     "uniprot_id": "UniProt ID if found",
+     "wild_type_sequence": "sequence if found in paper or if you know it",
+     "additional_names": ["list", "of", "alternative", "names"]
+ }}
+
+ If you cannot determine certain fields, set them to null.
+ """
+
+     try:
+         response = model.generate_content(prompt)
+         text_response = _extract_text(response).strip()
+
+         # Parse JSON response
+         if text_response.startswith("```"):
+             text_response = text_response.split("```")[1].strip()
+             if text_response.startswith("json"):
+                 text_response = text_response[4:].strip()
+             text_response = text_response.split("```")[0].strip()
+
+         enzyme_info = json.loads(text_response)
+         log.info(f"Gemini extracted enzyme info: {enzyme_info.get('enzyme_name', 'Unknown')}")
+
+         sequences = {}
+
+         # If Gemini provided a sequence directly, use it
+         if enzyme_info.get("wild_type_sequence"):
+             # Clean the sequence
+             seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
+             # Validate it looks like a protein sequence
+             if seq and all(c in "ACDEFGHIKLMNPQRSTVWY" for c in seq) and len(seq) > 50:
+                 # Map to the first variant or wild-type
+                 wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
+                 if wt_variant:
+                     sequences[wt_variant.variant_id] = seq
+                 else:
+                     sequences[variants[0].variant_id] = seq
+                 log.info(f"Using sequence from Gemini: {len(seq)} residues")
+
+         # If no sequence but we have names, try to fetch from UniProt
+         if not sequences:
+             names_to_try = []
+             if enzyme_info.get("enzyme_name"):
+                 names_to_try.append(enzyme_info["enzyme_name"])
+             if enzyme_info.get("systematic_name"):
+                 names_to_try.append(enzyme_info["systematic_name"])
+             if enzyme_info.get("uniprot_id"):
+                 names_to_try.append(enzyme_info["uniprot_id"])
+             if enzyme_info.get("additional_names"):
+                 names_to_try.extend(enzyme_info["additional_names"])
+
+             # Try each name with UniProt
+             for name in names_to_try:
+                 if name:
+                     uniprot_seqs = fetch_sequence_by_name(name)
+                     if uniprot_seqs:
+                         # Map the first sequence to appropriate variant
+                         seq = list(uniprot_seqs.values())[0]
+                         wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
+                         if wt_variant:
+                             sequences[wt_variant.variant_id] = seq
+                         else:
+                             sequences[variants[0].variant_id] = seq
+                         log.info(f"Found sequence via UniProt search for '{name}': {len(seq)} residues")
+                         break
+
+         return sequences
+
+     except Exception as e:
+         log.warning(f"Failed to extract enzyme info with Gemini: {e}")
+         return {}
+
+
+ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
+     """Fetch protein sequences from UniProt by enzyme name or ID.
+
+     Args:
+         enzyme_name: Name, ID, or accession of the enzyme
+
+     Returns:
+         Dict mapping identifiers to sequences
+     """
+     import requests
+
+     clean_name = enzyme_name.strip()
+
+     # First try as accession number
+     if len(clean_name) <= 10 and (clean_name[0].isalpha() and clean_name[1:].replace("_", "").isalnum()):
+         # Looks like a UniProt accession
+         url = f"https://rest.uniprot.org/uniprotkb/{clean_name}"
+         try:
+             response = requests.get(url, timeout=10)
+             if response.status_code == 200:
+                 data = response.json()
+                 sequence = data.get('sequence', {}).get('value', '')
+                 if sequence:
+                     return {clean_name: sequence}
+         except:
+             pass
+
+     # Try search API
+     url = "https://rest.uniprot.org/uniprotkb/search"
+     params = {
+         "query": f'(protein_name:"{clean_name}" OR gene:"{clean_name}" OR id:"{clean_name}")',
+         "format": "json",
+         "size": "5",
+         "fields": "accession,id,protein_name,gene_names,sequence"
+     }
+
+     try:
+         response = requests.get(url, params=params, timeout=10)
+         response.raise_for_status()
+         data = response.json()
+
+         results = data.get('results', [])
+         sequences = {}
+
+         for result in results[:1]:  # Just take the first match
+             sequence = result.get('sequence', {}).get('value', '')
+             if sequence:
+                 sequences[clean_name] = sequence
+                 break
+
+         return sequences
+
+     except Exception as e:
+         log.warning(f"Failed to fetch sequence for '{enzyme_name}': {e}")
+         return {}
+
+
  def match_pdb_to_variants(
      pdb_sequences: Dict[str, str],
      variants: List[Variant],
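`fetch_sequence_by_name` tries a direct accession lookup first and only falls back to the UniProt search endpoint if that fails. A usage sketch (requires network access; P14779 is the UniProt accession for cytochrome P450 BM3 / CYP102A1, the example enzyme named in the Gemini prompt above):

```python
seqs = fetch_sequence_by_name("P14779")        # direct accession path
if not seqs:
    seqs = fetch_sequence_by_name("CYP102A1")  # falls through to the search API
for ident, seq in seqs.items():
    print(ident, len(seq), "residues")
```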
@@ -2110,16 +2311,23 @@ def _merge_lineage_and_sequences(
          for v in lineage
      ])
 
-     df_seq = pd.DataFrame([
-         {
-             "variant_id": s.variant_id,
-             "aa_seq": s.aa_seq,
-             "dna_seq": s.dna_seq,
-             "seq_confidence": s.confidence,
-             "truncated": s.truncated,
-         }
-         for s in seqs
-     ])
+     if seqs:
+         df_seq = pd.DataFrame([
+             {
+                 "variant_id": s.variant_id,
+                 "aa_seq": s.aa_seq,
+                 "dna_seq": s.dna_seq,
+                 "seq_confidence": s.confidence,
+                 "truncated": s.truncated,
+                 "seq_source": s.metadata.get("source", None) if s.metadata else None,
+             }
+             for s in seqs
+         ])
+     else:
+         # Create empty DataFrame with correct columns for merging
+         df_seq = pd.DataFrame(columns=[
+             "variant_id", "aa_seq", "dna_seq", "seq_confidence", "truncated", "seq_source"
+         ])
 
      # Log sequence data info
      if len(df_seq) > 0:
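The point of the empty-frame branch: a left merge against a zero-row DataFrame still succeeds as long as the join key column exists, yielding NaN sequence columns instead of a KeyError. Demonstrated in miniature:

```python
import pandas as pd

df_lineage = pd.DataFrame({"variant_id": ["WT", "M1"], "generation": [0, 1]})
df_seq = pd.DataFrame(columns=[
    "variant_id", "aa_seq", "dna_seq", "seq_confidence", "truncated", "seq_source"
])
merged = df_lineage.merge(df_seq, on="variant_id", how="left")
print(merged["aa_seq"].isna().all())  # True: columns present, all NaN
```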
@@ -2397,7 +2605,7 @@ def run_pipeline(
          early_df = _lineage_to_dataframe(lineage)
          output_csv_path = Path(output_csv)
          # Save lineage-only data with specific filename
-         lineage_path = output_csv_path.parent / "enzyme_lineage_data.csv"
+         lineage_path = output_csv_path.parent / "enzyme_lineage_name.csv"
          early_df.to_csv(lineage_path, index=False)
          log.info(
              "Saved lineage-only CSV -> %s",
@@ -2461,6 +2669,36 @@ def run_pipeline(
              log.warning(f"No sequences found in PDB {pdb_id}")
      else:
          log.warning("No PDB IDs found in paper")
+
+     # 4b. If still no sequences, try Gemini extraction as last resort
+     if not sequences or all(not s.aa_seq for s in sequences):
+         log.info("No sequences from PDB, attempting Gemini-based extraction...")
+
+         gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
+
+         if gemini_sequences:
+             # Convert to SequenceBlock objects
+             gemini_seq_blocks = []
+             for variant_id, seq in gemini_sequences.items():
+                 # Find the matching variant
+                 variant = next((v for v in lineage if v.variant_id == variant_id), None)
+                 if variant:
+                     seq_block = SequenceBlock(
+                         variant_id=variant.variant_id,
+                         aa_seq=seq,
+                         dna_seq=None,
+                         confidence=0.9,  # High confidence but slightly lower than PDB
+                         truncated=False,
+                         metadata={"source": "Gemini/UniProt"}
+                     )
+                     gemini_seq_blocks.append(seq_block)
+                     log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")
+
+             if gemini_seq_blocks:
+                 sequences = gemini_seq_blocks
+                 log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
+             else:
+                 log.warning("Failed to extract sequences via Gemini")
 
      # 5. Merge & score (Section 8) --------------------------------------------
      doi = extract_doi(manuscript)
debase/lineage_format.py CHANGED
@@ -188,11 +188,17 @@ class VariantRecord:
      # Reaction-related -------------------------------------------------------------
      def substrate_iupac(self) -> List[str]:
          raw = str(self.row.get("substrate_iupac_list", "")).strip()
-         return _split_list(raw)
+         result = _split_list(raw)
+         if not result and raw and raw.lower() != 'nan':
+             log.debug(f"substrate_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
+         return result
 
      def product_iupac(self) -> List[str]:
          raw = str(self.row.get("product_iupac_list", "")).strip()
-         return _split_list(raw)
+         result = _split_list(raw)
+         if not result and raw and raw.lower() != 'nan':
+             log.debug(f"product_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
+         return result
 
 
      def ttn_or_yield(self) -> Optional[float]:
@@ -377,6 +383,53 @@ def _nt_mut(parent_aa: str, child_aa: str, parent_nt: str = "", child_nt: str =
 
  # === 6. SMILES CONVERSION HELPERS ==================================================
 
+ def search_smiles_with_gemini(compound_name: str, model=None) -> Optional[str]:
+     """
+     Use Gemini to search for SMILES strings of complex compounds.
+     Returns SMILES string if found, None otherwise.
+     """
+     if not compound_name or compound_name.lower() in ['nan', 'none', '']:
+         return None
+
+     if not model:
+         try:
+             # Import get_model from enzyme_lineage_extractor
+             import sys
+             from pathlib import Path
+             sys.path.append(str(Path(__file__).parent))
+             from enzyme_lineage_extractor import get_model
+             model = get_model()
+         except Exception as e:
+             log.warning(f"Could not load Gemini model: {e}")
+             return None
+
+     prompt = f"""Search for the SMILES string representation of this chemical compound:
+ "{compound_name}"
+
+ IMPORTANT:
+ - Do NOT generate or create a SMILES string
+ - Only provide SMILES that you can find in chemical databases or literature
+ - For deuterated compounds, search for the specific isotope-labeled SMILES
+ - If you cannot find the exact SMILES, say "NOT FOUND"
+
+ Return ONLY the SMILES string if found, or "NOT FOUND" if not found.
+ No explanation or additional text."""
+
+     try:
+         response = model.generate_content(prompt)
+         result = response.text.strip()
+
+         if result and result != "NOT FOUND" and not result.startswith("I"):
+             # Basic validation that it looks like SMILES
+             if any(c in result for c in ['C', 'c', 'N', 'O', 'S', 'P', '[', ']', '(', ')']):
+                 log.info(f"Gemini found SMILES for '{compound_name}': {result}")
+                 return result
+         return None
+     except Exception as e:
+         log.debug(f"Gemini SMILES search failed for '{compound_name}': {e}")
+         return None
+
+
  def _split_list(raw: str) -> List[str]:
      if not raw or str(raw).lower() == 'nan':
          return []
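The character check in `search_smiles_with_gemini` is only a heuristic, so callers should treat its output as unverified. An optional stricter check is to round-trip the candidate through RDKit; this is not part of lineage_format.py, just an alternative validation step (assumes `pip install rdkit`):

```python
from rdkit import Chem

def is_parsable_smiles(smiles: str) -> bool:
    # MolFromSmiles returns None when the string cannot be parsed
    return Chem.MolFromSmiles(smiles) is not None

assert is_parsable_smiles("CCO")        # ethanol
assert not is_parsable_smiles("C((C")   # unbalanced parentheses
```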
@@ -429,7 +482,12 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
      except FileNotFoundError:
          pass  # OPSIN not installed
 
-     # 3. PubChem PUG REST (online) ---------------------------------------------
+     # 3. Gemini search (for complex compounds) ---------------------------------
+     gemini_smiles = search_smiles_with_gemini(name)
+     if gemini_smiles:
+         return gemini_smiles
+
+     # 4. PubChem PUG REST (online) ---------------------------------------------
      try:
          import requests
 
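With this change the resolution order in `_name_to_smiles` becomes: cache/OPSIN, then Gemini, then PubChem. The PubChem leg (step 4) continues past this hunk; a minimal sketch of that kind of PUG REST name-to-SMILES lookup (the exact request in lineage_format.py is outside the diff, so this is illustrative):

```python
import requests

def pubchem_smiles(name: str) -> str:
    url = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
           f"{requests.utils.quote(name)}/property/IsomericSMILES/TXT")
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.text.strip().splitlines()[0]

print(pubchem_smiles("ethanol"))  # CCO
```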
@@ -538,13 +596,23 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: D
 
  def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
      """Infer lineage roots using generation numbers and simple sequence similarity."""
-     idmap: Dict[str, Dict[str, str]] = {str(r["enzyme_id"]): r for _, r in df.iterrows()}
+     # Create idmap, handling missing enzyme_id gracefully
+     idmap: Dict[str, Dict[str, str]] = {}
+     for _, r in df.iterrows():
+         eid = r.get("enzyme_id")
+         if pd.isna(eid) or str(eid).strip() == "":
+             continue
+         idmap[str(eid)] = r
      roots: Dict[str, str] = {}
      # Look for generation 0 as the root
-     gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "0"}
+     gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+             if str(r.get("generation", "")).strip() == "0"
+             and not pd.isna(r.get("enzyme_id"))}
      # If no gen0 found, fall back to gen1
      if not gen0:
-         gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "1"}
+         gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+                 if str(r.get("generation", "")).strip() == "1"
+                 and not pd.isna(r.get("enzyme_id"))}
 
      def _seq_sim(a: str, b: str) -> float:
          if not a or not b:
@@ -553,7 +621,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
          return matches / max(len(a), len(b))
 
      for _, row in df.iterrows():
-         eid = row["enzyme_id"]
+         eid = row.get("enzyme_id")
+         if pd.isna(eid) or str(eid).strip() == "":
+             continue
          if eid in gen0:
              roots[eid] = eid
              continue
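For reference, `_seq_sim` scores positional identity over the longer length, so length mismatches are penalized (the line computing `matches` is outside this hunk; positional comparison is assumed). A standalone copy of that metric with worked values:

```python
def seq_sim(a: str, b: str) -> float:
    if not a or not b:
        return 0.0
    matches = sum(1 for x, y in zip(a, b) if x == y)  # assumed positional count
    return matches / max(len(a), len(b))

assert seq_sim("ACDE", "ACDF") == 0.75       # 3 of 4 positions match
assert seq_sim("ACDE", "ACDEGH") == 4 / 6    # extra residues dilute the score
```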
@@ -593,6 +663,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
 
  def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
      """Main public API: returns a DataFrame in the flat output format."""
+     log.info(f"Starting flatten_dataframe with {len(df)} input rows")
+     log.info(f"Input columns: {list(df.columns)}")
+
      # Apply column aliases to the dataframe
      for alias, canonical in COLUMN_ALIASES.items():
          if alias in df.columns and canonical not in df.columns:
@@ -621,8 +694,29 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
      # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)
 
      # 3. Flatten rows ---------------------------------------------------------
-     idmap = {str(r["enzyme_id"]): r.to_dict() for _, r in df.iterrows()}
+     # Create idmap for parent lookups, but note this will only keep last occurrence of duplicates
+     idmap = {}
+     for _, r in df.iterrows():
+         eid = str(r["enzyme_id"])
+         if eid in idmap:
+             log.debug(f"Overwriting duplicate enzyme_id in idmap: {eid}")
+         idmap[eid] = r.to_dict()
+
+     # Check for duplicate enzyme_ids
+     enzyme_ids = [str(r["enzyme_id"]) for _, r in df.iterrows()]
+     unique_ids = set(enzyme_ids)
+     if len(enzyme_ids) != len(unique_ids):
+         log.warning(f"Found duplicate enzyme_ids! Total: {len(enzyme_ids)}, Unique: {len(unique_ids)}")
+         from collections import Counter
+         id_counts = Counter(enzyme_ids)
+         duplicates = {k: v for k, v in id_counts.items() if v > 1}
+         log.warning(f"Duplicate enzyme_ids: {duplicates}")
+         log.info("Note: All rows will still be processed, but parent lookups may use the last occurrence of duplicate IDs")
+
      output_rows: List[Dict[str, str]] = []
+     skipped_count = 0
+     processed_count = 0
+
      for idx, (_, row) in enumerate(df.iterrows()):
          rec = VariantRecord(row.to_dict())
          eid = rec.eid
@@ -632,13 +726,19 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
          prods = rec.product_iupac()
          data_type = rec.row.get("data_type", "")
 
-         if not subs or not prods:
-             # Skip entries without reaction info unless it's marked as lineage only
+         if not prods:
+             # Skip entries without product info unless it's marked as lineage only
              if data_type == "lineage":
                  subs, prods = [""], [""]  # placeholders
              else:
-                 log.debug("Skipping %s due to missing reaction data", eid)
+                 log.info(f"Skipping enzyme_id={eid} (row {idx}) due to missing product data. prods={prods}, data_type={data_type}")
+                 skipped_count += 1
                  continue
+
+         # If no substrates but we have products, use empty substrate list
+         if not subs:
+             log.debug(f"Empty substrate list for enzyme_id={eid}, using empty placeholder")
+             subs = [""]
 
          sub_smiles = [sub_cache.get(s, "") for s in subs]
          prod_smiles = [prod_cache.get(p, "") for p in prods]
@@ -712,7 +812,9 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
              additional_information=additional_information,
          )
          output_rows.append(flat.as_dict())
+         processed_count += 1
 
+     log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
      out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
      return out_df
 
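The duplicate check in miniature: `Counter` flags IDs appearing more than once, while every row still flows through the loop; only the `idmap` parent lookup is affected by duplicates:

```python
from collections import Counter

enzyme_ids = ["WT", "M1", "M1", "M2"]
id_counts = Counter(enzyme_ids)
duplicates = {k: v for k, v in id_counts.items() if v > 1}
print(duplicates)  # {'M1': 2} -- parent lookups will see the last 'M1' row
```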
debase/reaction_info_extractor.py CHANGED
@@ -761,6 +761,15 @@ Ignore locations that contain data for other campaigns.
                  return line
          return page[:800]
 
+     def _ensure_rgb_pixmap(self, pix: fitz.Pixmap) -> fitz.Pixmap:
+         """Ensure pixmap is in RGB colorspace for PIL compatibility."""
+         if pix.alpha:  # RGBA -> RGB
+             pix = fitz.Pixmap(fitz.csRGB, pix)
+         elif pix.colorspace and pix.colorspace.name not in ["DeviceRGB", "DeviceGray"]:
+             # Convert unsupported colorspaces (CMYK, LAB, etc.) to RGB
+             pix = fitz.Pixmap(fitz.csRGB, pix)
+         return pix
+
      # ---- NEW: Page image helper for both figures and tables ----
      def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
          """Export the page containing the reference as PNG.
@@ -802,14 +811,14 @@ Ignore locations that contain data for other campaigns.
                  if img_rect.y1 < cap_rect.y0:  # fully above caption
                      # Extract image bytes
                      pix = fitz.Pixmap(doc, xref)
-                     if pix.alpha:  # RGBA -> RGB
-                         pix = fitz.Pixmap(fitz.csRGB, pix)
+                     pix = self._ensure_rgb_pixmap(pix)
                      img_bytes = pix.tobytes("png")
                      return b64encode(img_bytes).decode()
          else:
              # Extract the entire page as an image
              mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
              pix = page.get_pixmap(matrix=mat)
+             pix = self._ensure_rgb_pixmap(pix)
              img_bytes = pix.tobytes("png")
              return b64encode(img_bytes).decode()
          return None
@@ -842,11 +851,13 @@ Ignore locations that contain data for other campaigns.
              # Add the current page
              mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
              pix = doc.load_page(page_num).get_pixmap(matrix=mat)
+             pix = self._ensure_rgb_pixmap(pix)
              all_images.append(pix)
 
              # If this is the last page with the reference, also add the next page
              if i == len(pages) - 1 and page_num + 1 < doc.page_count:
                  next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
+                 next_pix = self._ensure_rgb_pixmap(next_pix)
                  all_images.append(next_pix)
                  LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
@@ -855,14 +866,16 @@ Ignore locations that contain data for other campaigns.
 
          # If only one page, return it directly
          if len(all_images) == 1:
-             return b64encode(all_images[0].tobytes("png")).decode()
+             pix = self._ensure_rgb_pixmap(all_images[0])
+             return b64encode(pix.tobytes("png")).decode()
 
          # Combine multiple pages vertically
          if not all_images:
              return None
 
          if len(all_images) == 1:
-             return b64encode(all_images[0].tobytes("png")).decode()
+             pix = self._ensure_rgb_pixmap(all_images[0])
+             return b64encode(pix.tobytes("png")).decode()
 
          # Calculate dimensions for combined image
          total_height = sum(pix.height for pix in all_images)
@@ -903,6 +916,7 @@ Ignore locations that contain data for other campaigns.
              # Convert the page to a pixmap
              mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for quality
              combined_pix = page.get_pixmap(matrix=mat)
+             combined_pix = self._ensure_rgb_pixmap(combined_pix)
 
              # Convert to PNG and return
              img_bytes = combined_pix.tobytes("png")
debase/wrapper.py CHANGED
@@ -46,101 +46,333 @@ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
      """
      Step 2: Clean and validate protein sequences
      Calls: cleanup_sequence.py
+     Returns output path even if cleanup fails (copies input file)
      """
      logger.info(f"Cleaning sequences from {input_csv.name}")
 
-     from .cleanup_sequence import main as cleanup_sequences
-     cleanup_sequences([str(input_csv), str(output_csv)])
-
-     logger.info(f"Sequence cleanup complete: {output_csv}")
-     return output_csv
+     try:
+         from .cleanup_sequence import main as cleanup_sequences
+         cleanup_sequences([str(input_csv), str(output_csv)])
+
+         logger.info(f"Sequence cleanup complete: {output_csv}")
+         return output_csv
+
+     except Exception as e:
+         logger.warning(f"Sequence cleanup failed: {e}")
+         logger.info("Copying original file to continue pipeline...")
+
+         # Copy the input file as-is to continue pipeline
+         import shutil
+         shutil.copy2(input_csv, output_csv)
+
+         logger.info(f"Original file copied: {output_csv}")
+         return output_csv
 
 
  def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
      """
      Step 3a: Extract reaction performance metrics
      Calls: reaction_info_extractor.py
+     Returns output path even if extraction fails (creates empty file)
      """
      logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
 
-     from .reaction_info_extractor import ReactionExtractor, Config
-     import pandas as pd
-
-     # Load enzyme data
-     enzyme_df = pd.read_csv(lineage_csv)
-
-     # Initialize extractor and run
-     cfg = Config()
-     extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
-     df_metrics = extractor.run(enzyme_df)
-
-     # Save results
-     df_metrics.to_csv(output, index=False)
-     logger.info(f"Reaction extraction complete: {output}")
-     return output
+     try:
+         from .reaction_info_extractor import ReactionExtractor, Config
+         import pandas as pd
+
+         # Load enzyme data
+         enzyme_df = pd.read_csv(lineage_csv)
+
+         # Initialize extractor and run
+         cfg = Config()
+         extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
+         df_metrics = extractor.run(enzyme_df)
+
+         # Save results
+         df_metrics.to_csv(output, index=False)
+         logger.info(f"Reaction extraction complete: {output}")
+         return output
+
+     except Exception as e:
+         logger.warning(f"Reaction extraction failed: {e}")
+         logger.info("Creating empty reaction info file to continue pipeline...")
+
+         # Create empty reaction CSV with basic columns
+         import pandas as pd
+         empty_df = pd.DataFrame(columns=[
+             'enzyme', 'substrate', 'product', 'yield_percent', 'ee_percent',
+             'conversion_percent', 'reaction_type', 'reaction_conditions', 'notes'
+         ])
+         empty_df.to_csv(output, index=False)
+
+         logger.info(f"Empty reaction file created: {output}")
+         return output
 
 
  def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
      """
      Step 3b: Extract substrate scope data (runs in parallel with reaction extraction)
      Calls: substrate_scope_extractor.py
+     Returns output path even if extraction fails (creates empty file)
      """
      logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
 
-     from .substrate_scope_extractor import run_pipeline
-
-     # Run substrate scope extraction
-     run_pipeline(
-         manuscript=manuscript,
-         si=si,
-         lineage_csv=lineage_csv,
-         output_csv=output,
-         debug_dir=debug_dir
-     )
-
-     logger.info(f"Substrate scope extraction complete: {output}")
-     return output
+     try:
+         from .substrate_scope_extractor import run_pipeline
+
+         # Run substrate scope extraction
+         run_pipeline(
+             manuscript=manuscript,
+             si=si,
+             lineage_csv=lineage_csv,
+             output_csv=output,
+             debug_dir=debug_dir
+         )
+
+         logger.info(f"Substrate scope extraction complete: {output}")
+         return output
+
+     except Exception as e:
+         logger.warning(f"Substrate scope extraction failed: {e}")
+         logger.info("Creating empty substrate scope file to continue pipeline...")
+
+         # Create empty substrate scope CSV with proper headers
+         import pandas as pd
+         empty_df = pd.DataFrame(columns=[
+             'enzyme', 'substrate', 'product', 'yield_percent', 'ee_percent',
+             'conversion_percent', 'selectivity', 'reaction_conditions', 'notes'
+         ])
+         empty_df.to_csv(output, index=False)
+
+         logger.info(f"Empty substrate scope file created: {output}")
+         return output
+
+
+ def match_enzyme_variants_with_gemini(lineage_enzymes: list, data_enzymes: list, model=None) -> dict:
+     """
+     Use Gemini to match enzyme variant IDs between different datasets.
+     Returns a mapping of data_enzyme_id -> lineage_enzyme_id.
+     """
+     import json
+
+     if not model:
+         try:
+             from .enzyme_lineage_extractor import get_model
+             model = get_model()
+         except:
+             logger.warning("Could not load Gemini model for variant matching")
+             return {}
+
+     prompt = f"""Match enzyme variant IDs between two lists from the same scientific paper.
+
+ These lists come from different sections or analyses of the same study, but may use different naming conventions.
+
+ List 1 (from lineage/sequence data):
+ {json.dumps(lineage_enzymes)}
+
+ List 2 (from experimental data):
+ {json.dumps(data_enzymes)}
+
+ Analyze the patterns and match variants that refer to the same enzyme.
+ Return ONLY a JSON object mapping IDs from List 2 to their corresponding IDs in List 1.
+ Format: {{"list2_id": "list1_id", ...}}
+ Only include matches you are confident about based on the naming patterns.
+ """
+
+     try:
+         response = model.generate_content(prompt)
+         mapping_text = response.text.strip()
+
+         # Extract JSON from response
+         if '```json' in mapping_text:
+             mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
+         elif '```' in mapping_text:
+             mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
+
+         mapping = json.loads(mapping_text)
+         logger.info(f"Gemini matched {len(mapping)} enzyme variants")
+         for k, v in mapping.items():
+             logger.info(f"  Matched '{k}' -> '{v}'")
+         return mapping
+     except Exception as e:
+         logger.warning(f"Failed to match variants with Gemini: {e}")
+         return {}
 
 
  def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_csv: Path, output_csv: Path) -> Path:
      """
      Step 4: Format and merge all data into final CSV
-     Calls: lineage_format.py
+     Creates comprehensive format merging all available data, even if some extraction steps failed
      """
      logger.info(f"Formatting and merging data into final output")
 
-     from .lineage_format import run_pipeline
-     import pandas as pd
-
-     # First, we need to merge the protein sequences into the reaction data
-     df_reaction = pd.read_csv(reaction_csv)
-     df_sequences = pd.read_csv(cleaned_csv)
-
-     # Merge sequences into reaction data
-     # Include generation and parent info for proper mutation calculation
-     sequence_cols = ['protein_sequence', 'dna_seq', 'seq_confidence', 'truncated', 'flag',
-                      'generation', 'parent_enzyme_id', 'mutations']
-     sequence_data = df_sequences[['enzyme_id'] + [col for col in sequence_cols if col in df_sequences.columns]]
-
-     # Merge on enzyme_id or variant_id
-     if 'enzyme_id' in df_reaction.columns:
-         df_reaction = df_reaction.merge(sequence_data, on='enzyme_id', how='left', suffixes=('', '_seq'))
-     elif 'enzyme' in df_reaction.columns:
-         sequence_data = sequence_data.rename(columns={'enzyme_id': 'enzyme'})
-         df_reaction = df_reaction.merge(sequence_data, on='enzyme', how='left', suffixes=('', '_seq'))
-
-     # Save the merged reaction data
-     df_reaction.to_csv(reaction_csv, index=False)
-
-     # Run the formatting pipeline
-     df_final = run_pipeline(
-         reaction_csv=reaction_csv,
-         substrate_scope_csv=substrate_scope_csv,
-         output_csv=output_csv
-     )
-
-     logger.info(f"Final formatting complete: {output_csv}")
-     return output_csv
+     try:
+         import pandas as pd
+
+         # Read all available data files
+         logger.info("Reading enzyme lineage data...")
+         df_lineage = pd.read_csv(cleaned_csv)
+
+         logger.info("Reading reaction data...")
+         try:
+             df_reaction = pd.read_csv(reaction_csv)
+             has_reaction_data = len(df_reaction) > 0 and not df_reaction.empty
+         except:
+             df_reaction = pd.DataFrame()
+             has_reaction_data = False
+
+         logger.info("Reading substrate scope data...")
+         try:
+             df_scope = pd.read_csv(substrate_scope_csv)
+             has_scope_data = len(df_scope) > 0 and not df_scope.empty
+         except:
+             df_scope = pd.DataFrame()
+             has_scope_data = False
+
+         # Start with lineage data as base
+         df_final = df_lineage.copy()
+
+         # Ensure consistent enzyme ID column
+         if 'variant_id' in df_final.columns and 'enzyme_id' not in df_final.columns:
+             df_final = df_final.rename(columns={'variant_id': 'enzyme_id'})
+
+         # Merge reaction data if available
+         if has_reaction_data:
+             logger.info(f"Merging reaction data ({len(df_reaction)} records)")
+             # Match on enzyme_id or enzyme
+             merge_key = 'enzyme_id' if 'enzyme_id' in df_reaction.columns else 'enzyme'
+             if merge_key in df_reaction.columns:
+                 df_final = df_final.merge(df_reaction, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_reaction'))
+         else:
+             logger.info("No reaction data available")
+
+         # Merge substrate scope data if available
+         if has_scope_data:
+             logger.info(f"Merging substrate scope data ({len(df_scope)} records)")
+             merge_key = 'enzyme_id' if 'enzyme_id' in df_scope.columns else 'enzyme'
+
+             if merge_key in df_scope.columns:
+                 # First try direct merge
+                 df_test_merge = df_final.merge(df_scope, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
+
+                 # Check if any matches were found
+                 matched_count = df_test_merge[merge_key + '_scope'].notna().sum() if merge_key + '_scope' in df_test_merge.columns else 0
+
+                 if matched_count == 0:
+                     logger.info("No direct matches found, using Gemini to match enzyme variants...")
+
+                     # Get unique enzyme IDs from both datasets
+                     lineage_enzymes = df_final['enzyme_id'].dropna().unique().tolist()
+                     scope_enzymes = df_scope[merge_key].dropna().unique().tolist()
+
+                     # Get mapping from Gemini
+                     mapping = match_enzyme_variants_with_gemini(lineage_enzymes, scope_enzymes)
+
+                     if mapping:
+                         # Apply mapping to scope data
+                         df_scope_mapped = df_scope.copy()
+                         df_scope_mapped[merge_key] = df_scope_mapped[merge_key].map(lambda x: mapping.get(x, x))
+                         df_final = df_final.merge(df_scope_mapped, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
+                     else:
+                         logger.warning("Could not match enzyme variants between datasets")
+                         df_final = df_test_merge
+                 else:
+                     df_final = df_test_merge
+                     logger.info(f"Direct merge matched {matched_count} records")
+         else:
+             logger.info("No substrate scope data available")
+
+         # Add comprehensive column structure for missing data
+         essential_columns = [
+             'enzyme_id', 'parent_id', 'generation', 'mutations', 'campaign_id', 'notes',
+             'aa_seq', 'dna_seq', 'seq_confidence', 'truncated', 'seq_source', 'doi',
+             'substrate_list', 'substrate_iupac_list', 'product_list', 'product_iupac_list',
+             'cofactor_list', 'cofactor_iupac_list', 'yield', 'ee', 'ttn',
+             'reaction_temperature', 'reaction_ph', 'reaction_buffer', 'reaction_other_conditions',
+             'data_location'
+         ]
+
+         # Add missing columns with NaN
+         for col in essential_columns:
+             if col not in df_final.columns:
+                 df_final[col] = None
+
+         # Clean up duplicate columns from merging
+         columns_to_keep = []
+         seen_base_names = set()
+         for col in df_final.columns:
+             base_name = col.split('_reaction')[0].split('_scope')[0]
+             if base_name not in seen_base_names:
+                 columns_to_keep.append(col)
+                 seen_base_names.add(base_name)
+             elif col.endswith('_scope') or col.endswith('_reaction'):
+                 # Prefer scope or reaction data over base lineage data for certain columns
+                 if base_name in ['substrate_list', 'product_list', 'yield', 'ee', 'reaction_temperature']:
+                     columns_to_keep.append(col)
+                     # Remove the base column if it exists
+                     if base_name in columns_to_keep:
+                         columns_to_keep.remove(base_name)
+                     seen_base_names.add(base_name)
+
+         df_final = df_final[columns_to_keep]
+
+         # Rename merged columns back to standard names
+         rename_map = {}
+         for col in df_final.columns:
+             if col.endswith('_scope') or col.endswith('_reaction'):
+                 base_name = col.split('_scope')[0].split('_reaction')[0]
+                 rename_map[col] = base_name
+         df_final = df_final.rename(columns=rename_map)
+
+         # Save the comprehensive final output
+         df_final.to_csv(output_csv, index=False)
+
+         logger.info(f"Final comprehensive format complete: {output_csv}")
+         logger.info(f"Final output contains {len(df_final)} variants with {len(df_final.columns)} data columns")
+
+         # Log what data was successfully merged
+         if has_reaction_data:
+             logger.info("✓ Reaction performance data merged")
+         if has_scope_data:
+             logger.info("✓ Substrate scope data merged")
+
+         # Now run the actual lineage format to produce plate-based format
+         logger.info("\nRunning lineage format to produce plate-based output...")
+         try:
+             from .lineage_format import flatten_dataframe
+
+             # Create the plate-based output filename
+             plate_output = output_csv.parent / (output_csv.stem + "_plate_format.csv")
+
+             # Flatten the dataframe to plate format
+             df_flattened = flatten_dataframe(df_final)
+
+             # Save the flattened output
+             df_flattened.to_csv(plate_output, index=False)
+
+             logger.info(f"✓ Plate-based format saved to: {plate_output}")
+             logger.info(f"  Contains {len(df_flattened)} rows with plate/well assignments")
+
+             # Update the final output path to be the plate format
+             output_csv = plate_output
+
+         except Exception as e:
+             logger.warning(f"Could not generate plate-based format: {e}")
+             logger.info("Comprehensive format will be used as final output")
+
+         return output_csv
+
+     except Exception as e:
+         logger.warning(f"Final formatting failed: {e}")
+         logger.info("Using cleaned sequence data as final output...")
+
+         # Copy the cleaned CSV as the final output
+         import shutil
+         shutil.copy2(cleaned_csv, output_csv)
+
+         logger.info(f"Cleaned sequence file used as final output: {output_csv}")
+         return output_csv
 
 
  def run_pipeline(
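Every step in this hunk now follows the same recovery pattern: trap the failure, emit a placeholder artifact with the expected columns, and let later steps continue. Distilled into a hypothetical helper (`run_step` and its arguments are illustrative, not part of the package):

```python
from pathlib import Path
import logging
import pandas as pd

logger = logging.getLogger("debase.wrapper")

def run_step(step_fn, output: Path, placeholder_columns: list) -> Path:
    """Run one pipeline step; on failure, write an empty CSV and keep going."""
    try:
        return step_fn(output)
    except Exception as e:
        logger.warning(f"Step failed: {e}; writing empty placeholder")
        pd.DataFrame(columns=placeholder_columns).to_csv(output, index=False)
        return output
```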
@@ -206,7 +438,7 @@ def run_pipeline(
 
      # Step 4: Format and merge
      logger.info("\n[Step 4/5] Formatting and merging data...")
-     run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
+     final_output = run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
 
      # Step 5: Finalize
      logger.info("\n[Step 5/5] Finalizing...")
@@ -219,11 +451,13 @@ def run_pipeline(
 
          logger.info("\n" + "="*60)
          logger.info("PIPELINE COMPLETED SUCCESSFULLY")
-         logger.info(f"Output: {output_path}")
+         logger.info(f"Comprehensive output: {output_path}")
+         if final_output != output_path:
+             logger.info(f"Plate-based output: {final_output}")
          logger.info(f"Runtime: {elapsed:.1f} seconds")
          logger.info("="*60)
 
-         return output_path
+         return final_output
 
      except Exception as e:
          logger.error(f"Pipeline failed: {str(e)}")
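From the caller's side, `run_pipeline` now returns the plate-format path when flattening succeeded and the comprehensive path otherwise. A usage sketch; the keyword names are assumed from the CLI shown in PIPELINE_FLOW.md, since the function's full signature is outside this diff:

```python
from pathlib import Path
from debase.wrapper import run_pipeline

# Hypothetical call; argument names are assumptions based on the CLI flags.
final = run_pipeline(Path("manuscript.pdf"), si=Path("si.pdf"),
                     output=Path("results.csv"))
print("final artifact:", final)  # results_plate_format.csv or results.csv
```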
debase-0.1.16.dist-info/METADATA → debase-0.1.17.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: debase
- Version: 0.1.16
+ Version: 0.1.17
  Summary: Enzyme lineage analysis and sequence extraction package
  Home-page: https://github.com/YuemingLong/DEBase
  Author: DEBase Team
debase-0.1.17.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+ debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+ debase/_version.py,sha256=edeF0ciTSBytkIGNcNjx3UR4nAs3QzF_Lmmyr66k0Jc,50
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+ debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
+ debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
+ debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
+ debase/reaction_info_extractor.py,sha256=NjOXZf22i3PvYpCgk9DCnswCbgmCQkj5V2-E21LEM6M,112876
+ debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
+ debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
+ debase-0.1.17.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+ debase-0.1.17.dist-info/METADATA,sha256=uCGXpNG7dIVZtpywd8V7kBcXuWHPyTjhJmH0mWKD7Ew,10790
+ debase-0.1.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ debase-0.1.17.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+ debase-0.1.17.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+ debase-0.1.17.dist-info/RECORD,,
debase-0.1.16.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
- debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
- debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
- debase/_version.py,sha256=l25FRqoNjxB5d3qBHsLMMA_9YWsIZ7nJ5BiTLj0qYE8,50
- debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
- debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
- debase/enzyme_lineage_extractor.py,sha256=jNxNCh8VF0dUFxUlTall0w1-oQojXRXLnWcuPFs5ij8,106879
- debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
- debase/reaction_info_extractor.py,sha256=9DkEZh7TgsxKpFkKbLyUhS_w0Z84LczkDFv-v_NEHE4,112174
- debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
- debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
- debase-0.1.16.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
- debase-0.1.16.dist-info/METADATA,sha256=7sv2OcIuHaoOImkBdoEtRzyOjp9Kuoz2ZmgK4tosaUc,10790
- debase-0.1.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- debase-0.1.16.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
- debase-0.1.16.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
- debase-0.1.16.dist-info/RECORD,,