debase 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/PIPELINE_FLOW.md +100 -0
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +251 -13
- debase/lineage_format.py +113 -11
- debase/reaction_info_extractor.py +21 -6
- debase/wrapper.py +301 -67
- {debase-0.1.16.dist-info → debase-0.1.18.dist-info}/METADATA +1 -1
- debase-0.1.18.dist-info/RECORD +17 -0
- debase-0.1.16.dist-info/RECORD +0 -16
- {debase-0.1.16.dist-info → debase-0.1.18.dist-info}/WHEEL +0 -0
- {debase-0.1.16.dist-info → debase-0.1.18.dist-info}/entry_points.txt +0 -0
- {debase-0.1.16.dist-info → debase-0.1.18.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.16.dist-info → debase-0.1.18.dist-info}/top_level.txt +0 -0
debase/PIPELINE_FLOW.md
ADDED
@@ -0,0 +1,100 @@
+# DEBase Pipeline Flow
+
+## Overview
+The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
+
+## Pipeline Architecture
+
+```
+┌─────────────────────┐         ┌─────────────────────┐
+│   Manuscript PDF    │         │       SI PDF        │
+└──────────┬──────────┘         └──────────┬──────────┘
+           │                               │
+           └───────────────┬───────────────┘
+                           │
+                           ▼
+            ┌─────────────────────────────┐
+            │ 1. enzyme_lineage_extractor │
+            │  - Extract enzyme variants  │
+            │  - Parse mutations          │
+            │  - Get basic metadata       │
+            └─────────────┬───────────────┘
+                          │
+                          ▼
+            ┌─────────────────────────────┐
+            │     2. cleanup_sequence     │
+            │  - Validate sequences       │
+            │  - Fix formatting issues    │
+            │  - Generate full sequences  │
+            └─────────────┬───────────────┘
+                          │
+              ┌───────────┴───────────────┐
+              │                           │
+              ▼                           ▼
+┌─────────────────────────┐ ┌─────────────────────────┐
+│ 3a. reaction_info       │ │ 3b. substrate_scope     │
+│     _extractor          │ │     _extractor          │
+│ - Performance metrics   │ │ - Substrate variations  │
+│ - Model reaction        │ │ - Additional variants   │
+│ - Conditions            │ │ - Scope data            │
+└───────────┬─────────────┘ └───────────┬─────────────┘
+            │                           │
+            └───────────┬───────────────┘
+                        │
+                        ▼
+            ┌─────────────────────────────┐
+            │    4. lineage_format_o3     │
+            │  - Merge all data           │
+            │  - Fill missing sequences   │
+            │  - Format final output      │
+            └─────────────┬───────────────┘
+                          │
+                          ▼
+                   ┌─────────────┐
+                   │  Final CSV  │
+                   └─────────────┘
+```
+
+## Module Details
+
+### 1. enzyme_lineage_extractor.py
+- **Input**: Manuscript PDF, SI PDF
+- **Output**: CSV with enzyme variants and mutations
+- **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
+
+### 2. cleanup_sequence.py
+- **Input**: Enzyme lineage CSV
+- **Output**: CSV with validated sequences
+- **Function**: Validates protein sequences, generates full sequences from mutations
+
+### 3a. reaction_info_extractor.py
+- **Input**: PDFs + cleaned enzyme CSV
+- **Output**: CSV with reaction performance data
+- **Function**: Extracts yield, TTN, selectivity, reaction conditions
+
+### 3b. substrate_scope_extractor.py
+- **Input**: PDFs + cleaned enzyme CSV
+- **Output**: CSV with substrate scope entries
+- **Function**: Extracts substrate variations tested with different enzymes
+
+### 4. lineage_format_o3.py
+- **Input**: Reaction CSV + Substrate scope CSV
+- **Output**: Final formatted CSV
+- **Function**: Merges data, fills missing sequences, applies consistent formatting
+
+## Key Features
+
+1. **Modular Design**: Each step can be run independently
+2. **Parallel Extraction**: Steps 3a and 3b run independently
+3. **Error Recovery**: Pipeline can resume from any step
+4. **Clean Interfaces**: Each module has well-defined inputs/outputs
+
+## Usage
+
+```bash
+# Full pipeline
+python -m debase.wrapper_clean manuscript.pdf --si si.pdf --output results.csv
+
+# With intermediate files kept for debugging
+python -m debase.wrapper_clean manuscript.pdf --si si.pdf --keep-intermediates
+```
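The "modular design" and "resume from any step" features above map directly onto the per-step entry points added in the wrapper.py diff further down. A minimal sketch of resuming at step 3a, using only functions visible in that diff (file paths are illustrative):

```python
from pathlib import Path

from debase.wrapper import run_reaction_extraction, run_lineage_format

# Steps 1-2 from an earlier run already produced a cleaned lineage CSV
cleaned_csv = Path("out/enzyme_lineage_cleaned.csv")

# Redo step 3a only, then re-merge everything (step 4)
reaction_csv = run_reaction_extraction(
    manuscript=Path("manuscript.pdf"),
    si=Path("si.pdf"),
    lineage_csv=cleaned_csv,
    output=Path("out/reaction_info.csv"),
)
final_csv = run_lineage_format(
    reaction_csv,
    Path("out/substrate_scope.csv"),  # reuse step 3b output from the earlier run
    cleaned_csv,
    Path("out/final.csv"),
)
print(f"Final output: {final_csv}")
```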
debase/enzyme_lineage_extractor.py
CHANGED
@@ -823,7 +823,14 @@ def identify_evolution_locations(
 
 def _parse_variants(data: Dict[str, Any], campaign_id: Optional[str] = None) -> List[Variant]:
     """Convert raw JSON to a list[Variant] with basic validation."""
-
+    if isinstance(data, list):
+        # Direct array of variants
+        variants_json = data
+    elif isinstance(data, dict):
+        # Object with "variants" key
+        variants_json = data.get("variants", [])
+    else:
+        variants_json = []
     parsed: List[Variant] = []
     for item in variants_json:
         try:
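The practical effect of the new preamble is that a model response parses whether it arrives as a bare JSON array or as an object wrapping one. A standalone illustration of the same branching (a sketch, not the module's code):

```python
import json

def normalize(data):
    # Same tolerance as the new _parse_variants preamble
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        return data.get("variants", [])
    return []

direct_array = json.loads('[{"variant_id": "WT"}, {"variant_id": "M1"}]')
wrapped_object = json.loads('{"variants": [{"variant_id": "WT"}, {"variant_id": "M1"}]}')
assert normalize(direct_array) == normalize(wrapped_object)
```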
@@ -1283,13 +1290,40 @@ def get_lineage(
 
         log.info(f"Identified {len(campaigns)} distinct campaigns")
         for camp in campaigns:
             log.info(f"  - {camp.campaign_name}: {camp.description}")
+    else:
+        log.warning("No campaigns identified, creating default campaign for enzyme characterization")
+        # Create a default campaign when none are found
+        default_campaign = Campaign(
+            campaign_id="default_characterization",
+            campaign_name="Enzyme Characterization Study",
+            description="Default campaign for papers that characterize existing enzyme variants without describing new directed evolution",
+            model_substrate="Unknown",
+            model_product="Unknown",
+            data_locations=["Full manuscript text"]
+        )
+        campaigns = [default_campaign]
+        log.info(f"Created default campaign: {default_campaign.campaign_name}")
 
     # Use captions for identification - they're concise and focused
     locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
 
     all_variants = []
 
-    if
+    if campaigns:
+        # If we have campaigns but no specific locations, use general extraction
+        if not locations:
+            log.info("No specific lineage locations found, extracting from full text with campaign context")
+            # Extract lineage for each campaign using full text
+            for campaign in campaigns:
+                log.info(f"Processing campaign: {campaign.campaign_id}")
+                campaign_variants = extract_campaign_lineage(
+                    full_text, model, campaign_id=campaign.campaign_id,
+                    debug_dir=debug_dir, pdf_paths=pdf_paths,
+                    campaign_info=campaign
+                )
+                all_variants.extend(campaign_variants)
+            return all_variants, campaigns
+        # Original logic for when we have both locations and campaigns
     # Log location information
     location_summary = []
     for loc in locations[:5]:
@@ -1939,6 +1973,173 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
 
         log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
         return {}
 
+def extract_enzyme_info_with_gemini(
+    text: str,
+    variants: List[Variant],
+    model,
+) -> Dict[str, str]:
+    """Use Gemini to extract enzyme names or sequences when PDB IDs are not available.
+
+    Returns:
+        Dict mapping variant IDs to sequences
+    """
+    # Build variant info for context
+    variant_info = []
+    for v in variants[:10]:  # Limit to first 10 variants for context
+        info = {
+            "id": v.variant_id,
+            "mutations": v.mutations[:5] if v.mutations else [],  # Limit mutations shown
+            "parent": v.parent_id,
+            "generation": v.generation
+        }
+        variant_info.append(info)
+
+    prompt = f"""You are analyzing a scientific paper about enzyme engineering. No PDB IDs were found in the paper, and I need to obtain protein sequences for the enzyme variants described.
+
+Here are the variants found in the paper:
+{json.dumps(variant_info, indent=2)}
+
+Please analyze the paper text and:
+1. Identify the common name of the enzyme being studied (e.g., "P450 BM3", "cytochrome P450 BM3", "CYP102A1")
+2. If possible, extract or find the wild-type sequence
+3. Provide any UniProt IDs or accession numbers mentioned
+
+Paper text (first 5000 characters):
+{text[:5000]}
+
+Return your response as a JSON object with this structure:
+{{
+    "enzyme_name": "common name of the enzyme",
+    "systematic_name": "systematic name if applicable (e.g., CYP102A1)",
+    "uniprot_id": "UniProt ID if found",
+    "wild_type_sequence": "sequence if found in paper or if you know it",
+    "additional_names": ["list", "of", "alternative", "names"]
+}}
+
+If you cannot determine certain fields, set them to null.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text_response = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text_response.startswith("```"):
+            text_response = text_response.split("```")[1].strip()
+            if text_response.startswith("json"):
+                text_response = text_response[4:].strip()
+            text_response = text_response.split("```")[0].strip()
+
+        enzyme_info = json.loads(text_response)
+        log.info(f"Gemini extracted enzyme info: {enzyme_info.get('enzyme_name', 'Unknown')}")
+
+        sequences = {}
+
+        # If Gemini provided a sequence directly, use it
+        if enzyme_info.get("wild_type_sequence"):
+            # Clean the sequence
+            seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
+            # Validate it looks like a protein sequence
+            if seq and all(c in "ACDEFGHIKLMNPQRSTVWY" for c in seq) and len(seq) > 50:
+                # Map to the first variant or wild-type
+                wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
+                if wt_variant:
+                    sequences[wt_variant.variant_id] = seq
+                else:
+                    sequences[variants[0].variant_id] = seq
+                log.info(f"Using sequence from Gemini: {len(seq)} residues")
+
+        # If no sequence but we have names, try to fetch from UniProt
+        if not sequences:
+            names_to_try = []
+            if enzyme_info.get("enzyme_name"):
+                names_to_try.append(enzyme_info["enzyme_name"])
+            if enzyme_info.get("systematic_name"):
+                names_to_try.append(enzyme_info["systematic_name"])
+            if enzyme_info.get("uniprot_id"):
+                names_to_try.append(enzyme_info["uniprot_id"])
+            if enzyme_info.get("additional_names"):
+                names_to_try.extend(enzyme_info["additional_names"])
+
+            # Try each name with UniProt
+            for name in names_to_try:
+                if name:
+                    uniprot_seqs = fetch_sequence_by_name(name)
+                    if uniprot_seqs:
+                        # Map the first sequence to appropriate variant
+                        seq = list(uniprot_seqs.values())[0]
+                        wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
+                        if wt_variant:
+                            sequences[wt_variant.variant_id] = seq
+                        else:
+                            sequences[variants[0].variant_id] = seq
+                        log.info(f"Found sequence via UniProt search for '{name}': {len(seq)} residues")
+                        break
+
+        return sequences
+
+    except Exception as e:
+        log.warning(f"Failed to extract enzyme info with Gemini: {e}")
+        return {}
+
+
+def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
+    """Fetch protein sequences from UniProt by enzyme name or ID.
+
+    Args:
+        enzyme_name: Name, ID, or accession of the enzyme
+
+    Returns:
+        Dict mapping identifiers to sequences
+    """
+    import requests
+
+    clean_name = enzyme_name.strip()
+
+    # First try as accession number
+    if len(clean_name) <= 10 and (clean_name[0].isalpha() and clean_name[1:].replace("_", "").isalnum()):
+        # Looks like a UniProt accession
+        url = f"https://rest.uniprot.org/uniprotkb/{clean_name}"
+        try:
+            response = requests.get(url, timeout=10)
+            if response.status_code == 200:
+                data = response.json()
+                sequence = data.get('sequence', {}).get('value', '')
+                if sequence:
+                    return {clean_name: sequence}
+        except:
+            pass
+
+    # Try search API
+    url = "https://rest.uniprot.org/uniprotkb/search"
+    params = {
+        "query": f'(protein_name:"{clean_name}" OR gene:"{clean_name}" OR id:"{clean_name}")',
+        "format": "json",
+        "size": "5",
+        "fields": "accession,id,protein_name,gene_names,sequence"
+    }
+
+    try:
+        response = requests.get(url, params=params, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+
+        results = data.get('results', [])
+        sequences = {}
+
+        for result in results[:1]:  # Just take the first match
+            sequence = result.get('sequence', {}).get('value', '')
+            if sequence:
+                sequences[clean_name] = sequence
+                break
+
+        return sequences
+
+    except Exception as e:
+        log.warning(f"Failed to fetch sequence for '{enzyme_name}': {e}")
+        return {}
+
+
 def match_pdb_to_variants(
     pdb_sequences: Dict[str, str],
     variants: List[Variant],
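The fence-stripping dance above (`text_response.split("```")…`) reappears, slightly varied, in the wrapper.py diff below. A sketch of a shared helper both call sites could delegate to (hypothetical name, not part of the package):

```python
def strip_code_fences(text: str) -> str:
    """Return the payload of a ```-fenced LLM response, or the text unchanged."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    body = text.split("```")[1]
    if body.startswith("json"):
        body = body[4:]  # drop the language tag
    return body.strip()

assert strip_code_fences('```json\n{"a": 1}\n```') == '{"a": 1}'
assert strip_code_fences('{"a": 1}') == '{"a": 1}'
```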
@@ -2110,16 +2311,23 @@ def _merge_lineage_and_sequences(
 
         for v in lineage
     ])
 
-
-
-
-
-
-
-
-
-
-
+    if seqs:
+        df_seq = pd.DataFrame([
+            {
+                "variant_id": s.variant_id,
+                "aa_seq": s.aa_seq,
+                "dna_seq": s.dna_seq,
+                "seq_confidence": s.confidence,
+                "truncated": s.truncated,
+                "seq_source": s.metadata.get("source", None) if s.metadata else None,
+            }
+            for s in seqs
+        ])
+    else:
+        # Create empty DataFrame with correct columns for merging
+        df_seq = pd.DataFrame(columns=[
+            "variant_id", "aa_seq", "dna_seq", "seq_confidence", "truncated", "seq_source"
+        ])
 
     # Log sequence data info
     if len(df_seq) > 0:
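Why the else branch matters: merging against a bare empty DataFrame raises a KeyError on the missing join column, while an empty frame with the right schema just yields NaN-filled columns. A quick illustration (assuming pandas):

```python
import pandas as pd

df_lineage = pd.DataFrame({"variant_id": ["WT", "M1"]})

# Empty but schema-correct, as in the new else branch: merge succeeds with NaNs
df_seq = pd.DataFrame(columns=["variant_id", "aa_seq", "dna_seq",
                               "seq_confidence", "truncated", "seq_source"])
merged = df_lineage.merge(df_seq, on="variant_id", how="left")
print(merged["aa_seq"].isna().all())  # True

# A bare pd.DataFrame() has no variant_id column, so the same merge raises KeyError
try:
    df_lineage.merge(pd.DataFrame(), on="variant_id", how="left")
except KeyError as e:
    print("merge failed:", e)
```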
@@ -2397,7 +2605,7 @@ def run_pipeline(
 
     early_df = _lineage_to_dataframe(lineage)
     output_csv_path = Path(output_csv)
     # Save lineage-only data with specific filename
-    lineage_path = output_csv_path.parent / "
+    lineage_path = output_csv_path.parent / "enzyme_lineage_name.csv"
     early_df.to_csv(lineage_path, index=False)
     log.info(
         "Saved lineage-only CSV -> %s",
@@ -2461,6 +2669,36 @@ def run_pipeline(
 
                 log.warning(f"No sequences found in PDB {pdb_id}")
         else:
             log.warning("No PDB IDs found in paper")
+
+        # 4b. If still no sequences, try Gemini extraction as last resort
+        if not sequences or all(not s.aa_seq for s in sequences):
+            log.info("No sequences from PDB, attempting Gemini-based extraction...")
+
+            gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
+
+            if gemini_sequences:
+                # Convert to SequenceBlock objects
+                gemini_seq_blocks = []
+                for variant_id, seq in gemini_sequences.items():
+                    # Find the matching variant
+                    variant = next((v for v in lineage if v.variant_id == variant_id), None)
+                    if variant:
+                        seq_block = SequenceBlock(
+                            variant_id=variant.variant_id,
+                            aa_seq=seq,
+                            dna_seq=None,
+                            confidence=0.9,  # High confidence but slightly lower than PDB
+                            truncated=False,
+                            metadata={"source": "Gemini/UniProt"}
+                        )
+                        gemini_seq_blocks.append(seq_block)
+                        log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")
+
+                if gemini_seq_blocks:
+                    sequences = gemini_seq_blocks
+                    log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
+                else:
+                    log.warning("Failed to extract sequences via Gemini")
 
     # 5. Merge & score (Section 8) --------------------------------------------
     doi = extract_doi(manuscript)
debase/lineage_format.py
CHANGED
@@ -188,11 +188,17 @@ class VariantRecord:
 
     # Reaction-related -------------------------------------------------------------
     def substrate_iupac(self) -> List[str]:
         raw = str(self.row.get("substrate_iupac_list", "")).strip()
-
+        result = _split_list(raw)
+        if not result and raw and raw.lower() != 'nan':
+            log.debug(f"substrate_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
+        return result
 
     def product_iupac(self) -> List[str]:
         raw = str(self.row.get("product_iupac_list", "")).strip()
-
+        result = _split_list(raw)
+        if not result and raw and raw.lower() != 'nan':
+            log.debug(f"product_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
+        return result
 
 
     def ttn_or_yield(self) -> Optional[float]:
@@ -377,6 +383,53 @@ def _nt_mut(parent_aa: str, child_aa: str, parent_nt: str = "", child_nt: str =
 
 
 # === 6. SMILES CONVERSION HELPERS ==================================================
 
+def search_smiles_with_gemini(compound_name: str, model=None) -> Optional[str]:
+    """
+    Use Gemini to search for SMILES strings of complex compounds.
+    Returns SMILES string if found, None otherwise.
+    """
+    if not compound_name or compound_name.lower() in ['nan', 'none', '']:
+        return None
+
+    if not model:
+        try:
+            # Import get_model from enzyme_lineage_extractor
+            import sys
+            from pathlib import Path
+            sys.path.append(str(Path(__file__).parent))
+            from enzyme_lineage_extractor import get_model
+            model = get_model()
+        except Exception as e:
+            log.warning(f"Could not load Gemini model: {e}")
+            return None
+
+    prompt = f"""Search for the SMILES string representation of this chemical compound:
+"{compound_name}"
+
+IMPORTANT:
+- Do NOT generate or create a SMILES string
+- Only provide SMILES that you can find in chemical databases or literature
+- For deuterated compounds, search for the specific isotope-labeled SMILES
+- If you cannot find the exact SMILES, say "NOT FOUND"
+
+Return ONLY the SMILES string if found, or "NOT FOUND" if not found.
+No explanation or additional text."""
+
+    try:
+        response = model.generate_content(prompt)
+        result = response.text.strip()
+
+        if result and result != "NOT FOUND" and not result.startswith("I"):
+            # Basic validation that it looks like SMILES
+            if any(c in result for c in ['C', 'c', 'N', 'O', 'S', 'P', '[', ']', '(', ')']):
+                log.info(f"Gemini found SMILES for '{compound_name}': {result}")
+                return result
+        return None
+    except Exception as e:
+        log.debug(f"Gemini SMILES search failed for '{compound_name}': {e}")
+        return None
+
+
 def _split_list(raw: str) -> List[str]:
     if not raw or str(raw).lower() == 'nan':
         return []
@@ -429,7 +482,12 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
 
     except FileNotFoundError:
         pass  # OPSIN not installed
 
-    # 3.
+    # 3. Gemini search (for complex compounds) ---------------------------------
+    gemini_smiles = search_smiles_with_gemini(name)
+    if gemini_smiles:
+        return gemini_smiles
+
+    # 4. PubChem PUG REST (online) ---------------------------------------------
     try:
         import requests
 
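The body of step 4 is cut off in this hunk; for orientation, a minimal sketch of the kind of PubChem PUG REST lookup the comment names (standard public endpoint, not necessarily the module's exact request):

```python
import requests
from urllib.parse import quote

def pubchem_name_to_smiles(name: str, timeout: int = 10):
    # PUG REST: resolve a compound name to its isomeric SMILES
    url = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
        f"{quote(name)}/property/IsomericSMILES/JSON"
    )
    resp = requests.get(url, timeout=timeout)
    if resp.status_code != 200:
        return None
    props = resp.json().get("PropertyTable", {}).get("Properties", [])
    return props[0].get("IsomericSMILES") if props else None

print(pubchem_name_to_smiles("ethanol"))  # CCO
```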
@@ -538,13 +596,23 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: D
 
 
 def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
     """Infer lineage roots using generation numbers and simple sequence similarity."""
-
+    # Create idmap, handling missing enzyme_id gracefully
+    idmap: Dict[str, Dict[str, str]] = {}
+    for _, r in df.iterrows():
+        eid = r.get("enzyme_id")
+        if pd.isna(eid) or str(eid).strip() == "":
+            continue
+        idmap[str(eid)] = r
     roots: Dict[str, str] = {}
     # Look for generation 0 as the root
-    gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+    gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+            if str(r.get("generation", "")).strip() == "0"
+            and not pd.isna(r.get("enzyme_id"))}
     # If no gen0 found, fall back to gen1
     if not gen0:
-        gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+        gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+                if str(r.get("generation", "")).strip() == "1"
+                and not pd.isna(r.get("enzyme_id"))}
 
     def _seq_sim(a: str, b: str) -> float:
         if not a or not b:
@@ -553,7 +621,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
 
         return matches / max(len(a), len(b))
 
     for _, row in df.iterrows():
-        eid = row
+        eid = row.get("enzyme_id")
+        if pd.isna(eid) or str(eid).strip() == "":
+            continue
         if eid in gen0:
             roots[eid] = eid
             continue
@@ -593,6 +663,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
 
 
 def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Main public API: returns a DataFrame in the flat output format."""
+    log.info(f"Starting flatten_dataframe with {len(df)} input rows")
+    log.info(f"Input columns: {list(df.columns)}")
+
     # Apply column aliases to the dataframe
     for alias, canonical in COLUMN_ALIASES.items():
         if alias in df.columns and canonical not in df.columns:
@@ -621,8 +694,29 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
     # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)
 
     # 3. Flatten rows ---------------------------------------------------------
-
+    # Create idmap for parent lookups, but note this will only keep last occurrence of duplicates
+    idmap = {}
+    for _, r in df.iterrows():
+        eid = str(r["enzyme_id"])
+        if eid in idmap:
+            log.debug(f"Overwriting duplicate enzyme_id in idmap: {eid}")
+        idmap[eid] = r.to_dict()
+
+    # Check for duplicate enzyme_ids
+    enzyme_ids = [str(r["enzyme_id"]) for _, r in df.iterrows()]
+    unique_ids = set(enzyme_ids)
+    if len(enzyme_ids) != len(unique_ids):
+        log.warning(f"Found duplicate enzyme_ids! Total: {len(enzyme_ids)}, Unique: {len(unique_ids)}")
+        from collections import Counter
+        id_counts = Counter(enzyme_ids)
+        duplicates = {k: v for k, v in id_counts.items() if v > 1}
+        log.warning(f"Duplicate enzyme_ids: {duplicates}")
+        log.info("Note: All rows will still be processed, but parent lookups may use the last occurrence of duplicate IDs")
+
     output_rows: List[Dict[str, str]] = []
+    skipped_count = 0
+    processed_count = 0
+
     for idx, (_, row) in enumerate(df.iterrows()):
         rec = VariantRecord(row.to_dict())
         eid = rec.eid
@@ -632,13 +726,19 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
         prods = rec.product_iupac()
         data_type = rec.row.get("data_type", "")
 
-        if not
-        # Skip entries without
+        if not prods:
+            # Skip entries without product info unless it's marked as lineage only
             if data_type == "lineage":
                 subs, prods = [""], [""]  # placeholders
             else:
-                log.
+                log.info(f"Skipping enzyme_id={eid} (row {idx}) due to missing product data. prods={prods}, data_type={data_type}")
+                skipped_count += 1
                 continue
+
+        # If no substrates but we have products, use empty substrate list
+        if not subs:
+            log.debug(f"Empty substrate list for enzyme_id={eid}, using empty placeholder")
+            subs = [""]
 
         sub_smiles = [sub_cache.get(s, "") for s in subs]
         prod_smiles = [prod_cache.get(p, "") for p in prods]
@@ -712,7 +812,9 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
             additional_information=additional_information,
         )
         output_rows.append(flat.as_dict())
+        processed_count += 1
 
+    log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
     out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
     return out_df
 
debase/reaction_info_extractor.py
CHANGED
@@ -761,6 +761,15 @@ Ignore locations that contain data for other campaigns.
 
                 return line
         return page[:800]
 
+    def _ensure_rgb_pixmap(self, pix: fitz.Pixmap) -> fitz.Pixmap:
+        """Ensure pixmap is in RGB colorspace for PIL compatibility."""
+        if pix.alpha:  # RGBA -> RGB
+            pix = fitz.Pixmap(fitz.csRGB, pix)
+        elif pix.colorspace and pix.colorspace.name not in ["DeviceRGB", "DeviceGray"]:
+            # Convert unsupported colorspaces (CMYK, LAB, etc.) to RGB
+            pix = fitz.Pixmap(fitz.csRGB, pix)
+        return pix
+
     # ---- NEW: Page image helper for both figures and tables ----
     def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
         """Export the page containing the reference as PNG.
@@ -802,14 +811,14 @@ Ignore locations that contain data for other campaigns.
 
                 if img_rect.y1 < cap_rect.y0:  # fully above caption
                     # Extract image bytes
                     pix = fitz.Pixmap(doc, xref)
-
-                    pix = fitz.Pixmap(fitz.csRGB, pix)
+                    pix = self._ensure_rgb_pixmap(pix)
                     img_bytes = pix.tobytes("png")
                     return b64encode(img_bytes).decode()
         else:
             # Extract the entire page as an image
             mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
             pix = page.get_pixmap(matrix=mat)
+            pix = self._ensure_rgb_pixmap(pix)
             img_bytes = pix.tobytes("png")
             return b64encode(img_bytes).decode()
         return None
@@ -842,11 +851,13 @@ Ignore locations that contain data for other campaigns.
 
             # Add the current page
             mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
             pix = doc.load_page(page_num).get_pixmap(matrix=mat)
+            pix = self._ensure_rgb_pixmap(pix)
             all_images.append(pix)
 
             # If this is the last page with the reference, also add the next page
             if i == len(pages) - 1 and page_num + 1 < doc.page_count:
                 next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
+                next_pix = self._ensure_rgb_pixmap(next_pix)
                 all_images.append(next_pix)
                 LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
 
@@ -855,14 +866,16 @@ Ignore locations that contain data for other campaigns.
 
 
         # If only one page, return it directly
         if len(all_images) == 1:
-
+            pix = self._ensure_rgb_pixmap(all_images[0])
+            return b64encode(pix.tobytes("png")).decode()
 
         # Combine multiple pages vertically
         if not all_images:
             return None
 
         if len(all_images) == 1:
-
+            pix = self._ensure_rgb_pixmap(all_images[0])
+            return b64encode(pix.tobytes("png")).decode()
 
         # Calculate dimensions for combined image
         total_height = sum(pix.height for pix in all_images)
@@ -903,6 +916,7 @@ Ignore locations that contain data for other campaigns.
 
         # Convert the page to a pixmap
         mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for quality
         combined_pix = page.get_pixmap(matrix=mat)
+        combined_pix = self._ensure_rgb_pixmap(combined_pix)
 
         # Convert to PNG and return
         img_bytes = combined_pix.tobytes("png")
@@ -947,8 +961,9 @@ Ignore locations that contain data for other campaigns.
 
             LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch_vision"
         else:
-            # Add enzyme names to prompt for batch extraction
-
+            # Add enzyme names to prompt for batch extraction with explicit format requirement
+            format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
+            prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\nReturn a JSON object with enzyme names as keys, each containing the metrics.\nExample format: {format_example}\n\n=== CONTEXT ===\n" + snippet[:4000]
             LOGGER.info("Gemini: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch"
 
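What `_ensure_rgb_pixmap` buys: Pillow can only interpret PyMuPDF's raw sample buffer when it is plain RGB or grayscale, so alpha or CMYK pixmaps must be converted before handing the bytes over. A sketch of the round-trip (assuming PyMuPDF and Pillow; the path is illustrative):

```python
import fitz  # PyMuPDF
from PIL import Image

doc = fitz.open("manuscript.pdf")
pix = doc.load_page(0).get_pixmap(matrix=fitz.Matrix(2.0, 2.0))

# Same normalization as _ensure_rgb_pixmap: strip alpha / exotic colorspaces
if pix.alpha or (pix.colorspace and pix.colorspace.name not in ("DeviceRGB", "DeviceGray")):
    pix = fitz.Pixmap(fitz.csRGB, pix)

# Pillow can now read the raw sample buffer directly
mode = "RGB" if pix.n == 3 else "L"
img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)
img.save("page0.png")
```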
debase/wrapper.py
CHANGED
@@ -46,101 +46,333 @@ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
 
     """
     Step 2: Clean and validate protein sequences
     Calls: cleanup_sequence.py
+    Returns output path even if cleanup fails (copies input file)
     """
     logger.info(f"Cleaning sequences from {input_csv.name}")
 
-
-
-
-
-
+    try:
+        from .cleanup_sequence import main as cleanup_sequences
+        cleanup_sequences([str(input_csv), str(output_csv)])
+
+        logger.info(f"Sequence cleanup complete: {output_csv}")
+        return output_csv
+
+    except Exception as e:
+        logger.warning(f"Sequence cleanup failed: {e}")
+        logger.info("Copying original file to continue pipeline...")
+
+        # Copy the input file as-is to continue pipeline
+        import shutil
+        shutil.copy2(input_csv, output_csv)
+
+        logger.info(f"Original file copied: {output_csv}")
+        return output_csv
 
 
 def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
     """
     Step 3a: Extract reaction performance metrics
     Calls: reaction_info_extractor.py
+    Returns output path even if extraction fails (creates empty file)
     """
     logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        from .reaction_info_extractor import ReactionExtractor, Config
+        import pandas as pd
+
+        # Load enzyme data
+        enzyme_df = pd.read_csv(lineage_csv)
+
+        # Initialize extractor and run
+        cfg = Config()
+        extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
+        df_metrics = extractor.run(enzyme_df)
+
+        # Save results
+        df_metrics.to_csv(output, index=False)
+        logger.info(f"Reaction extraction complete: {output}")
+        return output
+
+    except Exception as e:
+        logger.warning(f"Reaction extraction failed: {e}")
+        logger.info("Creating empty reaction info file to continue pipeline...")
+
+        # Create empty reaction CSV with basic columns
+        import pandas as pd
+        empty_df = pd.DataFrame(columns=[
+            'enzyme', 'substrate', 'product', 'yield_percent', 'ee_percent',
+            'conversion_percent', 'reaction_type', 'reaction_conditions', 'notes'
+        ])
+        empty_df.to_csv(output, index=False)
+
+        logger.info(f"Empty reaction file created: {output}")
+        return output
 
 
 def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
     """
     Step 3b: Extract substrate scope data (runs in parallel with reaction extraction)
     Calls: substrate_scope_extractor.py
+    Returns output path even if extraction fails (creates empty file)
     """
     logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
 
-
+    try:
+        from .substrate_scope_extractor import run_pipeline
+
+        # Run substrate scope extraction
+        run_pipeline(
+            manuscript=manuscript,
+            si=si,
+            lineage_csv=lineage_csv,
+            output_csv=output,
+            debug_dir=debug_dir
+        )
+
+        logger.info(f"Substrate scope extraction complete: {output}")
+        return output
+
+    except Exception as e:
+        logger.warning(f"Substrate scope extraction failed: {e}")
+        logger.info("Creating empty substrate scope file to continue pipeline...")
+
+        # Create empty substrate scope CSV with proper headers
+        import pandas as pd
+        empty_df = pd.DataFrame(columns=[
+            'enzyme', 'substrate', 'product', 'yield_percent', 'ee_percent',
+            'conversion_percent', 'selectivity', 'reaction_conditions', 'notes'
+        ])
+        empty_df.to_csv(output, index=False)
+
+        logger.info(f"Empty substrate scope file created: {output}")
+        return output
+
+
+def match_enzyme_variants_with_gemini(lineage_enzymes: list, data_enzymes: list, model=None) -> dict:
+    """
+    Use Gemini to match enzyme variant IDs between different datasets.
+    Returns a mapping of data_enzyme_id -> lineage_enzyme_id.
+    """
+    import json
 
-
-
-
-
-
-
-
-    )
+    if not model:
+        try:
+            from .enzyme_lineage_extractor import get_model
+            model = get_model()
+        except:
+            logger.warning("Could not load Gemini model for variant matching")
+            return {}
 
-
-
+    prompt = f"""Match enzyme variant IDs between two lists from the same scientific paper.
+
+These lists come from different sections or analyses of the same study, but may use different naming conventions.
+
+List 1 (from lineage/sequence data):
+{json.dumps(lineage_enzymes)}
+
+List 2 (from experimental data):
+{json.dumps(data_enzymes)}
+
+Analyze the patterns and match variants that refer to the same enzyme.
+Return ONLY a JSON object mapping IDs from List 2 to their corresponding IDs in List 1.
+Format: {{"list2_id": "list1_id", ...}}
+Only include matches you are confident about based on the naming patterns.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        mapping_text = response.text.strip()
+
+        # Extract JSON from response
+        if '```json' in mapping_text:
+            mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
+        elif '```' in mapping_text:
+            mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
+
+        mapping = json.loads(mapping_text)
+        logger.info(f"Gemini matched {len(mapping)} enzyme variants")
+        for k, v in mapping.items():
+            logger.info(f"  Matched '{k}' -> '{v}'")
+        return mapping
+    except Exception as e:
+        logger.warning(f"Failed to match variants with Gemini: {e}")
+        return {}
 
 
 def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_csv: Path, output_csv: Path) -> Path:
     """
     Step 4: Format and merge all data into final CSV
-
+    Creates comprehensive format merging all available data, even if some extraction steps failed
     """
     logger.info(f"Formatting and merging data into final output")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        import pandas as pd
+
+        # Read all available data files
+        logger.info("Reading enzyme lineage data...")
+        df_lineage = pd.read_csv(cleaned_csv)
+
+        logger.info("Reading reaction data...")
+        try:
+            df_reaction = pd.read_csv(reaction_csv)
+            has_reaction_data = len(df_reaction) > 0 and not df_reaction.empty
+        except:
+            df_reaction = pd.DataFrame()
+            has_reaction_data = False
+
+        logger.info("Reading substrate scope data...")
+        try:
+            df_scope = pd.read_csv(substrate_scope_csv)
+            has_scope_data = len(df_scope) > 0 and not df_scope.empty
+        except:
+            df_scope = pd.DataFrame()
+            has_scope_data = False
+
+        # Start with lineage data as base
+        df_final = df_lineage.copy()
+
+        # Ensure consistent enzyme ID column
+        if 'variant_id' in df_final.columns and 'enzyme_id' not in df_final.columns:
+            df_final = df_final.rename(columns={'variant_id': 'enzyme_id'})
+
+        # Merge reaction data if available
+        if has_reaction_data:
+            logger.info(f"Merging reaction data ({len(df_reaction)} records)")
+            # Match on enzyme_id or enzyme
+            merge_key = 'enzyme_id' if 'enzyme_id' in df_reaction.columns else 'enzyme'
+            if merge_key in df_reaction.columns:
+                df_final = df_final.merge(df_reaction, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_reaction'))
+        else:
+            logger.info("No reaction data available")
+
+        # Merge substrate scope data if available
+        if has_scope_data:
+            logger.info(f"Merging substrate scope data ({len(df_scope)} records)")
+            merge_key = 'enzyme_id' if 'enzyme_id' in df_scope.columns else 'enzyme'
+
+            if merge_key in df_scope.columns:
+                # First try direct merge
+                df_test_merge = df_final.merge(df_scope, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
+
+                # Check if any matches were found
+                matched_count = df_test_merge[merge_key + '_scope'].notna().sum() if merge_key + '_scope' in df_test_merge.columns else 0
+
+                if matched_count == 0:
+                    logger.info("No direct matches found, using Gemini to match enzyme variants...")
+
+                    # Get unique enzyme IDs from both datasets
+                    lineage_enzymes = df_final['enzyme_id'].dropna().unique().tolist()
+                    scope_enzymes = df_scope[merge_key].dropna().unique().tolist()
+
+                    # Get mapping from Gemini
+                    mapping = match_enzyme_variants_with_gemini(lineage_enzymes, scope_enzymes)
+
+                    if mapping:
+                        # Apply mapping to scope data
+                        df_scope_mapped = df_scope.copy()
+                        df_scope_mapped[merge_key] = df_scope_mapped[merge_key].map(lambda x: mapping.get(x, x))
+                        df_final = df_final.merge(df_scope_mapped, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
+                    else:
+                        logger.warning("Could not match enzyme variants between datasets")
+                        df_final = df_test_merge
+                else:
+                    df_final = df_test_merge
+                    logger.info(f"Direct merge matched {matched_count} records")
+        else:
+            logger.info("No substrate scope data available")
+
+        # Add comprehensive column structure for missing data
+        essential_columns = [
+            'enzyme_id', 'parent_id', 'generation', 'mutations', 'campaign_id', 'notes',
+            'aa_seq', 'dna_seq', 'seq_confidence', 'truncated', 'seq_source', 'doi',
+            'substrate_list', 'substrate_iupac_list', 'product_list', 'product_iupac_list',
+            'cofactor_list', 'cofactor_iupac_list', 'yield', 'ee', 'ttn',
+            'reaction_temperature', 'reaction_ph', 'reaction_buffer', 'reaction_other_conditions',
+            'data_location'
+        ]
+
+        # Add missing columns with NaN
+        for col in essential_columns:
+            if col not in df_final.columns:
+                df_final[col] = None
+
+        # Clean up duplicate columns from merging
+        columns_to_keep = []
+        seen_base_names = set()
+        for col in df_final.columns:
+            base_name = col.split('_reaction')[0].split('_scope')[0]
+            if base_name not in seen_base_names:
+                columns_to_keep.append(col)
+                seen_base_names.add(base_name)
+            elif col.endswith('_scope') or col.endswith('_reaction'):
+                # Prefer scope or reaction data over base lineage data for certain columns
+                if base_name in ['substrate_list', 'product_list', 'yield', 'ee', 'reaction_temperature']:
+                    columns_to_keep.append(col)
+                    # Remove the base column if it exists
+                    if base_name in columns_to_keep:
+                        columns_to_keep.remove(base_name)
+                    seen_base_names.add(base_name)
+
+        df_final = df_final[columns_to_keep]
+
+        # Rename merged columns back to standard names
+        rename_map = {}
+        for col in df_final.columns:
+            if col.endswith('_scope') or col.endswith('_reaction'):
+                base_name = col.split('_scope')[0].split('_reaction')[0]
+                rename_map[col] = base_name
+        df_final = df_final.rename(columns=rename_map)
+
+        # Save the comprehensive final output
+        df_final.to_csv(output_csv, index=False)
+
+        logger.info(f"Final comprehensive format complete: {output_csv}")
+        logger.info(f"Final output contains {len(df_final)} variants with {len(df_final.columns)} data columns")
+
+        # Log what data was successfully merged
+        if has_reaction_data:
+            logger.info("✓ Reaction performance data merged")
+        if has_scope_data:
+            logger.info("✓ Substrate scope data merged")
+
+        # Now run the actual lineage format to produce plate-based format
+        logger.info("\nRunning lineage format to produce plate-based output...")
+        try:
+            from .lineage_format import flatten_dataframe
+
+            # Create the plate-based output filename
+            plate_output = output_csv.parent / (output_csv.stem + "_plate_format.csv")
+
+            # Flatten the dataframe to plate format
+            df_flattened = flatten_dataframe(df_final)
+
+            # Save the flattened output
+            df_flattened.to_csv(plate_output, index=False)
+
+            logger.info(f"✓ Plate-based format saved to: {plate_output}")
+            logger.info(f"  Contains {len(df_flattened)} rows with plate/well assignments")
+
+            # Update the final output path to be the plate format
+            output_csv = plate_output
+
+        except Exception as e:
+            logger.warning(f"Could not generate plate-based format: {e}")
+            logger.info("Comprehensive format will be used as final output")
+
+        return output_csv
+
+    except Exception as e:
+        logger.warning(f"Final formatting failed: {e}")
+        logger.info("Using cleaned sequence data as final output...")
+
+        # Copy the cleaned CSV as the final output
+        import shutil
+        shutil.copy2(cleaned_csv, output_csv)
+
+        logger.info(f"Cleaned sequence file used as final output: {output_csv}")
+        return output_csv
 
 
 def run_pipeline(
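The suffix bookkeeping in `run_lineage_format` is easiest to see on a toy example: pandas appends `_scope`/`_reaction` only to colliding column names, so the cleanup pass has to handle both suffixed and unsuffixed forms. A small illustration (assuming pandas):

```python
import pandas as pd

df_final = pd.DataFrame({"enzyme_id": ["WT"], "yield": [None]})
df_scope = pd.DataFrame({"enzyme": ["WT"], "yield": [92.0]})

merged = df_final.merge(df_scope, left_on="enzyme_id", right_on="enzyme",
                        how="left", suffixes=("", "_scope"))
print(list(merged.columns))
# ['enzyme_id', 'yield', 'enzyme', 'yield_scope']
# The colliding 'yield' came back as 'yield_scope'; the cleanup pass keeps it,
# drops the empty base 'yield', then renames 'yield_scope' back to 'yield'.
```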
@@ -206,7 +438,7 @@ def run_pipeline(
 
     # Step 4: Format and merge
     logger.info("\n[Step 4/5] Formatting and merging data...")
-    run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
+    final_output = run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
 
     # Step 5: Finalize
     logger.info("\n[Step 5/5] Finalizing...")
@@ -219,11 +451,13 @@ def run_pipeline(
 
         logger.info("\n" + "="*60)
         logger.info("PIPELINE COMPLETED SUCCESSFULLY")
-        logger.info(f"
+        logger.info(f"Comprehensive output: {output_path}")
+        if final_output != output_path:
+            logger.info(f"Plate-based output: {final_output}")
         logger.info(f"Runtime: {elapsed:.1f} seconds")
         logger.info("="*60)
 
-        return
+        return final_output
 
     except Exception as e:
         logger.error(f"Pipeline failed: {str(e)}")
debase-0.1.18.dist-info/RECORD
ADDED
@@ -0,0 +1,17 @@
+debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=Qd1kKsssesKE5FvJnDdAuZsx_BrxTSJJyt68SK99D54,50
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
+debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
+debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
+debase/reaction_info_extractor.py,sha256=W9CS0puFTdhJ_T2Fpy931EgnjOCsHHjbtU6RdnzDlhw,113140
+debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
+debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
+debase-0.1.18.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.1.18.dist-info/METADATA,sha256=XvSrveJ0Y40c53JYUfiveaQNJ3qoEkxaQ61n3_--1cQ,10790
+debase-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.1.18.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.1.18.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.1.18.dist-info/RECORD,,
debase-0.1.16.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=l25FRqoNjxB5d3qBHsLMMA_9YWsIZ7nJ5BiTLj0qYE8,50
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=jNxNCh8VF0dUFxUlTall0w1-oQojXRXLnWcuPFs5ij8,106879
-debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
-debase/reaction_info_extractor.py,sha256=9DkEZh7TgsxKpFkKbLyUhS_w0Z84LczkDFv-v_NEHE4,112174
-debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
-debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
-debase-0.1.16.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.1.16.dist-info/METADATA,sha256=7sv2OcIuHaoOImkBdoEtRzyOjp9Kuoz2ZmgK4tosaUc,10790
-debase-0.1.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.1.16.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.1.16.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.1.16.dist-info/RECORD,,
{debase-0.1.16.dist-info → debase-0.1.18.dist-info}/WHEEL
File without changes
{debase-0.1.16.dist-info → debase-0.1.18.dist-info}/entry_points.txt
File without changes
{debase-0.1.16.dist-info → debase-0.1.18.dist-info}/licenses/LICENSE
File without changes
{debase-0.1.16.dist-info → debase-0.1.18.dist-info}/top_level.txt
File without changes