debase-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/PIPELINE_FLOW.md +100 -0
- debase/__init__.py +18 -0
- debase/__main__.py +9 -0
- debase/_version.py +3 -0
- debase/build_db.py +190 -0
- debase/cleanup_sequence.py +905 -0
- debase/enzyme_lineage_extractor.py +2169 -0
- debase/lineage_format.py +808 -0
- debase/reaction_info_extractor.py +2331 -0
- debase/substrate_scope_extractor.py +2039 -0
- debase/wrapper.py +303 -0
- debase-0.1.0.dist-info/METADATA +299 -0
- debase-0.1.0.dist-info/RECORD +17 -0
- debase-0.1.0.dist-info/WHEEL +5 -0
- debase-0.1.0.dist-info/entry_points.txt +2 -0
- debase-0.1.0.dist-info/licenses/LICENSE +21 -0
- debase-0.1.0.dist-info/top_level.txt +1 -0
debase/PIPELINE_FLOW.md
ADDED
@@ -0,0 +1,100 @@
# DEBase Pipeline Flow

## Overview

The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.

## Pipeline Architecture

```
┌─────────────────────┐        ┌─────────────────────┐
│   Manuscript PDF    │        │       SI PDF        │
└──────────┬──────────┘        └──────────┬──────────┘
           │                              │
           └──────────────┬───────────────┘
                          │
                          ▼
           ┌─────────────────────────────┐
           │ 1. enzyme_lineage_extractor │
           │ - Extract enzyme variants   │
           │ - Parse mutations           │
           │ - Get basic metadata        │
           └──────────────┬──────────────┘
                          │
                          ▼
           ┌─────────────────────────────┐
           │ 2. cleanup_sequence         │
           │ - Validate sequences        │
           │ - Fix formatting issues     │
           │ - Generate full sequences   │
           └──────────────┬──────────────┘
                          │
             ┌────────────┴─────────────────┐
             │                              │
             ▼                              ▼
┌─────────────────────────┐    ┌─────────────────────────┐
│ 3a. reaction_info       │    │ 3b. substrate_scope     │
│     _extractor          │    │     _extractor          │
│ - Performance metrics   │    │ - Substrate variations  │
│ - Model reaction        │    │ - Additional variants   │
│ - Conditions            │    │ - Scope data            │
└────────────┬────────────┘    └────────────┬────────────┘
             │                              │
             └────────────┬─────────────────┘
                          │
                          ▼
           ┌─────────────────────────────┐
           │ 4. lineage_format           │
           │ - Merge all data            │
           │ - Fill missing sequences    │
           │ - Format final output       │
           └──────────────┬──────────────┘
                          │
                          ▼
                   ┌─────────────┐
                   │  Final CSV  │
                   └─────────────┘
```

## Module Details

### 1. enzyme_lineage_extractor.py
- **Input**: Manuscript PDF, SI PDF
- **Output**: CSV with enzyme variants and mutations
- **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata

### 2. cleanup_sequence.py
- **Input**: Enzyme lineage CSV
- **Output**: CSV with validated sequences
- **Function**: Validates protein sequences, generates full sequences from mutations

### 3a. reaction_info_extractor.py
- **Input**: PDFs + cleaned enzyme CSV
- **Output**: CSV with reaction performance data
- **Function**: Extracts yield, TTN, selectivity, and reaction conditions

### 3b. substrate_scope_extractor.py
- **Input**: PDFs + cleaned enzyme CSV
- **Output**: CSV with substrate scope entries
- **Function**: Extracts substrate variations tested with different enzymes

### 4. lineage_format.py
- **Input**: Reaction CSV + Substrate scope CSV
- **Output**: Final formatted CSV
- **Function**: Merges data, fills missing sequences, applies consistent formatting

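These four modules are re-exported at package level in `debase/__init__.py`, so the pipeline can also be driven from Python. The sketch below only maps each stage to its public callable and deliberately passes no arguments, since every module defines its own `run_pipeline()`/`main()` signature:

```python
# Map each pipeline stage to the callable re-exported by debase/__init__.py.
# Check the individual modules for the actual run_pipeline()/main() arguments.
from debase import (
    extract_lineage,          # 1.  enzyme_lineage_extractor.run_pipeline
    cleanup_sequences,        # 2.  cleanup_sequence.main
    ReactionExtractor,        # 3a. reaction_info_extractor
    extract_substrate_scope,  # 3b. substrate_scope_extractor.run_pipeline
    format_lineage,           # 4.  lineage_format.run_pipeline
)

for stage, obj in [
    ("1.  extract_lineage", extract_lineage),
    ("2.  cleanup_sequences", cleanup_sequences),
    ("3a. ReactionExtractor", ReactionExtractor),
    ("3b. extract_substrate_scope", extract_substrate_scope),
    ("4.  format_lineage", format_lineage),
]:
    print(f"{stage:30} -> {obj.__module__}")
```
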
## Key Features

1. **Modular Design**: Each step can be run independently
2. **Parallel Extraction**: Steps 3a and 3b run independently
3. **Error Recovery**: Pipeline can resume from any step (see the sketch after this list)
4. **Clean Interfaces**: Each module has well-defined inputs/outputs

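Because each step writes its results to a CSV, a resume decision reduces to checking which of those files already exist. A minimal, self-contained sketch; the file names below are hypothetical placeholders, not the names the wrapper actually writes:

```python
# Illustrative resume check. The CSV names are hypothetical placeholders;
# substitute the intermediate paths your wrapper run actually produced.
from __future__ import annotations
from pathlib import Path

STEPS = [
    ("1. enzyme_lineage_extractor",   Path("1_enzyme_lineage.csv")),
    ("2. cleanup_sequence",           Path("2_cleaned_sequences.csv")),
    ("3a. reaction_info_extractor",   Path("3a_reaction_info.csv")),
    ("3b. substrate_scope_extractor", Path("3b_substrate_scope.csv")),
    ("4. lineage_format",             Path("results.csv")),
]

def first_incomplete_step() -> str | None:
    """Return the first step whose output CSV is missing, or None if all exist."""
    for name, output in STEPS:
        if not output.exists():
            return name
    return None

if __name__ == "__main__":
    print("Resume from:", first_incomplete_step() or "nothing - all outputs present")
```
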
## Usage

```bash
# Full pipeline
python -m debase.wrapper manuscript.pdf --si si.pdf --output results.csv

# With intermediate files kept for debugging
python -m debase.wrapper manuscript.pdf --si si.pdf --keep-intermediates
```

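The merged `results.csv` can be sanity-checked with any CSV tool; for example, with pandas (assuming it is installed; the column names are whatever `lineage_format` emits):

```python
# Quick look at the final output written by the wrapper run above.
import pandas as pd

df = pd.read_csv("results.csv")
print(df.shape)              # (rows, columns)
print(df.columns.tolist())   # columns produced by lineage_format
```
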
debase/__init__.py
ADDED
@@ -0,0 +1,18 @@
"""DEBase - Enzyme lineage analysis and sequence extraction package."""

from ._version import __version__

from .enzyme_lineage_extractor import run_pipeline as extract_lineage
from .cleanup_sequence import main as cleanup_sequences
from .reaction_info_extractor import ReactionExtractor
from .substrate_scope_extractor import run_pipeline as extract_substrate_scope
from .lineage_format import run_pipeline as format_lineage

__all__ = [
    "__version__",
    "extract_lineage",
    "cleanup_sequences",
    "ReactionExtractor",
    "extract_substrate_scope",
    "format_lineage",
]

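A quick smoke test of the public surface defined above (assumes the wheel is installed):

```python
# Confirm the package imports and exposes the expected names.
import debase

print(debase.__version__)
print(sorted(debase.__all__))
```
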
debase/__main__.py
ADDED
debase/_version.py
ADDED
debase/build_db.py
ADDED
@@ -0,0 +1,190 @@
"""
Build an IUPAC/Synonym -> SMILES SQLite database from PubChem
and fall back to OPSIN for names not found locally.

New in this version
-------------------
- Downloads and ingests CID-Synonym-filtered.gz
- Adds a simple opsin_lookup() fallback
- locate_db() helper so user code can find the DB

Usage
-----
python build_db.py        # one-time build (3-4 GB disk, <6 GB RAM)
python -m i2s "ethyl 2-(dimethyl(p-tolyl)silyl)propanoate"
"""

from __future__ import annotations
import gzip, sqlite3, urllib.request, pathlib, sys, subprocess, shutil, os
from typing import Optional

# ---------------------------------------------------------------------------
# 0. Where to keep big files?
# ---------------------------------------------------------------------------
# Use local data directory in the project
DATA_DIR = pathlib.Path(__file__).parent.parent.parent / "data"
DL_DIR   = DATA_DIR / "pubchem"
DB_PATH  = DATA_DIR / "iupac2smiles.db"

BASE  = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/"
FILES = {
    "CID-IUPAC.gz"           : "cid_iupac.gz",
    "CID-SMILES.gz"          : "cid_smiles.gz",
    "CID-Synonym-filtered.gz": "cid_synonym.gz",  # NEW
}

# ---------------------------------------------------------------------------
# 1. Download
# ---------------------------------------------------------------------------
def download_all() -> None:
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    DL_DIR.mkdir(parents=True, exist_ok=True)
    for remote, local_name in FILES.items():
        local = DL_DIR / local_name
        if not local.exists():
            print(f"Downloading {remote}")
            urllib.request.urlretrieve(BASE + remote, local)
        else:
            print(f"[OK] {local_name} already present")

# ---------------------------------------------------------------------------
# 2. Build DB (streaming, memory-safe)
# ---------------------------------------------------------------------------
def build_sqlite() -> None:
    db = sqlite3.connect(DB_PATH)
    c = db.cursor()
    c.executescript("""
        PRAGMA journal_mode = OFF;
        PRAGMA synchronous = OFF;
        CREATE TABLE IF NOT EXISTS x(
            name   TEXT PRIMARY KEY,  -- lower-case
            smiles TEXT NOT NULL
        );
    """)

    # 2a. SMILES lookup dict (CID -> SMILES)  ~1.3 GB -> <1 GB RAM
    print("Loading SMILES...")
    cid2smiles: dict[str, str] = {}
    with gzip.open(DL_DIR / "cid_smiles.gz", "rt", encoding="utf-8") as f:
        for line in f:
            cid, smiles = line.rstrip("\n").split("\t")
            cid2smiles[cid] = smiles

    # helper to flush batches
    batch: list[tuple[str, str]] = []
    def canonicalize(name: str) -> str:
        """Canonicalize name: lowercase, strip, collapse spaces"""
        return ' '.join(name.lower().split())
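        # e.g. canonicalize("  Ethyl   Acetate ") -> "ethyl acetate"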

    def flush():
        if batch:
            c.executemany("INSERT OR IGNORE INTO x VALUES(?,?)", batch)
            db.commit()
            batch.clear()

    # 2b. IUPAC table
    print("Merging IUPAC...")
    with gzip.open(DL_DIR / "cid_iupac.gz", "rt", encoding="utf-8") as f:
        for n, line in enumerate(f, 1):
            cid, iupac = line.rstrip("\n").split("\t")
            smiles = cid2smiles.get(cid)
            if smiles:
                batch.append((canonicalize(iupac), smiles))
            if len(batch) == 100_000:
                flush()
                print(f"  ... {n:,} IUPAC rows", end="\r")
    flush()

    # 2c. Synonyms table  **NEW**
    print("\nAdding synonyms...")
    with gzip.open(DL_DIR / "cid_synonym.gz", "rt", encoding="utf-8") as f:
        for n, line in enumerate(f, 1):
            cid, syn = line.rstrip("\n").split("\t")
            smiles = cid2smiles.get(cid)
            if smiles:
                batch.append((canonicalize(syn), smiles))
            if len(batch) == 100_000:
                flush()
            if n % 1_000_000 == 0:
                print(f"  ... {n:,} synonym rows", end="\r")
    flush()

    c.execute("CREATE INDEX IF NOT EXISTS idx_name ON x(name);")
    db.commit()
    print("Build complete")
    db.close()

# ---------------------------------------------------------------------------
# 3. Lookup helpers
# ---------------------------------------------------------------------------
def locate_db(explicit: Optional[str] = None) -> pathlib.Path:
    """Return path to SQLite DB, or raise FileNotFoundError."""
    for p in (
        pathlib.Path(explicit) if explicit else None,
        DB_PATH,  # Primary location in project data folder
        pathlib.Path.cwd() / "data" / "iupac2smiles.db",
        pathlib.Path.home() / ".iupac2smiles.db",
    ):
        if p and p.exists():
            return p
    raise FileNotFoundError("iupac2smiles.db not found; run build_db.py")

def sqlite_lookup(name: str, db_path: pathlib.Path | str = DB_PATH) -> Optional[str]:
    # Canonicalize: lowercase, strip, collapse multiple spaces
    canonical = ' '.join(name.lower().split())
    with sqlite3.connect(db_path) as db:
        row = db.execute("SELECT smiles FROM x WHERE name = ?", (canonical,)).fetchone()
    return row[0] if row else None

# ---------------------------------------------------------------------------
# 4. OPSIN fallback  **NEW**
# ---------------------------------------------------------------------------
def check_opsin_available() -> bool:
    return shutil.which("opsin") is not None  # expects 'opsin' on PATH

def opsin_lookup(name: str) -> Optional[str]:
    """
    Convert an IUPAC name to SMILES via OPSIN CLI.
    Install with:  brew install opsin  (macOS) or
                   conda install -c conda-forge opsin  or
                   download JAR: https://opsin.ch.cam.ac.uk
    """
    if not check_opsin_available():
        return None
    try:
        # OPSIN reads from stdin and outputs SMILES by default
        res = subprocess.run(
            ["opsin"],
            input=name,
            capture_output=True, text=True, check=True, timeout=10,
        )
        smiles = res.stdout.strip()
        return smiles or None
    except subprocess.SubprocessError:
        return None

def iupac_to_smiles(name: str, db_path: pathlib.Path | str | None = None) -> Optional[str]:
    db_path = locate_db(db_path) if db_path else locate_db()
    smi = sqlite_lookup(name, db_path)
    if smi:
        return smi
    # try OPSIN
    return opsin_lookup(name)

# ---------------------------------------------------------------------------
# 5. CLI helper
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    if len(sys.argv) == 1:
        print("Usage:")
        print("  build_db.py download   # fetch PubChem files")
        print("  build_db.py build      # create SQLite")
        print("  python -m i2s NAME     # (see below)")
    elif sys.argv[1] == "download":
        download_all()
    elif sys.argv[1] == "build":
        build_sqlite()
    else:
        # act like a tiny module:  python build_db.py "acetylsalicylic acid"
        q = " ".join(sys.argv[1:])
        print(iupac_to_smiles(q) or "Not found")
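Typical use of this module, sketched from the functions defined above (note that the download and build steps pull several gigabytes from PubChem):

```python
# One-time setup, then name -> SMILES lookups with an OPSIN fallback.
from debase.build_db import download_all, build_sqlite, iupac_to_smiles

download_all()    # fetch CID-IUPAC, CID-SMILES and CID-Synonym-filtered (large downloads)
build_sqlite()    # stream them into the local iupac2smiles.db

# SQLite lookup first; falls back to the OPSIN CLI if the name is missing.
print(iupac_to_smiles("acetylsalicylic acid"))
```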