debase 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {debase-0.1.0 → debase-0.1.1}/PKG-INFO +1 -1
- {debase-0.1.0 → debase-0.1.1}/src/debase/_version.py +1 -1
- {debase-0.1.0 → debase-0.1.1}/src/debase/wrapper.py +5 -7
- {debase-0.1.0 → debase-0.1.1}/src/debase.egg-info/PKG-INFO +1 -1
- {debase-0.1.0 → debase-0.1.1}/src/debase.egg-info/SOURCES.txt +0 -7
- debase-0.1.0/.gitignore +0 -177
- debase-0.1.0/CONTRIBUTING.md +0 -61
- debase-0.1.0/docs/README.md +0 -19
- debase-0.1.0/docs/examples/README.md +0 -24
- debase-0.1.0/environment.yml +0 -21
- debase-0.1.0/src/__init__.py +0 -1
- debase-0.1.0/src/debase/PIPELINE_FLOW.md +0 -100
- {debase-0.1.0 → debase-0.1.1}/LICENSE +0 -0
- {debase-0.1.0 → debase-0.1.1}/MANIFEST.in +0 -0
- {debase-0.1.0 → debase-0.1.1}/README.md +0 -0
- {debase-0.1.0 → debase-0.1.1}/pyproject.toml +0 -0
- {debase-0.1.0 → debase-0.1.1}/setup.cfg +0 -0
- {debase-0.1.0 → debase-0.1.1}/setup.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase/__init__.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase/__main__.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase/build_db.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase/enzyme_lineage_extractor.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase/lineage_format.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase/reaction_info_extractor.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase/substrate_scope_extractor.py +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.1.0 → debase-0.1.1}/src/debase.egg-info/top_level.txt +0 -0
@@ -35,9 +35,7 @@ def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir:
|
|
35
35
|
"""
|
36
36
|
logger.info(f"Extracting enzyme lineage from {manuscript.name}")
|
37
37
|
|
38
|
-
import
|
39
|
-
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
40
|
-
from src.debase.enzyme_lineage_extractor import run_pipeline
|
38
|
+
from .enzyme_lineage_extractor import run_pipeline
|
41
39
|
run_pipeline(manuscript=manuscript, si=si, output_csv=output, debug_dir=debug_dir)
|
42
40
|
|
43
41
|
logger.info(f"Lineage extraction complete: {output}")
|
@@ -51,7 +49,7 @@ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
|
|
51
49
|
"""
|
52
50
|
logger.info(f"Cleaning sequences from {input_csv.name}")
|
53
51
|
|
54
|
-
from
|
52
|
+
from .cleanup_sequence import main as cleanup_sequences
|
55
53
|
cleanup_sequences([str(input_csv), str(output_csv)])
|
56
54
|
|
57
55
|
logger.info(f"Sequence cleanup complete: {output_csv}")
|
@@ -65,7 +63,7 @@ def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, outpu
|
|
65
63
|
"""
|
66
64
|
logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
|
67
65
|
|
68
|
-
from
|
66
|
+
from .reaction_info_extractor import ReactionExtractor, Config
|
69
67
|
import pandas as pd
|
70
68
|
|
71
69
|
# Load enzyme data
|
@@ -89,7 +87,7 @@ def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path
|
|
89
87
|
"""
|
90
88
|
logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
|
91
89
|
|
92
|
-
from
|
90
|
+
from .substrate_scope_extractor import run_pipeline
|
93
91
|
|
94
92
|
# Run substrate scope extraction
|
95
93
|
run_pipeline(
|
@@ -111,7 +109,7 @@ def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_cs
|
|
111
109
|
"""
|
112
110
|
logger.info(f"Formatting and merging data into final output")
|
113
111
|
|
114
|
-
from
|
112
|
+
from .lineage_format import run_pipeline
|
115
113
|
import pandas as pd
|
116
114
|
|
117
115
|
# First, we need to merge the protein sequences into the reaction data
|
@@ -1,15 +1,8 @@
|
|
1
|
-
.gitignore
|
2
|
-
CONTRIBUTING.md
|
3
1
|
LICENSE
|
4
2
|
MANIFEST.in
|
5
3
|
README.md
|
6
|
-
environment.yml
|
7
4
|
pyproject.toml
|
8
5
|
setup.py
|
9
|
-
docs/README.md
|
10
|
-
docs/examples/README.md
|
11
|
-
src/__init__.py
|
12
|
-
src/debase/PIPELINE_FLOW.md
|
13
6
|
src/debase/__init__.py
|
14
7
|
src/debase/__main__.py
|
15
8
|
src/debase/_version.py
|
debase-0.1.0/.gitignore
DELETED
@@ -1,177 +0,0 @@
|
|
1
|
-
# Byte-compiled / optimized / DLL files
|
2
|
-
__pycache__/
|
3
|
-
*.py[cod]
|
4
|
-
*$py.class
|
5
|
-
|
6
|
-
# C extensions
|
7
|
-
*.so
|
8
|
-
|
9
|
-
# Distribution / packaging
|
10
|
-
.Python
|
11
|
-
build/
|
12
|
-
develop-eggs/
|
13
|
-
dist/
|
14
|
-
downloads/
|
15
|
-
eggs/
|
16
|
-
.eggs/
|
17
|
-
lib/
|
18
|
-
lib64/
|
19
|
-
parts/
|
20
|
-
sdist/
|
21
|
-
var/
|
22
|
-
wheels/
|
23
|
-
share/python-wheels/
|
24
|
-
*.egg-info/
|
25
|
-
.installed.cfg
|
26
|
-
*.egg
|
27
|
-
MANIFEST
|
28
|
-
|
29
|
-
# PyInstaller
|
30
|
-
*.manifest
|
31
|
-
*.spec
|
32
|
-
|
33
|
-
# Installer logs
|
34
|
-
pip-log.txt
|
35
|
-
pip-delete-this-directory.txt
|
36
|
-
|
37
|
-
# Unit test / coverage reports
|
38
|
-
htmlcov/
|
39
|
-
.tox/
|
40
|
-
.nox/
|
41
|
-
.coverage
|
42
|
-
.coverage.*
|
43
|
-
.cache
|
44
|
-
nosetests.xml
|
45
|
-
coverage.xml
|
46
|
-
*.cover
|
47
|
-
*.py,cover
|
48
|
-
.hypothesis/
|
49
|
-
.pytest_cache/
|
50
|
-
cover/
|
51
|
-
|
52
|
-
# Jupyter Notebook
|
53
|
-
.ipynb_checkpoints
|
54
|
-
|
55
|
-
# IPython
|
56
|
-
profile_default/
|
57
|
-
ipython_config.py
|
58
|
-
|
59
|
-
# pyenv
|
60
|
-
.python-version
|
61
|
-
|
62
|
-
# pipenv
|
63
|
-
Pipfile.lock
|
64
|
-
|
65
|
-
# poetry
|
66
|
-
poetry.lock
|
67
|
-
|
68
|
-
# pdm
|
69
|
-
.pdm.toml
|
70
|
-
|
71
|
-
# PEP 582
|
72
|
-
__pypackages__/
|
73
|
-
|
74
|
-
# Celery stuff
|
75
|
-
celerybeat-schedule
|
76
|
-
celerybeat.pid
|
77
|
-
|
78
|
-
# SageMath parsed files
|
79
|
-
*.sage.py
|
80
|
-
|
81
|
-
# Environments
|
82
|
-
.env
|
83
|
-
.venv
|
84
|
-
env/
|
85
|
-
venv/
|
86
|
-
ENV/
|
87
|
-
env.bak/
|
88
|
-
venv.bak/
|
89
|
-
|
90
|
-
# Spyder project settings
|
91
|
-
.spyderproject
|
92
|
-
.spyproject
|
93
|
-
|
94
|
-
# Rope project settings
|
95
|
-
.ropeproject
|
96
|
-
|
97
|
-
# mkdocs documentation
|
98
|
-
/site
|
99
|
-
|
100
|
-
# mypy
|
101
|
-
.mypy_cache/
|
102
|
-
.dmypy.json
|
103
|
-
dmypy.json
|
104
|
-
|
105
|
-
# Pyre type checker
|
106
|
-
.pyre/
|
107
|
-
|
108
|
-
# pytype static type analyzer
|
109
|
-
.pytype/
|
110
|
-
|
111
|
-
# Cython debug symbols
|
112
|
-
cython_debug/
|
113
|
-
|
114
|
-
# PyCharm
|
115
|
-
.idea/
|
116
|
-
|
117
|
-
# VS Code
|
118
|
-
.vscode/
|
119
|
-
|
120
|
-
# macOS
|
121
|
-
.DS_Store
|
122
|
-
.AppleDouble
|
123
|
-
.LSOverride
|
124
|
-
|
125
|
-
# Windows
|
126
|
-
Thumbs.db
|
127
|
-
Thumbs.db:encryptable
|
128
|
-
ehthumbs.db
|
129
|
-
ehthumbs_vista.db
|
130
|
-
*.stackdump
|
131
|
-
[Dd]esktop.ini
|
132
|
-
$RECYCLE.BIN/
|
133
|
-
*.cab
|
134
|
-
*.msi
|
135
|
-
*.msix
|
136
|
-
*.msm
|
137
|
-
*.msp
|
138
|
-
*.lnk
|
139
|
-
|
140
|
-
# Linux
|
141
|
-
*~
|
142
|
-
|
143
|
-
# Temporary files
|
144
|
-
*.tmp
|
145
|
-
*.temp
|
146
|
-
*.log
|
147
|
-
.temp_*/
|
148
|
-
.cache/
|
149
|
-
|
150
|
-
# DEBase specific
|
151
|
-
enzyme_pipeline*.log
|
152
|
-
temp_merged_input.csv
|
153
|
-
*.egg-info/
|
154
|
-
|
155
|
-
# Project data and examples
|
156
|
-
data/
|
157
|
-
examples/
|
158
|
-
!examples/test.csv # Keep test.csv as example output
|
159
|
-
|
160
|
-
# Cache files
|
161
|
-
*.pkl
|
162
|
-
*_cache.pkl
|
163
|
-
|
164
|
-
# Large database files
|
165
|
-
*.db
|
166
|
-
|
167
|
-
# PDFs and Excel files
|
168
|
-
*.pdf
|
169
|
-
*.xlsx
|
170
|
-
|
171
|
-
# Backup files
|
172
|
-
*_backup.py
|
173
|
-
lineage_format_backup.py
|
174
|
-
|
175
|
-
# Temporary directories
|
176
|
-
.temp_*
|
177
|
-
enzyme_analysis_*
|
debase-0.1.0/CONTRIBUTING.md
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
# Contributing to DEBase
|
2
|
-
|
3
|
-
Thank you for your interest in contributing to DEBase!
|
4
|
-
|
5
|
-
## Development Setup
|
6
|
-
|
7
|
-
1. Clone the repository:
|
8
|
-
```bash
|
9
|
-
git clone https://github.com/yourusername/debase.git
|
10
|
-
cd debase
|
11
|
-
```
|
12
|
-
|
13
|
-
2. Create a virtual environment:
|
14
|
-
```bash
|
15
|
-
python -m venv venv
|
16
|
-
source venv/bin/activate # On Windows: venv\Scripts\activate
|
17
|
-
```
|
18
|
-
|
19
|
-
3. Install in development mode:
|
20
|
-
```bash
|
21
|
-
pip install -e ".[dev]"
|
22
|
-
```
|
23
|
-
|
24
|
-
## Running Tests
|
25
|
-
|
26
|
-
```bash
|
27
|
-
pytest tests/
|
28
|
-
```
|
29
|
-
|
30
|
-
## Code Style
|
31
|
-
|
32
|
-
We use Black for code formatting:
|
33
|
-
```bash
|
34
|
-
black src/ tests/
|
35
|
-
```
|
36
|
-
|
37
|
-
And isort for import sorting:
|
38
|
-
```bash
|
39
|
-
isort src/ tests/
|
40
|
-
```
|
41
|
-
|
42
|
-
## Project Structure
|
43
|
-
|
44
|
-
```
|
45
|
-
debase/
|
46
|
-
├── src/debase/ # Main package source code
|
47
|
-
├── tests/ # Test suite
|
48
|
-
├── docs/ # Documentation
|
49
|
-
├── examples/ # Example outputs and usage
|
50
|
-
├── data/ # Research data (PDFs)
|
51
|
-
└── scripts/ # Utility scripts
|
52
|
-
```
|
53
|
-
|
54
|
-
## Submitting Changes
|
55
|
-
|
56
|
-
1. Fork the repository
|
57
|
-
2. Create a feature branch
|
58
|
-
3. Make your changes
|
59
|
-
4. Add tests if applicable
|
60
|
-
5. Run the test suite
|
61
|
-
6. Submit a pull request
|
debase-0.1.0/docs/README.md
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# DEBase Documentation
|
2
|
-
|
3
|
-
This directory contains comprehensive documentation for the DEBase enzyme analysis pipeline.
|
4
|
-
|
5
|
-
## Directory Structure
|
6
|
-
|
7
|
-
- `api/` - API documentation and reference
|
8
|
-
- `examples/` - Usage examples and tutorials
|
9
|
-
- `tutorials/` - Step-by-step guides
|
10
|
-
|
11
|
-
## Quick Start
|
12
|
-
|
13
|
-
See the main [README.md](../README.md) for installation and basic usage.
|
14
|
-
|
15
|
-
## Contents
|
16
|
-
|
17
|
-
1. [Installation Guide](tutorials/installation.md)
|
18
|
-
2. [API Reference](api/README.md)
|
19
|
-
3. [Usage Examples](examples/README.md)
|
@@ -1,24 +0,0 @@
|
|
1
|
-
# DEBase Examples
|
2
|
-
|
3
|
-
This directory contains example outputs and usage demonstrations for the DEBase pipeline.
|
4
|
-
|
5
|
-
## Example Outputs
|
6
|
-
|
7
|
-
The `../../examples/` directory contains sample results from successful pipeline runs:
|
8
|
-
|
9
|
-
- `trpb_complete_pipeline.csv` - Complete TrpB enzyme dataset with sequences, mutations, and reactions
|
10
|
-
- `carbene_complete_pipeline.csv` - Carbene transfer enzyme data with SMILES and conditions
|
11
|
-
- `REFINED_ENZYME_SEQUENCES.csv` - Refined sequence extraction results
|
12
|
-
|
13
|
-
## Data Format
|
14
|
-
|
15
|
-
Each CSV contains:
|
16
|
-
- Full-length protein sequences (200-400+ amino acids)
|
17
|
-
- Complete mutation lineage tracking
|
18
|
-
- Chemical reaction SMILES strings
|
19
|
-
- Experimental conditions and metadata
|
20
|
-
- Performance metrics (yield, TTN, enantioselectivity)
|
21
|
-
|
22
|
-
## Usage
|
23
|
-
|
24
|
-
These files demonstrate the expected output format and can be used as reference for pipeline validation.
|
debase-0.1.0/environment.yml
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
name: debase
|
2
|
-
channels:
|
3
|
-
- conda-forge
|
4
|
-
- defaults
|
5
|
-
dependencies:
|
6
|
-
- python=3.9
|
7
|
-
- pandas>=1.0.0
|
8
|
-
- numpy>=1.19.0
|
9
|
-
- matplotlib>=3.3.0
|
10
|
-
- seaborn>=0.11.0
|
11
|
-
- jupyter>=1.0.0
|
12
|
-
- jupyterlab>=3.0.0
|
13
|
-
- openpyxl>=3.0.0
|
14
|
-
- biopython>=1.78
|
15
|
-
- requests>=2.25.0
|
16
|
-
- tqdm>=4.60.0
|
17
|
-
- rdkit>=2020.03.1
|
18
|
-
- pip
|
19
|
-
- pip:
|
20
|
-
- PyMuPDF>=1.18.0
|
21
|
-
- google-generativeai>=0.3.0
|
debase-0.1.0/src/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
|
@@ -1,100 +0,0 @@
|
|
1
|
-
# DEBase Pipeline Flow
|
2
|
-
|
3
|
-
## Overview
|
4
|
-
The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
|
5
|
-
|
6
|
-
## Pipeline Architecture
|
7
|
-
|
8
|
-
```
|
9
|
-
┌─────────────────────┐ ┌─────────────────────┐
|
10
|
-
│ Manuscript PDF │ │ SI PDF │
|
11
|
-
└──────────┬──────────┘ └──────────┬──────────┘
|
12
|
-
│ │
|
13
|
-
└───────────┬───────────────┘
|
14
|
-
│
|
15
|
-
▼
|
16
|
-
┌─────────────────────────────┐
|
17
|
-
│ 1. enzyme_lineage_extractor │
|
18
|
-
│ - Extract enzyme variants │
|
19
|
-
│ - Parse mutations │
|
20
|
-
│ - Get basic metadata │
|
21
|
-
└─────────────┬───────────────┘
|
22
|
-
│
|
23
|
-
▼
|
24
|
-
┌─────────────────────────────┐
|
25
|
-
│ 2. cleanup_sequence │
|
26
|
-
│ - Validate sequences │
|
27
|
-
│ - Fix formatting issues │
|
28
|
-
│ - Generate full sequences │
|
29
|
-
└─────────────┬───────────────┘
|
30
|
-
│
|
31
|
-
┌───────────┴───────────────┐
|
32
|
-
│ │
|
33
|
-
▼ ▼
|
34
|
-
┌─────────────────────────┐ ┌─────────────────────────┐
|
35
|
-
│ 3a. reaction_info │ │ 3b. substrate_scope │
|
36
|
-
│ _extractor │ │ _extractor │
|
37
|
-
│ - Performance metrics │ │ - Substrate variations │
|
38
|
-
│ - Model reaction │ │ - Additional variants │
|
39
|
-
│ - Conditions │ │ - Scope data │
|
40
|
-
└───────────┬─────────────┘ └───────────┬─────────────┘
|
41
|
-
│ │
|
42
|
-
└───────────┬───────────────┘
|
43
|
-
│
|
44
|
-
▼
|
45
|
-
┌─────────────────────────────┐
|
46
|
-
│ 4. lineage_format │
|
47
|
-
│ - Merge all data │
|
48
|
-
│ - Fill missing sequences │
|
49
|
-
│ - Format final output │
|
50
|
-
└─────────────┬───────────────┘
|
51
|
-
│
|
52
|
-
▼
|
53
|
-
┌─────────────┐
|
54
|
-
│ Final CSV │
|
55
|
-
└─────────────┘
|
56
|
-
```
|
57
|
-
|
58
|
-
## Module Details
|
59
|
-
|
60
|
-
### 1. enzyme_lineage_extractor.py
|
61
|
-
- **Input**: Manuscript PDF, SI PDF
|
62
|
-
- **Output**: CSV with enzyme variants and mutations
|
63
|
-
- **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
|
64
|
-
|
65
|
-
### 2. cleanup_sequence.py
|
66
|
-
- **Input**: Enzyme lineage CSV
|
67
|
-
- **Output**: CSV with validated sequences
|
68
|
-
- **Function**: Validates protein sequences, generates full sequences from mutations
|
69
|
-
|
70
|
-
### 3a. reaction_info_extractor.py
|
71
|
-
- **Input**: PDFs + cleaned enzyme CSV
|
72
|
-
- **Output**: CSV with reaction performance data
|
73
|
-
- **Function**: Extracts yield, TTN, selectivity, reaction conditions
|
74
|
-
|
75
|
-
### 3b. substrate_scope_extractor.py
|
76
|
-
- **Input**: PDFs + cleaned enzyme CSV
|
77
|
-
- **Output**: CSV with substrate scope entries
|
78
|
-
- **Function**: Extracts substrate variations tested with different enzymes
|
79
|
-
|
80
|
-
### 4. lineage_format.py
|
81
|
-
- **Input**: Reaction CSV + Substrate scope CSV
|
82
|
-
- **Output**: Final formatted CSV
|
83
|
-
- **Function**: Merges data, fills missing sequences, applies consistent formatting
|
84
|
-
|
85
|
-
## Key Features
|
86
|
-
|
87
|
-
1. **Modular Design**: Each step can be run independently
|
88
|
-
2. **Parallel Extraction**: Steps 3a and 3b run independently
|
89
|
-
3. **Error Recovery**: Pipeline can resume from any step
|
90
|
-
4. **Clean Interfaces**: Each module has well-defined inputs/outputs
|
91
|
-
|
92
|
-
## Usage
|
93
|
-
|
94
|
-
```bash
|
95
|
-
# Full pipeline
|
96
|
-
python -m debase.wrapper manuscript.pdf --si si.pdf --output results.csv
|
97
|
-
|
98
|
-
# With intermediate files kept for debugging
|
99
|
-
python -m debase.wrapper manuscript.pdf --si si.pdf --keep-intermediates
|
100
|
-
```
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|