debase 0.1.16__tar.gz → 0.1.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. debase-0.1.17/.gitignore +177 -0
  2. debase-0.1.17/CONTRIBUTING.md +61 -0
  3. {debase-0.1.16/src/debase.egg-info → debase-0.1.17}/PKG-INFO +1 -1
  4. debase-0.1.17/docs/README.md +19 -0
  5. debase-0.1.17/docs/examples/README.md +24 -0
  6. debase-0.1.17/environment.yml +21 -0
  7. debase-0.1.17/src/debase/PIPELINE_FLOW.md +100 -0
  8. {debase-0.1.16 → debase-0.1.17}/src/debase/_version.py +1 -1
  9. {debase-0.1.16 → debase-0.1.17}/src/debase/enzyme_lineage_extractor.py +251 -13
  10. {debase-0.1.16 → debase-0.1.17}/src/debase/lineage_format.py +113 -11
  11. {debase-0.1.16 → debase-0.1.17}/src/debase/reaction_info_extractor.py +18 -4
  12. debase-0.1.17/src/debase/wrapper.py +535 -0
  13. {debase-0.1.16 → debase-0.1.17/src/debase.egg-info}/PKG-INFO +1 -1
  14. {debase-0.1.16 → debase-0.1.17}/src/debase.egg-info/SOURCES.txt +7 -0
  15. debase-0.1.17/src/debase.egg-info/dependency_links.txt +1 -0
  16. debase-0.1.16/src/debase/wrapper.py +0 -301
  17. {debase-0.1.16 → debase-0.1.17}/LICENSE +0 -0
  18. {debase-0.1.16 → debase-0.1.17}/MANIFEST.in +0 -0
  19. {debase-0.1.16 → debase-0.1.17}/README.md +0 -0
  20. {debase-0.1.16 → debase-0.1.17}/pyproject.toml +0 -0
  21. {debase-0.1.16 → debase-0.1.17}/setup.cfg +0 -0
  22. {debase-0.1.16 → debase-0.1.17}/setup.py +0 -0
  23. /debase-0.1.16/src/debase.egg-info/dependency_links.txt → /debase-0.1.17/src/__init__.py +0 -0
  24. {debase-0.1.16 → debase-0.1.17}/src/debase/__init__.py +0 -0
  25. {debase-0.1.16 → debase-0.1.17}/src/debase/__main__.py +0 -0
  26. {debase-0.1.16 → debase-0.1.17}/src/debase/build_db.py +0 -0
  27. {debase-0.1.16 → debase-0.1.17}/src/debase/cleanup_sequence.py +0 -0
  28. {debase-0.1.16 → debase-0.1.17}/src/debase/substrate_scope_extractor.py +0 -0
  29. {debase-0.1.16 → debase-0.1.17}/src/debase.egg-info/entry_points.txt +0 -0
  30. {debase-0.1.16 → debase-0.1.17}/src/debase.egg-info/requires.txt +0 -0
  31. {debase-0.1.16 → debase-0.1.17}/src/debase.egg-info/top_level.txt +0 -0
@@ -0,0 +1,177 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Jupyter Notebook
53
+ .ipynb_checkpoints
54
+
55
+ # IPython
56
+ profile_default/
57
+ ipython_config.py
58
+
59
+ # pyenv
60
+ .python-version
61
+
62
+ # pipenv
63
+ Pipfile.lock
64
+
65
+ # poetry
66
+ poetry.lock
67
+
68
+ # pdm
69
+ .pdm.toml
70
+
71
+ # PEP 582
72
+ __pypackages__/
73
+
74
+ # Celery stuff
75
+ celerybeat-schedule
76
+ celerybeat.pid
77
+
78
+ # SageMath parsed files
79
+ *.sage.py
80
+
81
+ # Environments
82
+ .env
83
+ .venv
84
+ env/
85
+ venv/
86
+ ENV/
87
+ env.bak/
88
+ venv.bak/
89
+
90
+ # Spyder project settings
91
+ .spyderproject
92
+ .spyproject
93
+
94
+ # Rope project settings
95
+ .ropeproject
96
+
97
+ # mkdocs documentation
98
+ /site
99
+
100
+ # mypy
101
+ .mypy_cache/
102
+ .dmypy.json
103
+ dmypy.json
104
+
105
+ # Pyre type checker
106
+ .pyre/
107
+
108
+ # pytype static type analyzer
109
+ .pytype/
110
+
111
+ # Cython debug symbols
112
+ cython_debug/
113
+
114
+ # PyCharm
115
+ .idea/
116
+
117
+ # VS Code
118
+ .vscode/
119
+
120
+ # macOS
121
+ .DS_Store
122
+ .AppleDouble
123
+ .LSOverride
124
+
125
+ # Windows
126
+ Thumbs.db
127
+ Thumbs.db:encryptable
128
+ ehthumbs.db
129
+ ehthumbs_vista.db
130
+ *.stackdump
131
+ [Dd]esktop.ini
132
+ $RECYCLE.BIN/
133
+ *.cab
134
+ *.msi
135
+ *.msix
136
+ *.msm
137
+ *.msp
138
+ *.lnk
139
+
140
+ # Linux
141
+ *~
142
+
143
+ # Temporary files
144
+ *.tmp
145
+ *.temp
146
+ *.log
147
+ .temp_*/
148
+ .cache/
149
+
150
+ # DEBase specific
151
+ enzyme_pipeline*.log
152
+ temp_merged_input.csv
153
+ *.egg-info/
154
+
155
+ # Project data and examples (examples/test.csv is kept as example output)
156
+ data/
157
+ examples/*
158
+ !examples/test.csv
159
+
160
+ # Cache files
161
+ *.pkl
162
+ *_cache.pkl
163
+
164
+ # Large database files
165
+ *.db
166
+
167
+ # PDFs and Excel files
168
+ *.pdf
169
+ *.xlsx
170
+
171
+ # Backup files
172
+ *_backup.py
173
+ lineage_format_backup.py
174
+
175
+ # Temporary directories
176
+ .temp_*
177
+ enzyme_analysis_*
@@ -0,0 +1,61 @@
1
+ # Contributing to DEBase
2
+
3
+ Thank you for your interest in contributing to DEBase!
4
+
5
+ ## Development Setup
6
+
7
+ 1. Clone the repository:
8
+ ```bash
9
+ git clone https://github.com/yourusername/debase.git
10
+ cd debase
11
+ ```
12
+
13
+ 2. Create a virtual environment:
14
+ ```bash
15
+ python -m venv venv
16
+ source venv/bin/activate # On Windows: venv\Scripts\activate
17
+ ```
18
+
19
+ 3. Install in development mode:
20
+ ```bash
21
+ pip install -e ".[dev]"
22
+ ```
23
+
24
+ ## Running Tests
25
+
26
+ ```bash
27
+ pytest tests/
28
+ ```
29
+
30
+ ## Code Style
31
+
32
+ We use Black for code formatting:
33
+ ```bash
34
+ black src/ tests/
35
+ ```
36
+
37
+ And isort for import sorting:
38
+ ```bash
39
+ isort src/ tests/
40
+ ```
41
+
42
+ ## Project Structure
43
+
44
+ ```
45
+ debase/
46
+ ├── src/debase/ # Main package source code
47
+ ├── tests/ # Test suite
48
+ ├── docs/ # Documentation
49
+ ├── examples/ # Example outputs and usage
50
+ ├── data/ # Research data (PDFs)
51
+ └── scripts/ # Utility scripts
52
+ ```
53
+
54
+ ## Submitting Changes
55
+
56
+ 1. Fork the repository
57
+ 2. Create a feature branch
58
+ 3. Make your changes
59
+ 4. Add tests if applicable
60
+ 5. Run the test suite
61
+ 6. Submit a pull request
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.16
3
+ Version: 0.1.17
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,19 @@
1
+ # DEBase Documentation
2
+
3
+ This directory contains comprehensive documentation for the DEBase enzyme analysis pipeline.
4
+
5
+ ## Directory Structure
6
+
7
+ - `api/` - API documentation and reference
8
+ - `examples/` - Usage examples and tutorials
9
+ - `tutorials/` - Step-by-step guides
10
+
11
+ ## Quick Start
12
+
13
+ See the main [README.md](../README.md) for installation and basic usage.
14
+
15
+ ## Contents
16
+
17
+ 1. [Installation Guide](tutorials/installation.md)
18
+ 2. [API Reference](api/README.md)
19
+ 3. [Usage Examples](examples/README.md)
@@ -0,0 +1,24 @@
1
+ # DEBase Examples
2
+
3
+ This directory contains example outputs and usage demonstrations for the DEBase pipeline.
4
+
5
+ ## Example Outputs
6
+
7
+ The `../../examples/` directory contains sample results from successful pipeline runs:
8
+
9
+ - `trpb_complete_pipeline.csv` - Complete TrpB enzyme dataset with sequences, mutations, and reactions
10
+ - `carbene_complete_pipeline.csv` - Carbene transfer enzyme data with SMILES and conditions
11
+ - `REFINED_ENZYME_SEQUENCES.csv` - Refined sequence extraction results
12
+
13
+ ## Data Format
14
+
15
+ Each CSV contains:
16
+ - Full-length protein sequences (200-400+ amino acids)
17
+ - Complete mutation lineage tracking
18
+ - Chemical reaction SMILES strings
19
+ - Experimental conditions and metadata
20
+ - Performance metrics (yield, TTN, enantioselectivity)
21
+
22
+ ## Usage
23
+
24
+ These files demonstrate the expected output format and can be used as reference for pipeline validation.
@@ -0,0 +1,21 @@
1
+ name: debase
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - python=3.9
7
+ - pandas>=1.0.0
8
+ - numpy>=1.19.0
9
+ - matplotlib>=3.3.0
10
+ - seaborn>=0.11.0
11
+ - jupyter>=1.0.0
12
+ - jupyterlab>=3.0.0
13
+ - openpyxl>=3.0.0
14
+ - biopython>=1.78
15
+ - requests>=2.25.0
16
+ - tqdm>=4.60.0
17
+ - rdkit>=2020.03.1
18
+ - pip
19
+ - pip:
20
+ - PyMuPDF>=1.18.0
21
+ - google-generativeai>=0.3.0
@@ -0,0 +1,100 @@
1
+ # DEBase Pipeline Flow
2
+
3
+ ## Overview
4
+ The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
5
+
6
+ ## Pipeline Architecture
7
+
8
+ ```
9
+ ┌─────────────────────┐ ┌─────────────────────┐
10
+ │ Manuscript PDF │ │ SI PDF │
11
+ └──────────┬──────────┘ └──────────┬──────────┘
12
+ │ │
13
+ └───────────┬───────────────┘
14
+
15
+
16
+ ┌─────────────────────────────┐
17
+ │ 1. enzyme_lineage_extractor │
18
+ │ - Extract enzyme variants │
19
+ │ - Parse mutations │
20
+ │ - Get basic metadata │
21
+ └─────────────┬───────────────┘
22
+
23
+
24
+ ┌─────────────────────────────┐
25
+ │ 2. cleanup_sequence │
26
+ │ - Validate sequences │
27
+ │ - Fix formatting issues │
28
+ │ - Generate full sequences │
29
+ └─────────────┬───────────────┘
30
+
31
+ ┌───────────┴───────────────┐
32
+ │ │
33
+ ▼ ▼
34
+ ┌─────────────────────────┐ ┌─────────────────────────┐
35
+ │ 3a. reaction_info │ │ 3b. substrate_scope │
36
+ │ _extractor │ │ _extractor │
37
+ │ - Performance metrics │ │ - Substrate variations │
38
+ │ - Model reaction │ │ - Additional variants │
39
+ │ - Conditions │ │ - Scope data │
40
+ └───────────┬─────────────┘ └───────────┬─────────────┘
41
+ │ │
42
+ └───────────┬───────────────┘
43
+
44
+
45
+ ┌─────────────────────────────┐
46
+ │ 4. lineage_format │
47
+ │ - Merge all data │
48
+ │ - Fill missing sequences │
49
+ │ - Format final output │
50
+ └─────────────┬───────────────┘
51
+
52
+
53
+ ┌─────────────┐
54
+ │ Final CSV │
55
+ └─────────────┘
56
+ ```
57
+
58
+ ## Module Details
59
+
60
+ ### 1. enzyme_lineage_extractor.py
61
+ - **Input**: Manuscript PDF, SI PDF
62
+ - **Output**: CSV with enzyme variants and mutations
63
+ - **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
64
+
65
+ ### 2. cleanup_sequence.py
66
+ - **Input**: Enzyme lineage CSV
67
+ - **Output**: CSV with validated sequences
68
+ - **Function**: Validates protein sequences, generates full sequences from mutations
69
+
70
+ ### 3a. reaction_info_extractor.py
71
+ - **Input**: PDFs + cleaned enzyme CSV
72
+ - **Output**: CSV with reaction performance data
73
+ - **Function**: Extracts yield, TTN, selectivity, reaction conditions
74
+
75
+ ### 3b. substrate_scope_extractor.py
76
+ - **Input**: PDFs + cleaned enzyme CSV
77
+ - **Output**: CSV with substrate scope entries
78
+ - **Function**: Extracts substrate variations tested with different enzymes
79
+
80
+ ### 4. lineage_format_o3.py
81
+ - **Input**: Reaction CSV + Substrate scope CSV
82
+ - **Output**: Final formatted CSV
83
+ - **Function**: Merges data, fills missing sequences, applies consistent formatting
84
+
85
+ ## Key Features
86
+
87
+ 1. **Modular Design**: Each step can be run independently
88
+ 2. **Parallel Extraction**: Steps 3a and 3b run independently
89
+ 3. **Error Recovery**: Pipeline can resume from any step
90
+ 4. **Clean Interfaces**: Each module has well-defined inputs/outputs
91
+
92
+ ## Usage
93
+
94
+ ```bash
95
+ # Full pipeline
96
+ python -m debase.wrapper manuscript.pdf --si si.pdf --output results.csv
97
+
98
+ # With intermediate files kept for debugging
99
+ python -m debase.wrapper manuscript.pdf --si si.pdf --keep-intermediates
100
+ ```
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.16"
3
+ __version__ = "0.1.17"