debase 0.4.3__tar.gz → 0.4.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. debase-0.4.5/PKG-INFO +121 -0
  2. debase-0.4.5/README.md +64 -0
  3. {debase-0.4.3 → debase-0.4.5}/src/debase/_version.py +1 -1
  4. {debase-0.4.3 → debase-0.4.5}/src/debase/cleanup_sequence.py +512 -33
  5. {debase-0.4.3 → debase-0.4.5}/src/debase/enzyme_lineage_extractor.py +985 -100
  6. {debase-0.4.3 → debase-0.4.5}/src/debase/lineage_format.py +226 -13
  7. {debase-0.4.3 → debase-0.4.5}/src/debase/reaction_info_extractor.py +178 -34
  8. {debase-0.4.3 → debase-0.4.5}/src/debase/substrate_scope_extractor.py +52 -4
  9. {debase-0.4.3 → debase-0.4.5}/src/debase/wrapper.py +155 -151
  10. debase-0.4.5/src/debase.egg-info/PKG-INFO +121 -0
  11. debase-0.4.3/PKG-INFO +0 -296
  12. debase-0.4.3/README.md +0 -239
  13. debase-0.4.3/src/debase.egg-info/PKG-INFO +0 -296
  14. {debase-0.4.3 → debase-0.4.5}/.gitignore +0 -0
  15. {debase-0.4.3 → debase-0.4.5}/LICENSE +0 -0
  16. {debase-0.4.3 → debase-0.4.5}/MANIFEST.in +0 -0
  17. {debase-0.4.3 → debase-0.4.5}/environment.yml +0 -0
  18. {debase-0.4.3 → debase-0.4.5}/pyproject.toml +0 -0
  19. {debase-0.4.3 → debase-0.4.5}/setup.cfg +0 -0
  20. {debase-0.4.3 → debase-0.4.5}/setup.py +0 -0
  21. {debase-0.4.3 → debase-0.4.5}/src/__init__.py +0 -0
  22. {debase-0.4.3 → debase-0.4.5}/src/debase/__init__.py +0 -0
  23. {debase-0.4.3 → debase-0.4.5}/src/debase/__main__.py +0 -0
  24. {debase-0.4.3 → debase-0.4.5}/src/debase/build_db.py +0 -0
  25. {debase-0.4.3 → debase-0.4.5}/src/debase.egg-info/SOURCES.txt +0 -0
  26. {debase-0.4.3 → debase-0.4.5}/src/debase.egg-info/dependency_links.txt +0 -0
  27. {debase-0.4.3 → debase-0.4.5}/src/debase.egg-info/entry_points.txt +0 -0
  28. {debase-0.4.3 → debase-0.4.5}/src/debase.egg-info/requires.txt +0 -0
  29. {debase-0.4.3 → debase-0.4.5}/src/debase.egg-info/top_level.txt +0 -0
debase-0.4.5/PKG-INFO ADDED
@@ -0,0 +1,121 @@
1
+ Metadata-Version: 2.4
2
+ Name: debase
3
+ Version: 0.4.5
4
+ Summary: Enzyme lineage analysis and sequence extraction package
5
+ Home-page: https://github.com/YuemingLong/DEBase
6
+ Author: DEBase Team
7
+ Author-email: DEBase Team <ylong@caltech.edu>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/YuemingLong/DEBase
10
+ Project-URL: Documentation, https://github.com/YuemingLong/DEBase#readme
11
+ Project-URL: Repository, https://github.com/YuemingLong/DEBase
12
+ Project-URL: Issues, https://github.com/YuemingLong/DEBase/issues
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
24
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: pandas>=1.0.0
29
+ Requires-Dist: PyMuPDF>=1.18.0
30
+ Requires-Dist: numpy>=1.19.0
31
+ Requires-Dist: google-generativeai>=0.3.0
32
+ Requires-Dist: biopython>=1.78
33
+ Requires-Dist: requests>=2.25.0
34
+ Requires-Dist: httpx>=0.24.0
35
+ Requires-Dist: tqdm>=4.60.0
36
+ Requires-Dist: openpyxl>=3.0.0
37
+ Requires-Dist: PyPDF2>=2.0.0
38
+ Requires-Dist: Pillow>=8.0.0
39
+ Requires-Dist: networkx>=2.5
40
+ Provides-Extra: rdkit
41
+ Requires-Dist: rdkit>=2020.03.1; extra == "rdkit"
42
+ Provides-Extra: dev
43
+ Requires-Dist: pytest>=6.0; extra == "dev"
44
+ Requires-Dist: pytest-cov; extra == "dev"
45
+ Requires-Dist: black; extra == "dev"
46
+ Requires-Dist: isort; extra == "dev"
47
+ Requires-Dist: flake8; extra == "dev"
48
+ Requires-Dist: mypy; extra == "dev"
49
+ Provides-Extra: docs
50
+ Requires-Dist: sphinx>=4.0; extra == "docs"
51
+ Requires-Dist: sphinx-rtd-theme; extra == "docs"
52
+ Requires-Dist: myst-parser; extra == "docs"
53
+ Dynamic: author
54
+ Dynamic: home-page
55
+ Dynamic: license-file
56
+ Dynamic: requires-python
57
+
58
+ # DEBase
59
+
60
+ DEBase is a Python package for extracting and analyzing enzyme lineage data from scientific papers using AI-powered parsing.
61
+
62
+ ## Features
63
+
64
+ - Extract enzyme variant lineages from PDF documents
65
+ - Parse protein and DNA sequences with mutation annotations
66
+ - Extract reaction performance metrics (yield, TTN, ee)
67
+ - Extract and organize substrate scope data
68
+ - Match enzyme variants across different data sources using AI
69
+ - Generate structured CSV outputs for downstream analysis
70
+
71
+ ## Installation
72
+
73
+ ```bash
74
+ pip install debase
75
+ ```
76
+
77
+ ## Quick Start
78
+
79
+ ```bash
80
+ # Run the complete pipeline
81
+ debase --manuscript paper.pdf --si supplementary.pdf --output results.csv
82
+
83
+ # Enable debug mode to save Gemini prompts and responses
84
+ debase --manuscript paper.pdf --si supplementary.pdf --output results.csv --debug-dir ./debug_output
85
+
86
+ # Individual components with debugging
87
+ python -m debase.enzyme_lineage_extractor --manuscript paper.pdf --output lineage.csv --debug-dir ./debug_output
88
+ python -m debase.reaction_info_extractor --manuscript paper.pdf --lineage-csv lineage.csv --output reactions.csv --debug-dir ./debug_output
89
+ python -m debase.substrate_scope_extractor --manuscript paper.pdf --lineage-csv lineage.csv --output substrate_scope.csv --debug-dir ./debug_output
90
+ python -m debase.lineage_format -r reactions.csv -s substrate_scope.csv -o final.csv -v
91
+ ```
92
+
93
+ ## Debugging
94
+
95
+ Use the `--debug-dir` flag to save all Gemini API prompts and responses for debugging:
96
+ - Location extraction prompts
97
+ - Sequence extraction prompts (can be very large, up to 150K characters)
98
+ - Enzyme matching prompts
99
+ - All API responses with timestamps
100
+ - Note: lineage_format.py uses `-v` for verbose output instead of `--debug-dir`
101
+
102
+ ## Requirements
103
+
104
+ - Python 3.8+
105
+ - Google Gemini API key (set as GEMINI_API_KEY environment variable)
106
+
107
+ ## Version
108
+
109
+ 0.4.5
110
+
111
+ ## License
112
+
113
+ MIT License
114
+
115
+ ## Authors
116
+
117
+ DEBase Team - Caltech
118
+
119
+ ## Contact
120
+
121
+ ylong@caltech.edu
debase-0.4.5/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # DEBase
2
+
3
+ DEBase is a Python package for extracting and analyzing enzyme lineage data from scientific papers using AI-powered parsing.
4
+
5
+ ## Features
6
+
7
+ - Extract enzyme variant lineages from PDF documents
8
+ - Parse protein and DNA sequences with mutation annotations
9
+ - Extract reaction performance metrics (yield, TTN, ee)
10
+ - Extract and organize substrate scope data
11
+ - Match enzyme variants across different data sources using AI
12
+ - Generate structured CSV outputs for downstream analysis
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install debase
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ```bash
23
+ # Run the complete pipeline
24
+ debase --manuscript paper.pdf --si supplementary.pdf --output results.csv
25
+
26
+ # Enable debug mode to save Gemini prompts and responses
27
+ debase --manuscript paper.pdf --si supplementary.pdf --output results.csv --debug-dir ./debug_output
28
+
29
+ # Individual components with debugging
30
+ python -m debase.enzyme_lineage_extractor --manuscript paper.pdf --output lineage.csv --debug-dir ./debug_output
31
+ python -m debase.reaction_info_extractor --manuscript paper.pdf --lineage-csv lineage.csv --output reactions.csv --debug-dir ./debug_output
32
+ python -m debase.substrate_scope_extractor --manuscript paper.pdf --lineage-csv lineage.csv --output substrate_scope.csv --debug-dir ./debug_output
33
+ python -m debase.lineage_format -r reactions.csv -s substrate_scope.csv -o final.csv -v
34
+ ```
35
+
36
+ ## Debugging
37
+
38
+ Use the `--debug-dir` flag to save all Gemini API prompts and responses for debugging:
39
+ - Location extraction prompts
40
+ - Sequence extraction prompts (can be very large, up to 150K characters)
41
+ - Enzyme matching prompts
42
+ - All API responses with timestamps
43
+ - Note: lineage_format.py uses `-v` for verbose output instead of `--debug-dir`
44
+
45
+ ## Requirements
46
+
47
+ - Python 3.8+
48
+ - Google Gemini API key (set as GEMINI_API_KEY environment variable)
49
+
50
+ ## Version
51
+
52
+ 0.4.5
53
+
54
+ ## License
55
+
56
+ MIT License
57
+
58
+ ## Authors
59
+
60
+ DEBase Team - Caltech
61
+
62
+ ## Contact
63
+
64
+ ylong@caltech.edu
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.4.3"
3
+ __version__ = "0.4.5"