debase 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. debase-0.4.2/.gitignore +177 -0
  2. {debase-0.4.0/src/debase.egg-info → debase-0.4.2}/PKG-INFO +1 -1
  3. debase-0.4.2/environment.yml +21 -0
  4. {debase-0.4.0 → debase-0.4.2}/src/debase/_version.py +1 -1
  5. {debase-0.4.0 → debase-0.4.2}/src/debase/enzyme_lineage_extractor.py +14 -8
  6. {debase-0.4.0 → debase-0.4.2}/src/debase/lineage_format.py +335 -56
  7. {debase-0.4.0 → debase-0.4.2}/src/debase/reaction_info_extractor.py +60 -32
  8. {debase-0.4.0 → debase-0.4.2}/src/debase/substrate_scope_extractor.py +373 -140
  9. {debase-0.4.0 → debase-0.4.2}/src/debase/wrapper.py +37 -11
  10. {debase-0.4.0 → debase-0.4.2/src/debase.egg-info}/PKG-INFO +1 -1
  11. {debase-0.4.0 → debase-0.4.2}/src/debase.egg-info/SOURCES.txt +3 -0
  12. debase-0.4.2/src/debase.egg-info/dependency_links.txt +1 -0
  13. {debase-0.4.0 → debase-0.4.2}/LICENSE +0 -0
  14. {debase-0.4.0 → debase-0.4.2}/MANIFEST.in +0 -0
  15. {debase-0.4.0 → debase-0.4.2}/README.md +0 -0
  16. {debase-0.4.0 → debase-0.4.2}/pyproject.toml +0 -0
  17. {debase-0.4.0 → debase-0.4.2}/setup.cfg +0 -0
  18. {debase-0.4.0 → debase-0.4.2}/setup.py +0 -0
  19. /debase-0.4.0/src/debase.egg-info/dependency_links.txt → /debase-0.4.2/src/__init__.py +0 -0
  20. {debase-0.4.0 → debase-0.4.2}/src/debase/__init__.py +0 -0
  21. {debase-0.4.0 → debase-0.4.2}/src/debase/__main__.py +0 -0
  22. {debase-0.4.0 → debase-0.4.2}/src/debase/build_db.py +0 -0
  23. {debase-0.4.0 → debase-0.4.2}/src/debase/cleanup_sequence.py +0 -0
  24. {debase-0.4.0 → debase-0.4.2}/src/debase.egg-info/entry_points.txt +0 -0
  25. {debase-0.4.0 → debase-0.4.2}/src/debase.egg-info/requires.txt +0 -0
  26. {debase-0.4.0 → debase-0.4.2}/src/debase.egg-info/top_level.txt +0 -0
@@ -0,0 +1,177 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Jupyter Notebook
53
+ .ipynb_checkpoints
54
+
55
+ # IPython
56
+ profile_default/
57
+ ipython_config.py
58
+
59
+ # pyenv
60
+ .python-version
61
+
62
+ # pipenv
63
+ Pipfile.lock
64
+
65
+ # poetry
66
+ poetry.lock
67
+
68
+ # pdm
69
+ .pdm.toml
70
+
71
+ # PEP 582
72
+ __pypackages__/
73
+
74
+ # Celery stuff
75
+ celerybeat-schedule
76
+ celerybeat.pid
77
+
78
+ # SageMath parsed files
79
+ *.sage.py
80
+
81
+ # Environments
82
+ .env
83
+ .venv
84
+ env/
85
+ venv/
86
+ ENV/
87
+ env.bak/
88
+ venv.bak/
89
+
90
+ # Spyder project settings
91
+ .spyderproject
92
+ .spyproject
93
+
94
+ # Rope project settings
95
+ .ropeproject
96
+
97
+ # mkdocs documentation
98
+ /site
99
+
100
+ # mypy
101
+ .mypy_cache/
102
+ .dmypy.json
103
+ dmypy.json
104
+
105
+ # Pyre type checker
106
+ .pyre/
107
+
108
+ # pytype static type analyzer
109
+ .pytype/
110
+
111
+ # Cython debug symbols
112
+ cython_debug/
113
+
114
+ # PyCharm
115
+ .idea/
116
+
117
+ # VS Code
118
+ .vscode/
119
+
120
+ # macOS
121
+ .DS_Store
122
+ .AppleDouble
123
+ .LSOverride
124
+
125
+ # Windows
126
+ Thumbs.db
127
+ Thumbs.db:encryptable
128
+ ehthumbs.db
129
+ ehthumbs_vista.db
130
+ *.stackdump
131
+ [Dd]esktop.ini
132
+ $RECYCLE.BIN/
133
+ *.cab
134
+ *.msi
135
+ *.msix
136
+ *.msm
137
+ *.msp
138
+ *.lnk
139
+
140
+ # Linux
141
+ *~
142
+
143
+ # Temporary files
144
+ *.tmp
145
+ *.temp
146
+ *.log
147
+ .temp_*/
148
+ .cache/
149
+
150
+ # DEBase specific
151
+ enzyme_pipeline*.log
152
+ temp_merged_input.csv
153
+ *.egg-info/
154
+
155
+ # Project data and examples (examples/test.csv is kept as example output)
156
+ data/
157
+ examples/*
158
+ !examples/test.csv
159
+
160
+ # Cache files
161
+ *.pkl
162
+ *_cache.pkl
163
+
164
+ # Large database files
165
+ *.db
166
+
167
+ # PDFs and Excel files
168
+ *.pdf
169
+ *.xlsx
170
+
171
+ # Backup files
172
+ *_backup.py
173
+ lineage_format_backup.py
174
+
175
+ # Temporary directories
176
+ .temp_*
177
+ enzyme_analysis_*
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,21 @@
1
+ name: debase
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - python=3.9
7
+ - pandas>=1.0.0
8
+ - numpy>=1.19.0
9
+ - matplotlib>=3.3.0
10
+ - seaborn>=0.11.0
11
+ - jupyter>=1.0.0
12
+ - jupyterlab>=3.0.0
13
+ - openpyxl>=3.0.0
14
+ - biopython>=1.78
15
+ - requests>=2.25.0
16
+ - tqdm>=4.60.0
17
+ - rdkit>=2020.03.1
18
+ - pip
19
+ - pip:
20
+ - PyMuPDF>=1.18.0
21
+ - google-generativeai>=0.3.0
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.4.0"
3
+ __version__ = "0.4.2"
@@ -645,11 +645,13 @@ find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
645
645
  came from which parent and what mutations were introduced).
646
646
 
647
647
  Respond with a JSON array of objects, each containing:
648
- - "location": the identifier (e.g. "Table S1", "Figure 2B", "p. 6")
648
+ - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
649
649
  - "type": one of "table", "figure", "text", "section"
650
650
  - "confidence": your confidence score (0-100) that this location contains lineage data
651
651
  - "reason": brief explanation of why this location likely contains lineage
652
652
 
653
+ IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
654
+
653
655
  Order by confidence score (highest first). Tables showing complete variant lineages or
654
656
  mutation lists should be ranked higher than figures showing complete variant lineages.
655
657
  Text sections are used when no suitable tables/figures exist.
@@ -747,7 +749,7 @@ def identify_campaigns(
747
749
  debug_dir: str | Path | None = None,
748
750
  ) -> List[Campaign]:
749
751
  """Identify distinct directed evolution campaigns in the manuscript."""
750
- prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text[:30_000])
752
+ prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text)
751
753
  campaigns_data: List[dict] = []
752
754
  try:
753
755
  campaigns_data = generate_json_with_retry(
@@ -825,7 +827,7 @@ def identify_evolution_locations(
825
827
 
826
828
  # Include TOC before the main text
827
829
  combined_text = toc_text + text if toc_text else text
828
- prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
830
+ prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
829
831
  locs: List[dict] = []
830
832
  try:
831
833
  locs = generate_json_with_retry(
@@ -1306,7 +1308,7 @@ def get_lineage(
1306
1308
  5. Return both variants and campaigns.
1307
1309
  """
1308
1310
  # First, identify campaigns in the manuscript
1309
- campaigns = identify_campaigns(full_text[:50_000], model, debug_dir=debug_dir)
1311
+ campaigns = identify_campaigns(full_text, model, debug_dir=debug_dir)
1310
1312
 
1311
1313
  if campaigns:
1312
1314
  log.info(f"Identified {len(campaigns)} distinct campaigns")
@@ -1364,7 +1366,7 @@ def get_lineage(
1364
1366
  context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
1365
1367
  locations_with_context.append({
1366
1368
  'location': loc,
1367
- 'context': context_text[:1000] # First 1000 chars of extracted context
1369
+ 'context': context_text # Full extracted context
1368
1370
  })
1369
1371
 
1370
1372
  # For each campaign, ask Gemini to select the best location
@@ -1554,13 +1556,17 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
1554
1556
  Look for table of contents entries or section listings that mention sequences.
1555
1557
  Return a JSON array where each element has:
1556
1558
  - "section": the section heading or description
1557
- - "page": the page number shown in the table of contents for this section, to your best judgement.
1559
+ - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
1558
1560
 
1559
1561
  Focus on:
1560
1562
  - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
1561
- - Return the EXACT notation as shown.
1563
+ - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
1562
1564
  - Prioritize sections that mention "protein" or "amino acid" sequences
1563
1565
 
1566
+ CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
1567
+ - Correct: "53", "S12", "147"
1568
+ - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
1569
+
1564
1570
  Return [] if no sequence sections are found.
1565
1571
  Absolutely don't include nucleotides or primer sequences; it is better to return nothing than an incomplete sequence. Use your best judgement.
1566
1572
 
@@ -1572,7 +1578,7 @@ TEXT (truncated):
1572
1578
 
1573
1579
  def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None = None) -> list[dict]:
1574
1580
  """Ask Gemini for promising places to look for sequences."""
1575
- prompt = _SEQ_LOC_PROMPT.format(chunk=text[:15_000])
1581
+ prompt = _SEQ_LOC_PROMPT.format(chunk=text)
1576
1582
  try:
1577
1583
  locs = generate_json_with_retry(model, prompt, debug_dir=debug_dir, tag="seq_locations")
1578
1584
  return locs if isinstance(locs, list) else []