PyPI - debase - Versions diffs - 0.4.1__tar.gz → 0.4.2__tar.gz - Mend

debase 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

debase-0.4.2/.gitignore ADDED Viewed

@@ -0,0 +1,177 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+Pipfile.lock
+# poetry
+poetry.lock
+# pdm
+.pdm.toml
+# PEP 582
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+.idea/
+# VS Code
+.vscode/
+# macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+# Windows
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+*.stackdump
+[Dd]esktop.ini
+$RECYCLE.BIN/
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+*.lnk
+# Linux
+*~
+# Temporary files
+*.tmp
+*.temp
+*.log
+.temp_*/
+.cache/
+# DEBase specific
+enzyme_pipeline*.log
+temp_merged_input.csv
+*.egg-info/
+# Project data and examples
+data/
+examples/*
+# Keep test.csv as example output
+!examples/test.csv
+# Cache files
+*.pkl
+*_cache.pkl
+# Large database files
+*.db
+# PDFs and Excel files
+*.pdf
+*.xlsx
+# Backup files
+*_backup.py
+lineage_format_backup.py
+# Temporary directories
+.temp_*
+enzyme_analysis_*

{debase-0.4.1/src/debase.egg-info → debase-0.4.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.1
+Version: 0.4.2
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

debase-0.4.2/environment.yml ADDED Viewed

@@ -0,0 +1,21 @@
+name: debase
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.9
+  - pandas>=1.0.0
+  - numpy>=1.19.0
+  - matplotlib>=3.3.0
+  - seaborn>=0.11.0
+  - jupyter>=1.0.0
+  - jupyterlab>=3.0.0
+  - openpyxl>=3.0.0
+  - biopython>=1.78
+  - requests>=2.25.0
+  - tqdm>=4.60.0
+  - rdkit>=2020.03.1
+  - pip
+  - pip:
+    - PyMuPDF>=1.18.0
+    - google-generativeai>=0.3.0

{debase-0.4.1 → debase-0.4.2}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.4.1"
+__version__ = "0.4.2"

{debase-0.4.1 → debase-0.4.2}/src/debase/enzyme_lineage_extractor.py RENAMED Viewed

@@ -645,11 +645,13 @@ find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
 came from which parent and what mutations were introduced).
 Respond with a JSON array of objects, each containing:
-- "location": the identifier (e.g. "Table S1", "Figure 2B", "p. 6")
+- "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
 - "type": one of "table", "figure", "text", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
+IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
 Order by confidence score (highest first). Tables showing complete variant lineages or
 mutation lists should be ranked higher than figure showing complete variant lineages.
 Text sections are used when no suitable tables/figures exist.
@@ -747,7 +749,7 @@ def identify_campaigns(
     debug_dir: str | Path | None = None,
 ) -> List[Campaign]:
     """Identify distinct directed evolution campaigns in the manuscript."""
-    prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text[:30_000])
+    prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text)
     campaigns_data: List[dict] = []
     try:
         campaigns_data = generate_json_with_retry(
@@ -825,7 +827,7 @@ def identify_evolution_locations(
     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
-    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
+    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
@@ -1306,7 +1308,7 @@ def get_lineage(
     5. Return both variants and campaigns.
     """
     # First, identify campaigns in the manuscript
-    campaigns = identify_campaigns(full_text[:50_000], model, debug_dir=debug_dir)
+    campaigns = identify_campaigns(full_text, model, debug_dir=debug_dir)
     if campaigns:
         log.info(f"Identified {len(campaigns)} distinct campaigns")
@@ -1364,7 +1366,7 @@ def get_lineage(
             context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
             locations_with_context.append({
                 'location': loc,
-                'context': context_text[:1000]  # First 1000 chars of extracted context
+                'context': context_text  # Full extracted context
             })
         # For each campaign, ask Gemini to select the best location
@@ -1554,13 +1556,17 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
 - "section": the section heading or description
-- "page": the page number shown in the table of contents for this section, to your best judgement.
+- "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
-- Return the EXACT notation as shown.
+- For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
 - Prioritize sections that mention "protein" or "amino acid" sequences
+CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
+- Correct: "53", "S12", "147"
+- Wrong: "p. 53", "P. 53", "page 53", "pg 53"
 Return [] if no sequence sections are found.
 Absolutely don't include nucleotides or primer sequences; it is better to return nothing than an incomplete sequence. Use your best judgement.
@@ -1572,7 +1578,7 @@ TEXT (truncated):
 def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None = None) -> list[dict]:
     """Ask Gemini for promising places to look for sequences."""
-    prompt = _SEQ_LOC_PROMPT.format(chunk=text[:15_000])
+    prompt = _SEQ_LOC_PROMPT.format(chunk=text)
     try:
         locs = generate_json_with_retry(model, prompt, debug_dir=debug_dir, tag="seq_locations")
         return locs if isinstance(locs, list) else []

debase 0.4.1__tar.gz → 0.4.2__tar.gz

debase 0.4.1__tar.gz → 0.4.2__tar.gz