debase 0.1.8__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {debase-0.1.8 → debase-0.1.9}/PKG-INFO +1 -1
  2. {debase-0.1.8 → debase-0.1.9}/src/debase/_version.py +1 -1
  3. {debase-0.1.8 → debase-0.1.9}/src/debase/enzyme_lineage_extractor.py +41 -6
  4. {debase-0.1.8 → debase-0.1.9}/src/debase.egg-info/PKG-INFO +1 -1
  5. {debase-0.1.8 → debase-0.1.9}/.gitignore +0 -0
  6. {debase-0.1.8 → debase-0.1.9}/CONTRIBUTING.md +0 -0
  7. {debase-0.1.8 → debase-0.1.9}/LICENSE +0 -0
  8. {debase-0.1.8 → debase-0.1.9}/MANIFEST.in +0 -0
  9. {debase-0.1.8 → debase-0.1.9}/README.md +0 -0
  10. {debase-0.1.8 → debase-0.1.9}/docs/README.md +0 -0
  11. {debase-0.1.8 → debase-0.1.9}/docs/examples/README.md +0 -0
  12. {debase-0.1.8 → debase-0.1.9}/environment.yml +0 -0
  13. {debase-0.1.8 → debase-0.1.9}/pyproject.toml +0 -0
  14. {debase-0.1.8 → debase-0.1.9}/setup.cfg +0 -0
  15. {debase-0.1.8 → debase-0.1.9}/setup.py +0 -0
  16. {debase-0.1.8 → debase-0.1.9}/src/__init__.py +0 -0
  17. {debase-0.1.8 → debase-0.1.9}/src/debase/PIPELINE_FLOW.md +0 -0
  18. {debase-0.1.8 → debase-0.1.9}/src/debase/__init__.py +0 -0
  19. {debase-0.1.8 → debase-0.1.9}/src/debase/__main__.py +0 -0
  20. {debase-0.1.8 → debase-0.1.9}/src/debase/build_db.py +0 -0
  21. {debase-0.1.8 → debase-0.1.9}/src/debase/cleanup_sequence.py +0 -0
  22. {debase-0.1.8 → debase-0.1.9}/src/debase/lineage_format.py +0 -0
  23. {debase-0.1.8 → debase-0.1.9}/src/debase/reaction_info_extractor.py +0 -0
  24. {debase-0.1.8 → debase-0.1.9}/src/debase/substrate_scope_extractor.py +0 -0
  25. {debase-0.1.8 → debase-0.1.9}/src/debase/wrapper.py +0 -0
  26. {debase-0.1.8 → debase-0.1.9}/src/debase.egg-info/SOURCES.txt +0 -0
  27. {debase-0.1.8 → debase-0.1.9}/src/debase.egg-info/dependency_links.txt +0 -0
  28. {debase-0.1.8 → debase-0.1.9}/src/debase.egg-info/entry_points.txt +0 -0
  29. {debase-0.1.8 → debase-0.1.9}/src/debase.egg-info/requires.txt +0 -0
  30. {debase-0.1.8 → debase-0.1.9}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.8"
3
+ __version__ = "0.1.9"
@@ -1562,11 +1562,24 @@ TEXT (may be truncated):
1562
1562
  ```
1563
1563
  """.strip()
1564
1564
 
1565
- def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
1565
+ def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
1566
1566
  """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
1567
- prompt = _SEQ_EXTRACTION_PROMPT.format(
1567
+ base_prompt = _SEQ_EXTRACTION_PROMPT.format(
1568
1568
  schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
1569
1569
  )
1570
+
1571
+ # Add lineage context if available
1572
+ if lineage_context:
1573
+ prompt = f"""{base_prompt}
1574
+
1575
+ IMPORTANT CONTEXT - Known variants from lineage extraction:
1576
+ {lineage_context}
1577
+
1578
+ Match sequences to these known variants when possible. Variants may be labeled differently in different sections (e.g., "5295" might also appear as "ʟ-G0", "ʟ-ApPgb-αEsA-G0", or "ʟ-ApPgb-αEsA-G0 (5295)").
1579
+ """
1580
+ else:
1581
+ prompt = base_prompt
1582
+
1570
1583
  data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
1571
1584
  return _parse_sequences(data)
1572
1585
 
@@ -1620,7 +1633,7 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
1620
1633
  return blocks
1621
1634
 
1622
1635
  # --- 7.5 Convenience wrapper -------------------------------------------------
1623
- def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
1636
+ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
1624
1637
  # Phase 1: Identify where sequences might be located
1625
1638
  locations = identify_sequence_locations(text, model, debug_dir=debug_dir)
1626
1639
 
@@ -1685,14 +1698,36 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
1685
1698
  if focused_text and len(focused_text) < len(text):
1686
1699
  log.info("Reduced text from %d to %d chars using validated location",
1687
1700
  len(text), len(focused_text))
1688
- return extract_sequences(focused_text, model, debug_dir=debug_dir)
1701
+ # Build lineage context if available
1702
+ lineage_context = None
1703
+ if lineage_variants:
1704
+ variant_info = []
1705
+ for v in lineage_variants[:20]: # Limit to first 20
1706
+ info = f"- {v.variant_id} (Gen {v.generation})"
1707
+ if v.mutations:
1708
+ info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
1709
+ variant_info.append(info)
1710
+ lineage_context = "\n".join(variant_info)
1711
+
1712
+ return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
1689
1713
  else:
1690
1714
  log.warning("Location validation failed or returned invalid location: %s",
1691
1715
  validation.get("reason", "Unknown"))
1692
1716
 
1693
1717
  # Fallback to full text
1694
1718
  log.info("Using full text for sequence extraction")
1695
- return extract_sequences(text, model, debug_dir=debug_dir)
1719
+ # Build lineage context if available
1720
+ lineage_context = None
1721
+ if lineage_variants:
1722
+ variant_info = []
1723
+ for v in lineage_variants[:20]: # Limit to first 20
1724
+ info = f"- {v.variant_id} (Gen {v.generation})"
1725
+ if v.mutations:
1726
+ info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
1727
+ variant_info.append(info)
1728
+ lineage_context = "\n".join(variant_info)
1729
+
1730
+ return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
1696
1731
 
1697
1732
  # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
1698
1733
  """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -2184,7 +2219,7 @@ def run_pipeline(
2184
2219
  )
2185
2220
 
2186
2221
  # 4. Extract sequences (Section 7) ----------------------------------------
2187
- sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2222
+ sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
2188
2223
 
2189
2224
  # 4a. Try PDB extraction if no sequences found -----------------------------
2190
2225
  # Check if we need PDB sequences (no sequences or only partial sequences)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes