debase 0.1.7.tar.gz → 0.1.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {debase-0.1.7 → debase-0.1.9}/PKG-INFO +1 -1
  2. {debase-0.1.7 → debase-0.1.9}/src/debase/_version.py +1 -1
  3. {debase-0.1.7 → debase-0.1.9}/src/debase/enzyme_lineage_extractor.py +63 -6
  4. {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/PKG-INFO +1 -1
  5. {debase-0.1.7 → debase-0.1.9}/.gitignore +0 -0
  6. {debase-0.1.7 → debase-0.1.9}/CONTRIBUTING.md +0 -0
  7. {debase-0.1.7 → debase-0.1.9}/LICENSE +0 -0
  8. {debase-0.1.7 → debase-0.1.9}/MANIFEST.in +0 -0
  9. {debase-0.1.7 → debase-0.1.9}/README.md +0 -0
  10. {debase-0.1.7 → debase-0.1.9}/docs/README.md +0 -0
  11. {debase-0.1.7 → debase-0.1.9}/docs/examples/README.md +0 -0
  12. {debase-0.1.7 → debase-0.1.9}/environment.yml +0 -0
  13. {debase-0.1.7 → debase-0.1.9}/pyproject.toml +0 -0
  14. {debase-0.1.7 → debase-0.1.9}/setup.cfg +0 -0
  15. {debase-0.1.7 → debase-0.1.9}/setup.py +0 -0
  16. {debase-0.1.7 → debase-0.1.9}/src/__init__.py +0 -0
  17. {debase-0.1.7 → debase-0.1.9}/src/debase/PIPELINE_FLOW.md +0 -0
  18. {debase-0.1.7 → debase-0.1.9}/src/debase/__init__.py +0 -0
  19. {debase-0.1.7 → debase-0.1.9}/src/debase/__main__.py +0 -0
  20. {debase-0.1.7 → debase-0.1.9}/src/debase/build_db.py +0 -0
  21. {debase-0.1.7 → debase-0.1.9}/src/debase/cleanup_sequence.py +0 -0
  22. {debase-0.1.7 → debase-0.1.9}/src/debase/lineage_format.py +0 -0
  23. {debase-0.1.7 → debase-0.1.9}/src/debase/reaction_info_extractor.py +0 -0
  24. {debase-0.1.7 → debase-0.1.9}/src/debase/substrate_scope_extractor.py +0 -0
  25. {debase-0.1.7 → debase-0.1.9}/src/debase/wrapper.py +0 -0
  26. {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/SOURCES.txt +0 -0
  27. {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/dependency_links.txt +0 -0
  28. {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/entry_points.txt +0 -0
  29. {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/requires.txt +0 -0
  30. {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/top_level.txt +0 -0
{debase-0.1.7 → debase-0.1.9}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: debase
- Version: 0.1.7
+ Version: 0.1.9
  Summary: Enzyme lineage analysis and sequence extraction package
  Home-page: https://github.com/YuemingLong/DEBase
  Author: DEBase Team
{debase-0.1.7 → debase-0.1.9}/src/debase/_version.py
@@ -1,3 +1,3 @@
  """Version information."""
 
- __version__ = "0.1.7"
+ __version__ = "0.1.9"
{debase-0.1.7 → debase-0.1.9}/src/debase/enzyme_lineage_extractor.py
@@ -1562,11 +1562,24 @@ TEXT (may be truncated):
  ```
  """.strip()
 
- def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
+ def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
      """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
-     prompt = _SEQ_EXTRACTION_PROMPT.format(
+     base_prompt = _SEQ_EXTRACTION_PROMPT.format(
          schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
      )
+
+     # Add lineage context if available
+     if lineage_context:
+         prompt = f"""{base_prompt}
+
+ IMPORTANT CONTEXT - Known variants from lineage extraction:
+ {lineage_context}
+
+ Match sequences to these known variants when possible. Variants may be labeled differently in different sections (e.g., "5295" might also appear as "ʟ-G0", "ʟ-ApPgb-αEsA-G0", or "ʟ-ApPgb-αEsA-G0 (5295)").
+ """
+     else:
+         prompt = base_prompt
+
      data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
      return _parse_sequences(data)
 
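For orientation, a minimal sketch of how the new `lineage_context` keyword might be exercised. The variant names and context string below are illustrative only (the bullet format mirrors what `get_sequences` builds further down), and `model` is whichever Gemini client object the module already passes around:

```python
# Hypothetical usage of the new lineage_context keyword (illustrative, not part of the package).
known_variants = "\n".join([
    "- ApPgb-G0 (Gen 0)",
    "- ApPgb-G1 (Gen 1) [A82L, F263L]",
])
blocks = extract_sequences(paper_text, model, debug_dir="debug/", lineage_context=known_variants)
```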
@@ -1620,7 +1633,7 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
      return blocks
 
  # --- 7.5 Convenience wrapper -------------------------------------------------
- def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
+ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
      # Phase 1: Identify where sequences might be located
      locations = identify_sequence_locations(text, model, debug_dir=debug_dir)
 
@@ -1685,14 +1698,36 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
          if focused_text and len(focused_text) < len(text):
              log.info("Reduced text from %d to %d chars using validated location",
                       len(text), len(focused_text))
-             return extract_sequences(focused_text, model, debug_dir=debug_dir)
+             # Build lineage context if available
+             lineage_context = None
+             if lineage_variants:
+                 variant_info = []
+                 for v in lineage_variants[:20]:  # Limit to first 20
+                     info = f"- {v.variant_id} (Gen {v.generation})"
+                     if v.mutations:
+                         info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
+                     variant_info.append(info)
+                 lineage_context = "\n".join(variant_info)
+
+             return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
          else:
              log.warning("Location validation failed or returned invalid location: %s",
                          validation.get("reason", "Unknown"))
 
      # Fallback to full text
      log.info("Using full text for sequence extraction")
-     return extract_sequences(text, model, debug_dir=debug_dir)
+     # Build lineage context if available
+     lineage_context = None
+     if lineage_variants:
+         variant_info = []
+         for v in lineage_variants[:20]:  # Limit to first 20
+             info = f"- {v.variant_id} (Gen {v.generation})"
+             if v.mutations:
+                 info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
+             variant_info.append(info)
+         lineage_context = "\n".join(variant_info)
+
+     return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
 
  # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
  """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -2005,6 +2040,10 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
 
              log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
 
+         # Log the final state after all matches
+         matched_count = (~df['aa_seq'].isna()).sum()
+         log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
+
      except Exception as e:
          log.warning(f"Failed to match variants using Gemini: {e}")
 
@@ -2025,6 +2064,12 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
      # 5. Sort rows: primary by generation, then by variant_id
      df = df.sort_values(["generation", "variant_id"], kind="mergesort")
 
+     # Debug: Log final merge state
+     seq_count = (~df['aa_seq'].isna()).sum()
+     log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
+     if seq_count > 0:
+         log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
+
      return df
 
  # --- 8.3 Public API -----------------------------------------------------------
@@ -2053,6 +2098,10 @@ def merge_and_score(
      if missing_rate > 0.5:
          log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
 
+     # Debug log before returning
+     seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
+     log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
+
      return df
 
  # -------------------------------------------------------------------- end 8 ---
@@ -2170,7 +2219,7 @@ def run_pipeline(
      )
 
      # 4. Extract sequences (Section 7) ----------------------------------------
-     sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+     sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
 
      # 4a. Try PDB extraction if no sequences found -----------------------------
      # Check if we need PDB sequences (no sequences or only partial sequences)
@@ -2236,6 +2285,14 @@ def run_pipeline(
          output_csv_path = Path(output_csv)
          # Save final data with sequences using same filename (overwrites lineage-only)
          sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
+
+         # Debug: Log what we're about to save
+         seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
+         log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
+         if seq_count > 0 and 'aa_seq' in df_final:
+             with_seq = df_final[~df_final['aa_seq'].isna()]
+             log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
+
          df_final.to_csv(sequence_path, index=False)
          log.info(
              "Overwrote with final results -> %s (%.1f kB)",
{debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: debase
- Version: 0.1.7
+ Version: 0.1.9
  Summary: Enzyme lineage analysis and sequence extraction package
  Home-page: https://github.com/YuemingLong/DEBase
  Author: DEBase Team