debase 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.8"
3
+ __version__ = "0.1.11"
@@ -1562,11 +1562,24 @@ TEXT (may be truncated):
1562
1562
  ```
1563
1563
  """.strip()
1564
1564
 
1565
- def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
1565
+ def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
1566
1566
  """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
1567
- prompt = _SEQ_EXTRACTION_PROMPT.format(
1567
+ base_prompt = _SEQ_EXTRACTION_PROMPT.format(
1568
1568
  schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
1569
1569
  )
1570
+
1571
+ # Add lineage context if available
1572
+ if lineage_context:
1573
+ prompt = f"""{base_prompt}
1574
+
1575
+ IMPORTANT CONTEXT - Known variants from lineage extraction:
1576
+ {lineage_context}
1577
+
1578
+ Match sequences to these known variants when possible. Variants may be labeled differently in different sections (e.g., "5295" might also appear as "ʟ-G0", "ʟ-ApPgb-αEsA-G0", or "ʟ-ApPgb-αEsA-G0 (5295)").
1579
+ """
1580
+ else:
1581
+ prompt = base_prompt
1582
+
1570
1583
  data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
1571
1584
  return _parse_sequences(data)
1572
1585
 
@@ -1620,7 +1633,7 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
1620
1633
  return blocks
1621
1634
 
1622
1635
  # --- 7.5 Convenience wrapper -------------------------------------------------
1623
- def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
1636
+ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
1624
1637
  # Phase 1: Identify where sequences might be located
1625
1638
  locations = identify_sequence_locations(text, model, debug_dir=debug_dir)
1626
1639
 
@@ -1685,14 +1698,36 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
1685
1698
  if focused_text and len(focused_text) < len(text):
1686
1699
  log.info("Reduced text from %d to %d chars using validated location",
1687
1700
  len(text), len(focused_text))
1688
- return extract_sequences(focused_text, model, debug_dir=debug_dir)
1701
+ # Build lineage context if available
1702
+ lineage_context = None
1703
+ if lineage_variants:
1704
+ variant_info = []
1705
+ for v in lineage_variants[:20]: # Limit to first 20
1706
+ info = f"- {v.variant_id} (Gen {v.generation})"
1707
+ if v.mutations:
1708
+ info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
1709
+ variant_info.append(info)
1710
+ lineage_context = "\n".join(variant_info)
1711
+
1712
+ return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
1689
1713
  else:
1690
1714
  log.warning("Location validation failed or returned invalid location: %s",
1691
1715
  validation.get("reason", "Unknown"))
1692
1716
 
1693
1717
  # Fallback to full text
1694
1718
  log.info("Using full text for sequence extraction")
1695
- return extract_sequences(text, model, debug_dir=debug_dir)
1719
+ # Build lineage context if available
1720
+ lineage_context = None
1721
+ if lineage_variants:
1722
+ variant_info = []
1723
+ for v in lineage_variants[:20]: # Limit to first 20
1724
+ info = f"- {v.variant_id} (Gen {v.generation})"
1725
+ if v.mutations:
1726
+ info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
1727
+ variant_info.append(info)
1728
+ lineage_context = "\n".join(variant_info)
1729
+
1730
+ return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
1696
1731
 
1697
1732
  # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
1698
1733
  """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -1989,21 +2024,55 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
1989
2024
  text = text[4:].strip()
1990
2025
 
1991
2026
  matches = json.loads(text)
2027
+ log.info(f"Gemini returned matches: {matches}")
2028
+
2029
+ # Debug: Log what sequences we actually have
2030
+ log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
1992
2031
 
1993
2032
  # Apply the matches
1994
2033
  for lineage_id, seq_id in matches.items():
1995
- if lineage_id in unmatched_lineage and seq_id in unmatched_seqs['variant_id'].values:
1996
- # Get the sequence data
1997
- seq_data = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id].iloc[0]
2034
+ if lineage_id in unmatched_lineage:
2035
+ # Find the sequence data - be flexible with matching
2036
+ seq_data = None
1998
2037
 
1999
- # Update the dataframe
2000
- mask = df['variant_id'] == lineage_id
2001
- df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
2002
- df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
2003
- df.loc[mask, 'seq_confidence'] = seq_data['seq_confidence']
2004
- df.loc[mask, 'truncated'] = seq_data['truncated']
2038
+ # First try exact match
2039
+ seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
2040
+ if len(seq_matches) > 0:
2041
+ seq_data = seq_matches.iloc[0]
2042
+ else:
2043
+ # Try to find by checking various matching strategies
2044
+ for idx, row in unmatched_seqs.iterrows():
2045
+ variant_id = row['variant_id']
2046
+ # Check if one is contained in the other
2047
+ if seq_id in variant_id or variant_id in seq_id:
2048
+ seq_data = row
2049
+ break
2050
+ # Check if they share the same core identifier (e.g., G0, G1, etc.)
2051
+ seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
2052
+ variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
2053
+ if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
2054
+ seq_data = row
2055
+ break
2005
2056
 
2006
- log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
2057
+ if seq_data is not None:
2058
+ # Update the dataframe
2059
+ mask = df['variant_id'] == lineage_id
2060
+ if mask.any():
2061
+ # Log before update
2062
+ log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
2063
+
2064
+ df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
2065
+ df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
2066
+ df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
2067
+ df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
2068
+
2069
+ # Log after update
2070
+ log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
2071
+ log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
2072
+ else:
2073
+ log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
2074
+ else:
2075
+ log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
2007
2076
 
2008
2077
  # Log the final state after all matches
2009
2078
  matched_count = (~df['aa_seq'].isna()).sum()
@@ -2184,7 +2253,7 @@ def run_pipeline(
2184
2253
  )
2185
2254
 
2186
2255
  # 4. Extract sequences (Section 7) ----------------------------------------
2187
- sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2256
+ sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
2188
2257
 
2189
2258
  # 4a. Try PDB extraction if no sequences found -----------------------------
2190
2259
  # Check if we need PDB sequences (no sequences or only partial sequences)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.8
3
+ Version: 0.1.11
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,17 +1,17 @@
1
1
  debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
2
  debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
3
  debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=pU1LNmYqDA6wql-sQ9H8cktGdOqOUTonX-sx1fgYV2Y,49
4
+ debase/_version.py,sha256=L4sqaU-oAJRWrcboH-vA95jHfUiXr5-fAsrF7lqZSyQ,50
5
5
  debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
6
  debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
- debase/enzyme_lineage_extractor.py,sha256=k62qIp37ONYJBWT8D7ROh7ooYhz871BlCXJmAduq8js,95764
7
+ debase/enzyme_lineage_extractor.py,sha256=at4OYHdXtgMku1FR_6AsHWk64UKInWkGQL9m3H6cKIQ,99809
8
8
  debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
9
9
  debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
10
10
  debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
11
11
  debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
12
- debase-0.1.8.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.8.dist-info/METADATA,sha256=uGoC1Ebi663KiWpLbzkbcpKHYFxKjTJXoU1lV4cbVJ8,10789
14
- debase-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.8.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.8.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.8.dist-info/RECORD,,
12
+ debase-0.1.11.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
+ debase-0.1.11.dist-info/METADATA,sha256=ZSR0Yl36Al_rQm9Ht9jut7om3xQT8yqyobIjEUH_Xfo,10790
14
+ debase-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ debase-0.1.11.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
+ debase-0.1.11.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
+ debase-0.1.11.dist-info/RECORD,,