debase 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +85 -16
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/METADATA +1 -1
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/RECORD +8 -8
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/WHEEL +0 -0
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/entry_points.txt +0 -0
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED
debase/enzyme_lineage_extractor.py
CHANGED
@@ -1562,11 +1562,24 @@ TEXT (may be truncated):
 ```
 """.strip()
 
-def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
+def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
     """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
-    prompt = _SEQ_EXTRACTION_PROMPT.format(
+    base_prompt = _SEQ_EXTRACTION_PROMPT.format(
         schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
     )
+
+    # Add lineage context if available
+    if lineage_context:
+        prompt = f"""{base_prompt}
+
+IMPORTANT CONTEXT - Known variants from lineage extraction:
+{lineage_context}
+
+Match sequences to these known variants when possible. Variants may be labeled differently in different sections (e.g., "5295" might also appear as "ʟ-G0", "ʟ-ApPgb-αEsA-G0", or "ʟ-ApPgb-αEsA-G0 (5295)").
+"""
+    else:
+        prompt = base_prompt
+
     data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
     return _parse_sequences(data)
 
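Note: a minimal sketch of what the new lineage_context branch produces; the base prompt and variant IDs below are placeholders, not the real _SEQ_EXTRACTION_PROMPT or paper data.

    # Placeholder values; _SEQ_EXTRACTION_PROMPT and real variant IDs are not shown here.
    base_prompt = "Extract protein/DNA sequences as JSON.\n\nTEXT:\n<paper text>"
    lineage_context = "- ʟ-ApPgb-αEsA-G0 (Gen 0)\n- ʟ-ApPgb-αEsA-G1 (Gen 1) [V53A, L67M]"

    if lineage_context:
        prompt = f"""{base_prompt}

    IMPORTANT CONTEXT - Known variants from lineage extraction:
    {lineage_context}

    Match sequences to these known variants when possible.
    """
    else:
        prompt = base_prompt

    print(prompt)  # the augmented prompt text that would go to the model call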
@@ -1620,7 +1633,7 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
     return blocks
 
 # --- 7.5 Convenience wrapper -------------------------------------------------
-def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
+def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
     # Phase 1: Identify where sequences might be located
     locations = identify_sequence_locations(text, model, debug_dir=debug_dir)
 
@@ -1685,14 +1698,36 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
         if focused_text and len(focused_text) < len(text):
             log.info("Reduced text from %d to %d chars using validated location",
                      len(text), len(focused_text))
-            return extract_sequences(focused_text, model, debug_dir=debug_dir)
+            # Build lineage context if available
+            lineage_context = None
+            if lineage_variants:
+                variant_info = []
+                for v in lineage_variants[:20]:  # Limit to first 20
+                    info = f"- {v.variant_id} (Gen {v.generation})"
+                    if v.mutations:
+                        info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
+                    variant_info.append(info)
+                lineage_context = "\n".join(variant_info)
+
+            return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
         else:
             log.warning("Location validation failed or returned invalid location: %s",
                         validation.get("reason", "Unknown"))
 
     # Fallback to full text
     log.info("Using full text for sequence extraction")
-    return extract_sequences(text, model, debug_dir=debug_dir)
+    # Build lineage context if available
+    lineage_context = None
+    if lineage_variants:
+        variant_info = []
+        for v in lineage_variants[:20]:  # Limit to first 20
+            info = f"- {v.variant_id} (Gen {v.generation})"
+            if v.mutations:
+                info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
+            variant_info.append(info)
+        lineage_context = "\n".join(variant_info)
+
+    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
 
 # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
 """When no sequences are found in the paper, attempt to fetch them from PDB."""
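The context-building block is duplicated verbatim in the focused-text and full-text branches above. A standalone sketch of the same formatting logic; the Variant dataclass here is a simplified stand-in for the extractor's own lineage model, not its actual definition:

    from dataclasses import dataclass, field

    @dataclass
    class Variant:  # simplified stand-in for the extractor's own Variant model
        variant_id: str
        generation: int
        mutations: list[str] = field(default_factory=list)

    def build_lineage_context(lineage_variants: list[Variant], limit: int = 20) -> str | None:
        """One '- variant_id (Gen n) [mut1, mut2, mut3...]' line per variant, capped at `limit`."""
        if not lineage_variants:
            return None
        variant_info = []
        for v in lineage_variants[:limit]:
            info = f"- {v.variant_id} (Gen {v.generation})"
            if v.mutations:
                info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
            variant_info.append(info)
        return "\n".join(variant_info)

    print(build_lineage_context([Variant("ʟ-ApPgb-αEsA-G0", 0, ["V53A", "L67M", "F89W", "T102S"])]))
    # - ʟ-ApPgb-αEsA-G0 (Gen 0) [V53A, L67M, F89W...]

Factoring the block into a helper like this would avoid maintaining two copies of the same formatting code.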
@@ -1989,21 +2024,55 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
             text = text[4:].strip()
 
         matches = json.loads(text)
+        log.info(f"Gemini returned matches: {matches}")
+
+        # Debug: Log what sequences we actually have
+        log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
 
         # Apply the matches
         for lineage_id, seq_id in matches.items():
-            if lineage_id in unmatched_lineage
-                #
-                seq_data =
+            if lineage_id in unmatched_lineage:
+                # Find the sequence data - be flexible with matching
+                seq_data = None
 
-                #
-
-
-
-
+                # First try exact match
+                seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
+                if len(seq_matches) > 0:
+                    seq_data = seq_matches.iloc[0]
+                else:
+                    # Try to find by checking various matching strategies
+                    for idx, row in unmatched_seqs.iterrows():
+                        variant_id = row['variant_id']
+                        # Check if one is contained in the other
+                        if seq_id in variant_id or variant_id in seq_id:
+                            seq_data = row
+                            break
+                        # Check if they share the same core identifier (e.g., G0, G1, etc.)
+                        seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
+                        variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
+                        if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
+                            seq_data = row
+                            break
 
-
+                if seq_data is not None:
+                    # Update the dataframe
+                    mask = df['variant_id'] == lineage_id
+                    if mask.any():
+                        # Log before update
+                        log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
+
+                        df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
+                        df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
+                        df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
+                        df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+
+                        # Log after update
+                        log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
+                        log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
+                    else:
+                        log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
+                else:
+                    log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
 
     # Log the final state after all matches
     matched_count = (~df['aa_seq'].isna()).sum()
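The fallback matching added above reduces to two heuristics: substring containment between the two IDs, and a shared leading G<number> core identifier extracted with the regex G\d+(?:-\d+)?. A self-contained sketch of the same logic on bare ID strings; the function name and example IDs are illustrative only:

    import re

    def ids_probably_match(seq_id: str, variant_id: str) -> bool:
        # Strategy 1: one identifier contained in the other
        if seq_id in variant_id or variant_id in seq_id:
            return True
        # Strategy 2: both share the same leading G<number> core identifier
        seq_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
        var_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
        return bool(seq_parts and var_parts and seq_parts[0] == var_parts[0])

    print(ids_probably_match("ʟ-G0", "ʟ-ApPgb-αEsA-G0"))  # True: both yield core id "G0"
    print(ids_probably_match("5295", "ʟ-ApPgb-αEsA-G0"))   # False: no substring or G<number> overlap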
@@ -2184,7 +2253,7 @@ def run_pipeline(
     )
 
     # 4. Extract sequences (Section 7) ----------------------------------------
-    sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+    sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
 
     # 4a. Try PDB extraction if no sequences found -----------------------------
     # Check if we need PDB sequences (no sequences or only partial sequences)
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/RECORD
RENAMED
@@ -1,17 +1,17 @@
 debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
 debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
 debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=
+debase/_version.py,sha256=L4sqaU-oAJRWrcboH-vA95jHfUiXr5-fAsrF7lqZSyQ,50
 debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
 debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=
+debase/enzyme_lineage_extractor.py,sha256=at4OYHdXtgMku1FR_6AsHWk64UKInWkGQL9m3H6cKIQ,99809
 debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
 debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
 debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
 debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
+debase-0.1.11.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.1.11.dist-info/METADATA,sha256=ZSR0Yl36Al_rQm9Ht9jut7om3xQT8yqyobIjEUH_Xfo,10790
+debase-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.1.11.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.1.11.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.1.11.dist-info/RECORD,,
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/WHEEL
File without changes
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/entry_points.txt
File without changes
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/licenses/LICENSE
File without changes
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/top_level.txt
File without changes