debase 0.1.7__tar.gz → 0.1.9__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- {debase-0.1.7 → debase-0.1.9}/PKG-INFO +1 -1
- {debase-0.1.7 → debase-0.1.9}/src/debase/_version.py +1 -1
- {debase-0.1.7 → debase-0.1.9}/src/debase/enzyme_lineage_extractor.py +63 -6
- {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/PKG-INFO +1 -1
- {debase-0.1.7 → debase-0.1.9}/.gitignore +0 -0
- {debase-0.1.7 → debase-0.1.9}/CONTRIBUTING.md +0 -0
- {debase-0.1.7 → debase-0.1.9}/LICENSE +0 -0
- {debase-0.1.7 → debase-0.1.9}/MANIFEST.in +0 -0
- {debase-0.1.7 → debase-0.1.9}/README.md +0 -0
- {debase-0.1.7 → debase-0.1.9}/docs/README.md +0 -0
- {debase-0.1.7 → debase-0.1.9}/docs/examples/README.md +0 -0
- {debase-0.1.7 → debase-0.1.9}/environment.yml +0 -0
- {debase-0.1.7 → debase-0.1.9}/pyproject.toml +0 -0
- {debase-0.1.7 → debase-0.1.9}/setup.cfg +0 -0
- {debase-0.1.7 → debase-0.1.9}/setup.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/__init__.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/PIPELINE_FLOW.md +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/__init__.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/__main__.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/build_db.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/lineage_format.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/reaction_info_extractor.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/substrate_scope_extractor.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase/wrapper.py +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/SOURCES.txt +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/top_level.txt +0 -0
@@ -1562,11 +1562,24 @@ TEXT (may be truncated):
|
|
1562
1562
|
```
|
1563
1563
|
""".strip()
|
1564
1564
|
|
1565
|
-
def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
|
1565
|
+
def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
|
1566
1566
|
"""Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
|
1567
|
-
|
1567
|
+
base_prompt = _SEQ_EXTRACTION_PROMPT.format(
|
1568
1568
|
schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
|
1569
1569
|
)
|
1570
|
+
|
1571
|
+
# Add lineage context if available
|
1572
|
+
if lineage_context:
|
1573
|
+
prompt = f"""{base_prompt}
|
1574
|
+
|
1575
|
+
IMPORTANT CONTEXT - Known variants from lineage extraction:
|
1576
|
+
{lineage_context}
|
1577
|
+
|
1578
|
+
Match sequences to these known variants when possible. Variants may be labeled differently in different sections (e.g., "5295" might also appear as "ʟ-G0", "ʟ-ApPgb-αEsA-G0", or "ʟ-ApPgb-αEsA-G0 (5295)").
|
1579
|
+
"""
|
1580
|
+
else:
|
1581
|
+
prompt = base_prompt
|
1582
|
+
|
1570
1583
|
data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
|
1571
1584
|
return _parse_sequences(data)
|
1572
1585
|
|
@@ -1620,7 +1633,7 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
|
|
1620
1633
|
return blocks
|
1621
1634
|
|
1622
1635
|
# --- 7.5 Convenience wrapper -------------------------------------------------
|
1623
|
-
def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
|
1636
|
+
def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
|
1624
1637
|
# Phase 1: Identify where sequences might be located
|
1625
1638
|
locations = identify_sequence_locations(text, model, debug_dir=debug_dir)
|
1626
1639
|
|
@@ -1685,14 +1698,36 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
|
|
1685
1698
|
if focused_text and len(focused_text) < len(text):
|
1686
1699
|
log.info("Reduced text from %d to %d chars using validated location",
|
1687
1700
|
len(text), len(focused_text))
|
1688
|
-
|
1701
|
+
# Build lineage context if available
|
1702
|
+
lineage_context = None
|
1703
|
+
if lineage_variants:
|
1704
|
+
variant_info = []
|
1705
|
+
for v in lineage_variants[:20]: # Limit to first 20
|
1706
|
+
info = f"- {v.variant_id} (Gen {v.generation})"
|
1707
|
+
if v.mutations:
|
1708
|
+
info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
|
1709
|
+
variant_info.append(info)
|
1710
|
+
lineage_context = "\n".join(variant_info)
|
1711
|
+
|
1712
|
+
return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
|
1689
1713
|
else:
|
1690
1714
|
log.warning("Location validation failed or returned invalid location: %s",
|
1691
1715
|
validation.get("reason", "Unknown"))
|
1692
1716
|
|
1693
1717
|
# Fallback to full text
|
1694
1718
|
log.info("Using full text for sequence extraction")
|
1695
|
-
|
1719
|
+
# Build lineage context if available
|
1720
|
+
lineage_context = None
|
1721
|
+
if lineage_variants:
|
1722
|
+
variant_info = []
|
1723
|
+
for v in lineage_variants[:20]: # Limit to first 20
|
1724
|
+
info = f"- {v.variant_id} (Gen {v.generation})"
|
1725
|
+
if v.mutations:
|
1726
|
+
info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
|
1727
|
+
variant_info.append(info)
|
1728
|
+
lineage_context = "\n".join(variant_info)
|
1729
|
+
|
1730
|
+
return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
|
1696
1731
|
|
1697
1732
|
# === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
|
1698
1733
|
"""When no sequences are found in the paper, attempt to fetch them from PDB."""
|
@@ -2005,6 +2040,10 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
|
|
2005
2040
|
|
2006
2041
|
log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
|
2007
2042
|
|
2043
|
+
# Log the final state after all matches
|
2044
|
+
matched_count = (~df['aa_seq'].isna()).sum()
|
2045
|
+
log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
|
2046
|
+
|
2008
2047
|
except Exception as e:
|
2009
2048
|
log.warning(f"Failed to match variants using Gemini: {e}")
|
2010
2049
|
|
@@ -2025,6 +2064,12 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
|
|
2025
2064
|
# 5. Sort rows: primary by generation, then by variant_id
|
2026
2065
|
df = df.sort_values(["generation", "variant_id"], kind="mergesort")
|
2027
2066
|
|
2067
|
+
# Debug: Log final merge state
|
2068
|
+
seq_count = (~df['aa_seq'].isna()).sum()
|
2069
|
+
log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
|
2070
|
+
if seq_count > 0:
|
2071
|
+
log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
|
2072
|
+
|
2028
2073
|
return df
|
2029
2074
|
|
2030
2075
|
# --- 8.3 Public API -----------------------------------------------------------
|
@@ -2053,6 +2098,10 @@ def merge_and_score(
|
|
2053
2098
|
if missing_rate > 0.5:
|
2054
2099
|
log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
|
2055
2100
|
|
2101
|
+
# Debug log before returning
|
2102
|
+
seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
|
2103
|
+
log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
|
2104
|
+
|
2056
2105
|
return df
|
2057
2106
|
|
2058
2107
|
# -------------------------------------------------------------------- end 8 ---
|
@@ -2170,7 +2219,7 @@ def run_pipeline(
|
|
2170
2219
|
)
|
2171
2220
|
|
2172
2221
|
# 4. Extract sequences (Section 7) ----------------------------------------
|
2173
|
-
sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
|
2222
|
+
sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
|
2174
2223
|
|
2175
2224
|
# 4a. Try PDB extraction if no sequences found -----------------------------
|
2176
2225
|
# Check if we need PDB sequences (no sequences or only partial sequences)
|
@@ -2236,6 +2285,14 @@ def run_pipeline(
|
|
2236
2285
|
output_csv_path = Path(output_csv)
|
2237
2286
|
# Save final data with sequences using same filename (overwrites lineage-only)
|
2238
2287
|
sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
|
2288
|
+
|
2289
|
+
# Debug: Log what we're about to save
|
2290
|
+
seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
|
2291
|
+
log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
|
2292
|
+
if seq_count > 0 and 'aa_seq' in df_final:
|
2293
|
+
with_seq = df_final[~df_final['aa_seq'].isna()]
|
2294
|
+
log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
|
2295
|
+
|
2239
2296
|
df_final.to_csv(sequence_path, index=False)
|
2240
2297
|
log.info(
|
2241
2298
|
"Overwrote with final results -> %s (%.1f kB)",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|