PyPI - debase - Versions diffs - 0.1.7__tar.gz → 0.1.9__tar.gz - Mend

debase 0.1.7.tar.gz → 0.1.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{debase-0.1.7 → debase-0.1.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.7
+Version: 0.1.9
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.1.7 → debase-0.1.9}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.1.7"
+__version__ = "0.1.9"

{debase-0.1.7 → debase-0.1.9}/src/debase/enzyme_lineage_extractor.py RENAMED Viewed

@@ -1562,11 +1562,24 @@ TEXT (may be truncated):
 ```
 """.strip()
-def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
+def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
     """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
-    prompt = _SEQ_EXTRACTION_PROMPT.format(
+    base_prompt = _SEQ_EXTRACTION_PROMPT.format(
         schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
     )
+    # Add lineage context if available
+    if lineage_context:
+        prompt = f"""{base_prompt}
+IMPORTANT CONTEXT - Known variants from lineage extraction:
+{lineage_context}
+Match sequences to these known variants when possible. Variants may be labeled differently in different sections (e.g., "5295" might also appear as "ʟ-G0", "ʟ-ApPgb-αEsA-G0", or "ʟ-ApPgb-αEsA-G0 (5295)").
+"""
+    else:
+        prompt = base_prompt
     data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
     return _parse_sequences(data)
@@ -1620,7 +1633,7 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
     return blocks
 # --- 7.5  Convenience wrapper -------------------------------------------------
-def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
+def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
     # Phase 1: Identify where sequences might be located
     locations = identify_sequence_locations(text, model, debug_dir=debug_dir)
@@ -1685,14 +1698,36 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
             if focused_text and len(focused_text) < len(text):
                 log.info("Reduced text from %d to %d chars using validated location",
                          len(text), len(focused_text))
-                return extract_sequences(focused_text, model, debug_dir=debug_dir)
+                # Build lineage context if available
+                lineage_context = None
+                if lineage_variants:
+                    variant_info = []
+                    for v in lineage_variants[:20]:  # Limit to first 20
+                        info = f"- {v.variant_id} (Gen {v.generation})"
+                        if v.mutations:
+                            info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
+                        variant_info.append(info)
+                    lineage_context = "\n".join(variant_info)
+                return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
         else:
             log.warning("Location validation failed or returned invalid location: %s",
                        validation.get("reason", "Unknown"))
     # Fallback to full text
     log.info("Using full text for sequence extraction")
-    return extract_sequences(text, model, debug_dir=debug_dir)
+    # Build lineage context if available
+    lineage_context = None
+    if lineage_variants:
+        variant_info = []
+        for v in lineage_variants[:20]:  # Limit to first 20
+            info = f"- {v.variant_id} (Gen {v.generation})"
+            if v.mutations:
+                info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
+            variant_info.append(info)
+        lineage_context = "\n".join(variant_info)
+    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
 # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
 """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -2005,6 +2040,10 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
                         log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
+                # Log the final state after all matches
+                matched_count = (~df['aa_seq'].isna()).sum()
+                log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
             except Exception as e:
                 log.warning(f"Failed to match variants using Gemini: {e}")
@@ -2025,6 +2064,12 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
     # 5. Sort rows: primary by generation, then by variant_id
     df = df.sort_values(["generation", "variant_id"], kind="mergesort")
+    # Debug: Log final merge state
+    seq_count = (~df['aa_seq'].isna()).sum()
+    log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
+    if seq_count > 0:
+        log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
     return df
 # --- 8.3  Public API -----------------------------------------------------------
@@ -2053,6 +2098,10 @@ def merge_and_score(
     if missing_rate > 0.5:
         log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
+    # Debug log before returning
+    seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
+    log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
     return df
 # -------------------------------------------------------------------- end 8 ---
@@ -2170,7 +2219,7 @@ def run_pipeline(
         )
     # 4. Extract sequences (Section 7) ----------------------------------------
-    sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+    sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
     # 4a. Try PDB extraction if no sequences found -----------------------------
     # Check if we need PDB sequences (no sequences or only partial sequences)
@@ -2236,6 +2285,14 @@ def run_pipeline(
         output_csv_path = Path(output_csv)
         # Save final data with sequences using same filename (overwrites lineage-only)
         sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
+        # Debug: Log what we're about to save
+        seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
+        log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
+        if seq_count > 0 and 'aa_seq' in df_final:
+            with_seq = df_final[~df_final['aa_seq'].isna()]
+            log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
         df_final.to_csv(sequence_path, index=False)
         log.info(
             "Overwrote with final results -> %s (%.1f kB)",

{debase-0.1.7 → debase-0.1.9}/src/debase.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.7
+Version: 0.1.9
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team