PyPI - debase - Versions diffs - 0.1.18__tar.gz → 0.4.0__tar.gz - Mend

debase 0.1.18__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{debase-0.1.18 → debase-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.18
+Version: 0.4.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.1.18 → debase-0.4.0}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.1.18"
+__version__ = "0.4.0"

{debase-0.1.18 → debase-0.4.0}/src/debase/cleanup_sequence.py RENAMED Viewed

@@ -827,20 +827,52 @@ class SequenceProcessor:
         log.info(f"Saved results to {self.output_csv}")
     def run(self) -> None:
-        """Run the complete processing pipeline."""
+        """Run the complete processing pipeline with campaign-based processing."""
         log.info("Starting sequence generation pipeline")
         # Load data
         self.load_data()
-        # Flag complex mutations
-        self.flag_complex_mutations()
+        # Process each campaign separately
+        campaigns = self.df['campaign_id'].unique()
+        log.info(f"Processing {len(campaigns)} campaigns: {list(campaigns)}")
-        # Process in order
-        self.process_simple_mutations()
-        self.process_complex_mutations()
-        self.process_remaining()
-        self.backward_pass()
+        for campaign_id in campaigns:
+            if pd.isna(campaign_id):
+                campaign_id = "unknown"
+            log.info(f"Processing campaign: {campaign_id}")
+            # Filter data for this campaign
+            campaign_mask = self.df['campaign_id'] == campaign_id
+            if pd.isna(campaign_id):
+                campaign_mask = self.df['campaign_id'].isna()
+            # Store original dataframe
+            original_df = self.df
+            # Process only this campaign's data
+            self.df = self.df[campaign_mask].copy()
+            # Rebuild relationships for this campaign
+            self.generator = SequenceGenerator(self.df)
+            # Flag complex mutations
+            self.flag_complex_mutations()
+            # Process in order
+            self.process_simple_mutations()
+            self.process_complex_mutations()
+            self.process_remaining()
+            self.backward_pass()
+            # Update the original dataframe with results
+            original_df.loc[campaign_mask, :] = self.df
+            # Restore original dataframe
+            self.df = original_df
+            log.info(f"Completed campaign: {campaign_id}")
         # Save results
         self.save_results()

{debase-0.1.18 → debase-0.4.0}/src/debase/enzyme_lineage_extractor.py RENAMED Viewed

@@ -377,13 +377,28 @@ def get_model():
 # === 5.3  Unified call helper ----------------------------------------------
-def _extract_text(resp) -> str:
+def _extract_text_and_track_tokens(resp) -> str:
     """
     Pull the *first* textual part out of a GenerativeAI response, handling both
-    the old prerelease SDK and the >=1.0 SDK.
+    the old prerelease SDK and the >=1.0 SDK. Also tracks token usage.
     Returns an empty string if no textual content is found.
     """
+    # Track token usage if available
+    try:
+        if hasattr(resp, 'usage_metadata'):
+            input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
+            output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
+            if input_tokens or output_tokens:
+                # Import wrapper token tracking
+                try:
+                    from .wrapper import add_token_usage
+                    add_token_usage('enzyme_lineage_extractor', input_tokens, output_tokens)
+                except ImportError:
+                    pass  # wrapper not available
+    except Exception:
+        pass  # token tracking is best-effort
     # 1) Legacy SDK (<= 0.4) - still has nice `.text`
     if getattr(resp, "text", None):
         return resp.text
@@ -409,6 +424,10 @@ def _extract_text(resp) -> str:
     # 3) As a last resort fall back to str()
     return str(resp)
+def _extract_text(resp) -> str:
+    """Backward compatibility wrapper for _extract_text_and_track_tokens."""
+    return _extract_text_and_track_tokens(resp)
 def generate_json_with_retry(
     model,
@@ -572,7 +591,7 @@ Look for:
 Return a JSON array of campaigns:
 [
   {{
-    "campaign_id": "unique_id",
+    "campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
     "campaign_name": "descriptive name",
     "description": "what this campaign evolved for",
     "model_substrate": "substrate name/id",
@@ -585,6 +604,9 @@ Return a JSON array of campaigns:
   }}
 ]
+IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
+Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
 TEXT:
 {text}
 """.strip()
@@ -1559,6 +1581,82 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
         return []
 # --- 7.2  Page-based extraction helper ---------------------------------------
+def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
+    """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
+    # Extract mutations from variants
+    mutations = []
+    for variant in variants:
+        if variant.mutations:
+            mutations.extend(variant.mutations)
+    if not mutations:
+        return None
+    # Take a sample of mutations for validation
+    sample_mutations = mutations[:10]  # Check first 10 mutations
+    # First do a quick local check for obvious inconsistencies
+    local_issues = []
+    for mutation in sample_mutations:
+        if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
+            pos = mutation.position - 1  # Convert to 0-indexed
+            if 0 <= pos < len(sequence):
+                actual_aa = sequence[pos]
+                expected_aa = mutation.original
+                if actual_aa != expected_aa:
+                    local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
+    if not local_issues:
+        return None  # No obvious issues found
+    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation")
+    prompt = f"""
+You are validating a protein sequence that was extracted from a scientific paper.
+The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
+Original sequence (length {len(sequence)}):
+{sequence}
+Known mutations that should be applicable to this sequence:
+{', '.join(str(m) for m in sample_mutations)}
+Potential issues detected:
+{chr(10).join(local_issues)}
+Please check if the sequence is consistent with these mutations:
+1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
+2. If you find inconsistencies, suggest the most likely correction
+3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
+4. Pay special attention to consecutive identical amino acids that might be OCR errors
+Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
+If you cannot determine the correct sequence, return "UNCERTAIN".
+"""
+    try:
+        response = model.generate_content(prompt)
+        result = _extract_text(response).strip()
+        if result == "VALID":
+            return None  # No changes needed
+        elif result == "UNCERTAIN":
+            log.warning("Gemini could not validate sequence against mutations")
+            return None
+        elif result.startswith("M") and len(result) > 50:
+            # Gemini returned a corrected sequence
+            log.info(f"Gemini suggested sequence correction (length {len(result)})")
+            return result
+        else:
+            log.warning(f"Unexpected validation response: {result[:100]}...")
+            return None
+    except Exception as e:
+        log.warning(f"Failed to validate sequence: {e}")
+        return None
 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
     """Extract text from a specific page number in the PDFs.
@@ -2040,7 +2138,13 @@ If you cannot determine certain fields, set them to null.
             # Clean the sequence
             seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
             # Validate it looks like a protein sequence
-            if seq and all(c in "ACDEFGHIKLMNPQRSTVWY" for c in seq) and len(seq) > 50:
+            if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
+                # Sanity check the sequence against known mutations
+                validated_seq = _validate_sequence_against_mutations(seq, variants, lineage_text, model)
+                if validated_seq:
+                    seq = validated_seq
+                    log.info(f"Sequence validated and potentially corrected by Gemini")
                 # Map to the first variant or wild-type
                 wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
                 if wt_variant:
@@ -2365,7 +2469,8 @@ Papers often use different naming conventions for the same variant:
 - Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
 - Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
-Match variants by analyzing generation numbers, prefixes, and patterns.
+Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
+use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
 Lineage variant IDs (need sequences):
 {json.dumps(unmatched_lineage_ids)}
@@ -2378,8 +2483,24 @@ Format: {{"lineage_id": "sequence_id", ...}}
 """
             try:
+                log.info("Sending variant matching request to Gemini...")
+                log.debug(f"Prompt length: {len(prompt)} characters")
                 response = model.generate_content(prompt)
+                log.debug(f"Gemini response object: {response}")
+                log.debug(f"Response candidates: {getattr(response, 'candidates', 'N/A')}")
                 text = _extract_text(response).strip()
+                log.info(f"Extracted text length: {len(text)}")
+                if not text:
+                    log.error("Gemini returned empty text - API call may have failed")
+                    log.error(f"Response object: {response}")
+                    if hasattr(response, 'prompt_feedback'):
+                        log.error(f"Prompt feedback: {response.prompt_feedback}")
+                    raise ValueError("Empty response from Gemini")
+                log.debug(f"Raw response (first 500 chars): {text[:500]}")
                 # Parse JSON response
                 if text.startswith("```"):
@@ -2387,8 +2508,31 @@ Format: {{"lineage_id": "sequence_id", ...}}
                     if text.startswith("json"):
                         text = text[4:].strip()
-                matches = json.loads(text)
-                log.info(f"Gemini returned {len(matches)} matches")
+                log.debug(f"Cleaned text for JSON parsing (first 500 chars): {text[:500]}")
+                if not text.strip():
+                    log.error("Text is empty after cleaning")
+                    matches = {}
+                else:
+                    try:
+                        matches = json.loads(text)
+                        log.info(f"Successfully parsed {len(matches)} matches from Gemini")
+                    except json.JSONDecodeError as e:
+                        log.error(f"JSON parsing failed: {e}")
+                        log.error(f"Full cleaned text: {text}")
+                        # Try to extract JSON from within the response
+                        import re
+                        json_match = re.search(r'\{.*\}', text, re.DOTALL)
+                        if json_match:
+                            try:
+                                matches = json.loads(json_match.group(0))
+                                log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
+                            except json.JSONDecodeError:
+                                log.error("Failed to extract JSON from response")
+                                matches = {}
+                        else:
+                            log.error("No JSON object found in response")
+                            matches = {}
                 # Create a mapping of sequence IDs to their data for efficient lookup
                 seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
@@ -2456,8 +2600,8 @@ Format: {{"lineage_id": "sequence_id", ...}}
     # 5. Attach DOI column
     df["doi"] = doi
-    # 6. Sort by generation, then variant_id
-    df = df.sort_values(["generation", "variant_id"], kind="mergesort")
+    # 6. Sort by campaign_id, then generation
+    df = df.sort_values(["campaign_id", "generation"], kind="mergesort")
     # 7. Log final state
     aa_count = (~df['aa_seq'].isna()).sum()

debase 0.1.18__tar.gz → 0.4.0__tar.gz

debase 0.1.18__tar.gz → 0.4.0__tar.gz