PyPI - debase - Versions diffs - 0.1.17__tar.gz → 0.1.19__tar.gz - Mend

debase 0.1.17.tar.gz → 0.1.19.tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (30) hide show

{debase-0.1.17 → debase-0.1.19}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.17
+Version: 0.1.19
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.1.17 → debase-0.1.19}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.1.17"
+__version__ = "0.1.19"

{debase-0.1.17 → debase-0.1.19}/src/debase/reaction_info_extractor.py RENAMED Viewed

@@ -927,12 +927,77 @@ Ignore locations that contain data for other campaigns.
     # ------------------------------------------------------------------
     # 6.3 Extract metrics in batch
     # ------------------------------------------------------------------
+    def _validate_location_exists(self, ref: str) -> bool:
+        """Verify that the referenced location actually exists in the document."""
+        # Search for the actual reference in the document
+        for page_num in range(len(self.doc)):
+            page = self.doc[page_num]
+            text = page.get_text()
+            # Look for table references like "Table 1", "Table S1", etc.
+            if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
+                return True
+        return False
+    def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
+        """Validate that the context contains meaningful content for extraction."""
+        if not snippet or len(snippet.strip()) < 50:
+            LOGGER.warning("Insufficient context for extraction from %s - skipping", ref)
+            return False
+        # Check if context actually mentions the enzymes we're looking for
+        enzyme_mentions = sum(1 for enzyme in enzyme_list if enzyme.lower() in snippet.lower())
+        if enzyme_mentions == 0:
+            LOGGER.warning("No enzyme mentions found in context for %s - skipping", ref)
+            return False
+        # Check for performance-related keywords
+        performance_keywords = ['yield', 'selectivity', 'conversion', 'ee', 'er', 'ttn', 'ton', 'tof', '%', 'percent']
+        has_performance_data = any(keyword in snippet.lower() for keyword in performance_keywords)
+        if not has_performance_data:
+            LOGGER.warning("No performance metrics found in context for %s - skipping", ref)
+            return False
+        LOGGER.info("Context validated for %s: %d chars, %d enzyme mentions", ref, len(snippet), enzyme_mentions)
+        return True
+    def _validate_response(self, data: Dict, enzyme_list: List[str], ref: str) -> bool:
+        """Validate that the response contains meaningful data for the requested enzymes."""
+        if not data or not isinstance(data, dict):
+            LOGGER.warning("Invalid response format from %s - skipping", ref)
+            return False
+        # Check if we got data for at least one enzyme
+        enzymes_with_data = 0
+        for enzyme in enzyme_list:
+            enzyme_data = data.get(enzyme, {})
+            if isinstance(enzyme_data, dict) and enzyme_data:
+                # Check if there's at least one non-null metric
+                metrics = ['yield', 'ttn', 'ton', 'selectivity', 'conversion', 'tof', 'activity']
+                has_metric = any(enzyme_data.get(metric) is not None for metric in metrics)
+                if has_metric:
+                    enzymes_with_data += 1
+        if enzymes_with_data == 0:
+            LOGGER.warning("No valid metrics found in response from %s - skipping", ref)
+            return False
+        LOGGER.info("Response validated for %s: %d enzymes with data", ref, enzymes_with_data)
+        return True
     def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
         """Extract performance metrics for multiple enzymes from the identified location in batch."""
         ref_lc = ref.lower()
         image_b64: Optional[str] = None
+        # First, validate that the location actually exists in the document
+        if not self._validate_location_exists(ref):
+            LOGGER.warning("Location %s not found in document - skipping", ref)
+            return []
         # Add campaign context if available
         campaign_context = ""
         if self.campaign_filter:
@@ -953,6 +1018,10 @@ Ignore locations that contain data for other campaigns.
         else:
             snippet = self._page_with_reference(ref) or ""
+        # Validate context before sending to Gemini
+        if not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
+            return []
         enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])
         if image_b64:
@@ -961,8 +1030,9 @@ Ignore locations that contain data for other campaigns.
             LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch_vision"
         else:
-            # Add enzyme names to prompt for batch extraction
-            prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\n=== CONTEXT ===\n" + snippet[:4000]
+            # Add enzyme names to prompt for batch extraction with explicit format requirement
+            format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
+            prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\nReturn a JSON object with enzyme names as keys, each containing the metrics.\nExample format: {format_example}\n\n=== CONTEXT ===\n" + snippet[:4000]
             LOGGER.info("Gemini: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch"
@@ -976,6 +1046,10 @@ Ignore locations that contain data for other campaigns.
                 image_b64=image_b64
             )
+            # Validate response has meaningful data
+            if not self._validate_response(data, enzyme_list, ref):
+                return []
             # Handle the response format - expecting a dict with enzyme names as keys
             results = []
             if isinstance(data, dict):

{debase-0.1.17 → debase-0.1.19}/src/debase.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.17
+Version: 0.1.19
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team