PyPI - debase - Versions diffs - 0.1.2__tar.gz → 0.1.3__tar.gz - Mend

debase 0.1.2.tar.gz → 0.1.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{debase-0.1.2 → debase-0.1.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.2
+Version: 0.1.3
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.1.2 → debase-0.1.3}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.1.2"
+__version__ = "0.1.3"

{debase-0.1.2 → debase-0.1.3}/src/debase/enzyme_lineage_extractor.py RENAMED Viewed

@@ -1297,6 +1297,8 @@ _SEQUENCE_SCHEMA_HINT = """
 _SEQ_LOC_PROMPT = """
 Find where FULL-LENGTH protein or DNA sequences are located in this document.
+PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
 - "section": the section heading or description
@@ -1305,6 +1307,7 @@ Return a JSON array where each element has:
 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
 - Return the EXACT notation as shown.
+- Prioritize sections that mention "protein" or "amino acid" sequences
 Return [] if no sequence sections are found.
 Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
@@ -1465,10 +1468,16 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
 # --- 7.3  Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
 Extract EVERY distinct enzyme-variant sequence you can find in the text.
+IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
+- If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
+- Only extract dna_seq if NO amino acid sequence is available for that variant
+- This reduces redundancy since protein sequences are usually more relevant
 For each variant return:
   * variant_id  - the label used in the paper (e.g. "R4-10")
   * aa_seq      - amino-acid sequence (uppercase), or null
-  * dna_seq     - DNA sequence (A/C/G/T), or null
+  * dna_seq     - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
 Respond ONLY with **minified JSON** that matches the schema below.
 NO markdown, no code fences, no commentary.

{debase-0.1.2 → debase-0.1.3}/src/debase/reaction_info_extractor.py RENAMED Viewed

@@ -1055,7 +1055,20 @@ Different campaigns may use different model reactions.
         """Extract text around a given location identifier."""
         location_lower = location.lower()
-        # Search in all pages
+        # Handle compound locations like "Figure 2 caption and Section I"
+        # Extract the first figure/table/scheme reference
+        figure_match = re.search(r"(figure|scheme|table)\s*\d+", location_lower)
+        if figure_match:
+            primary_location = figure_match.group(0)
+            # Try to find this primary location first
+            for page_text in self.all_pages:
+                if primary_location in page_text.lower():
+                    idx = page_text.lower().index(primary_location)
+                    start = max(0, idx - 500)
+                    end = min(len(page_text), idx + 3000)
+                    return page_text[start:end]
+        # Search in all pages for exact match
         for page_text in self.all_pages:
             if location_lower in page_text.lower():
                 # Find the location and extract context around it

{debase-0.1.2 → debase-0.1.3}/src/debase.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.2
+Version: 0.1.3
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team