PyPI - debase - Versions diffs - 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

debase 0.4.5 (py3-none-any.whl) → 0.5.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

debase/_version.py +1 -1
debase/cleanup_sequence.py +123 -0
debase/enzyme_lineage_extractor.py +243 -309
debase/reaction_info_extractor.py +192 -68
{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/METADATA +1 -1
debase-0.5.1.dist-info/RECORD +16 -0
debase-0.4.5.dist-info/RECORD +0 -16
{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/WHEEL +0 -0
{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/entry_points.txt +0 -0
{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/licenses/LICENSE +0 -0
{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/top_level.txt +0 -0

debase/reaction_info_extractor.py CHANGED Viewed

@@ -54,11 +54,11 @@ class Config:
     """Centralised tunables so tests can override them easily."""
     model_name: str = "gemini-2.5-flash"
-    location_temperature: float = 0.2
+    location_temperature: float = 0.0
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 12288  # Increased 3x from 4096
+    max_tokens: int = 12288
     pdf_cache_size: int = 8
     retries: int = 2
@@ -778,50 +778,62 @@ class ReactionExtractor:
     # ------------------------------------------------------------------
     def _collect_captions_and_titles(self) -> str:
-        # Pattern to match Table or Figure with optional leading whitespace
+        # Pattern to match Table or Figure with optional leading whitespace and page numbers
         # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
-        # Also handles cases where there's whitespace before the caption
-        cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
+        # Also handles cases where there's whitespace or page numbers before the caption
+        cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
         captions: List[str] = []
-        # Collect from all pages
-        all_text = "\n".join(self.all_pages)
-        # Find all figure/table captions with more context
-        for match in cap_pattern.finditer(all_text):
-            caption_start = match.start()
-            # Include some context before the caption (up to 200 chars)
-            context_start = max(0, caption_start - 200)
-            # Find the start of the sentence/paragraph before the caption
-            context_text = all_text[context_start:caption_start]
-            last_period = context_text.rfind('.')
-            if last_period != -1:
-                context_start = context_start + last_period + 1
-            # For tables, include much more content after the caption to show actual table data
-            # For figures, include more content to ensure complete captions
-            is_table = 'table' in match.group(1).lower()
-            max_chars = 8000 if is_table else 5000
-            # Get up to max_chars or until double newline (but ensure we get complete caption)
-            # First, try to find the end of the caption sentence
-            caption_end = caption_start
-            period_pos = all_text.find('. ', caption_start)
-            if period_pos != -1 and period_pos < caption_start + 1000:
-                # Include at least to the end of the caption sentence
-                caption_end = period_pos + 1
-            # Then extend to include more context or until double newline
-            double_newline_pos = all_text.find("\n\n", caption_end)
-            if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
-                caption_end = caption_start + max_chars
-            else:
-                caption_end = double_newline_pos
-            # Include the context and full caption with table content
-            full_caption = all_text[context_start:caption_end].strip()
-            captions.append(full_caption)
+        # Process each page individually to avoid TOC entries
+        for page_idx, page_text in enumerate(self.all_pages):
+            # Skip if this looks like a TOC page
+            if self._is_toc_page(page_text):
+                LOGGER.debug("Skipping TOC page %d for caption collection", page_idx + 1)
+                continue
+            # Find all figure/table captions with more context
+            for match in cap_pattern.finditer(page_text):
+                caption_line = match.group(0).strip()
+                # Skip if this looks like a TOC entry (has page number at end or dots)
+                if re.search(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$', caption_line):
+                    LOGGER.debug("Skipping TOC-style entry: %s", caption_line[:50])
+                    continue
+                caption_start = match.start()
+                # For tables, include much more content after the caption to show actual table data
+                # For figures, include substantial content to show what the figure contains
+                is_table = 'table' in match.group(1).lower()
+                # Increase context for figures to ensure we capture descriptive text
+                max_chars = 8000 if is_table else 3000
+                # Get context including text before and after the caption
+                # Include some text before to help identify the location
+                context_before = max(0, caption_start - 200)
+                context_after = min(len(page_text), caption_start + max_chars)
+                # Extract the full context
+                full_context = page_text[context_before:context_after].strip()
+                # Find the actual caption text (not just the "Figure X" part)
+                # Look for text after the figure/table identifier that forms the caption
+                caption_text = page_text[caption_start:context_after]
+                # Try to find the end of the caption (usually ends with a period before next paragraph)
+                caption_end_match = re.search(r'^[^\n]+\.[^\n]*(?:\n\n|\n(?=[A-Z]))', caption_text)
+                if caption_end_match:
+                    actual_caption = caption_text[:caption_end_match.end()].strip()
+                else:
+                    # Fallback: take first few lines
+                    lines = caption_text.split('\n')
+                    actual_caption = '\n'.join(lines[:3]).strip()
+                # Ensure we have meaningful content, not just the figure number
+                if len(actual_caption) > 20:  # More than just "Figure S23."
+                    # For the prompt, include the full context to help identify what's in the figure
+                    caption_with_context = f"{actual_caption}\n\n[Context around figure/table:]\n{full_context}"
+                    captions.append(caption_with_context)
         # Also look for SI section titles
         si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
@@ -1058,6 +1070,39 @@ class ReactionExtractor:
     # 6.2 Figure / Table context helpers
     # ------------------------------------------------------------------
+    def _is_toc_page(self, page_text: str) -> bool:
+        """Detect if a page is a Table of Contents page."""
+        # Look for common TOC indicators
+        toc_indicators = [
+            "table of contents",
+            "contents",
+            r"\.{5,}",  # Multiple dots (common in TOCs)
+            r"\d+\s*\n\s*\d+\s*\n\s*\d+",  # Multiple page numbers in sequence
+        ]
+        # Count how many TOC-like patterns we find
+        toc_score = 0
+        text_lower = page_text.lower()
+        # Check for explicit TOC title
+        if "table of contents" in text_lower or (
+            "contents" in text_lower and text_lower.index("contents") < 200
+        ):
+            toc_score += 3
+        # Check for multiple figure/table references with page numbers
+        figure_with_page = re.findall(r'figure\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
+        table_with_page = re.findall(r'table\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
+        if len(figure_with_page) + len(table_with_page) > 5:
+            toc_score += 2
+        # Check for many dotted lines
+        if len(re.findall(r'\.{5,}', page_text)) > 3:
+            toc_score += 1
+        return toc_score >= 2
     def _page_with_reference(self, ref_id: str) -> Optional[str]:
         for page in self.all_pages:
             if ref_id.lower() in page.lower():
@@ -1131,9 +1176,14 @@ class ReactionExtractor:
                 LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
                            page_number + 1, doc_name, len(page_text))
-                # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
-                # For subfigures like "Figure 1C", extract the main figure "Figure 1"
-                figure_num = ref.replace('Figure ', '').replace('figure ', '')
+                # Skip Table of Contents pages
+                if self._is_toc_page(page_text):
+                    LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
+                    continue
+                # Look for figure caption pattern more flexibly
+                # Normalize the reference to handle variations
+                figure_num = ref.replace('Figure', '').replace('figure', '').strip()
                 # Extract main figure number from subfigure (e.g., "1C" -> "1")
                 main_figure_num = re.match(r'^(\d+)', figure_num)
@@ -1142,33 +1192,69 @@ class ReactionExtractor:
                 else:
                     main_figure_num = figure_num
-                caption_patterns = [
-                    rf"^Figure\s+{re.escape(main_figure_num)}\.",  # "Figure 1."
-                    rf"^Figure\s+{re.escape(main_figure_num)}:",   # "Figure 1:"
-                    rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]",  # "Figure 1 Performance"
-                    rf"^Figure\s+{re.escape(main_figure_num)}\s*$",  # "Figure 1" at end of line
-                    rf"Figure\s+{re.escape(main_figure_num)}\s*\.",  # "Figure 1." anywhere in line
-                    rf"Figure\s+{re.escape(main_figure_num)}\s*:",  # "Figure 1:" anywhere in line
-                ]
+                # Create a flexible pattern that handles various spacing and formatting
+                # This pattern looks for "Figure" (case insensitive) followed by optional spaces
+                # then the figure number, then any of: period, colon, space+capital letter, or end of line
+                # Also match at the beginning of a line to catch captions
+                flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
-                LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
-                           main_figure_num, ref, caption_patterns)
+                LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
+                           main_figure_num, flexible_pattern)
                 caption_found = False
                 cap_rect = None
-                for pattern in caption_patterns:
-                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
-                    if matches:
-                        LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
-                        # Found actual figure caption, get its position
-                        caption_text = matches.group(0)
-                        text_instances = page.search_for(caption_text, quads=False)
-                        if text_instances:
-                            cap_rect = text_instances[0]
-                            caption_found = True
-                            LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
-                            break
+                # Search for all matches of the flexible pattern
+                for match in re.finditer(flexible_pattern, page_text, re.MULTILINE):
+                    LOGGER.debug("Found potential figure caption: %s at position %d", match.group(0), match.start())
+                    # Check if this is likely an actual caption (not just a reference)
+                    match_start = match.start()
+                    match_end = match.end()
+                    # Get surrounding context
+                    context_start = max(0, match_start - 50)
+                    context_end = min(len(page_text), match_end + 100)
+                    context = page_text[context_start:context_end]
+                    # Check if this looks like a real caption (not just a reference)
+                    # Look for words that typically precede figure references
+                    preceding_text = page_text[max(0, match_start-20):match_start].lower()
+                    if any(word in preceding_text for word in ['see ', 'in ', 'from ', 'shown in ', 'refer to ']):
+                        LOGGER.debug("Skipping reference preceded by: %s", preceding_text.strip())
+                        continue
+                    # Check if there's descriptive text after the figure number
+                    remaining_text = page_text[match_end:match_end+100].strip()
+                    # For actual captions, there should be substantial descriptive text
+                    if len(remaining_text) < 20:
+                        LOGGER.debug("Skipping potential reference: insufficient text after (%d chars)", len(remaining_text))
+                        continue
+                    # Check if the remaining text looks like a caption (contains descriptive words)
+                    # Expanded list of caption keywords to be more inclusive
+                    first_words = remaining_text[:50].lower()
+                    caption_keywords = ['detailed', 'representative', 'shows', 'comparison',
+                                      'illustrates', 'demonstrates', 'results', 'data',
+                                      'chromatogram', 'spectra', 'analysis', 'site-directed',
+                                      'mutagenesis', 'mutants', 'evolution', 'directed',
+                                      'screening', 'reaction', 'variant', 'enzyme', 'protein',
+                                      'activity', 'performance', 'yield', 'selectivity',
+                                      'characterization', 'optimization', 'development',
+                                      'structure', 'domain', 'crystal', 'model']
+                    if not any(word in first_words for word in caption_keywords):
+                        LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
+                        continue
+                    # Found actual figure caption, get its position
+                    caption_text = match.group(0)
+                    text_instances = page.search_for(caption_text, quads=False)
+                    if text_instances:
+                        cap_rect = text_instances[0]
+                        caption_found = True
+                        LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
+                                  ref, caption_text, remaining_text[:50])
+                        break
                 if not caption_found:
                     # Debug: show what figure-related text is actually on this page
@@ -1243,6 +1329,39 @@ class ReactionExtractor:
                     self._figure_cache.put(cache_key, result)
                     return result
+        # Fallback: If no caption found, try to find any page that mentions this figure
+        LOGGER.info("No figure caption found for '%s', trying fallback search", ref)
+        for doc_idx, doc in enumerate(docs):
+            doc_name = "MS" if doc_idx == 0 else "SI"
+            for page_number in range(doc.page_count):
+                page = doc.load_page(page_number)
+                page_text = page.get_text()
+                # Look for any mention of the figure reference
+                if re.search(rf'\b{re.escape(ref)}\b', page_text, re.IGNORECASE):
+                    LOGGER.info("Found '%s' mentioned on page %d of %s document (fallback)",
+                               ref, page_number + 1, doc_name)
+                    # Extract the entire page as the figure might be on this page
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat)
+                    pix = self._ensure_rgb_pixmap(pix)
+                    img_bytes = pix.tobytes("png")
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"fallback_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved fallback page image to: %s", png_file)
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
         LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
         return None
@@ -1258,6 +1377,11 @@ class ReactionExtractor:
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
+                # Skip Table of Contents pages
+                if self._is_toc_page(page_text):
+                    LOGGER.debug("Skipping TOC page %d in _find_pages_with_reference", page_number + 1)
+                    continue
                 # Check for actual figure caption first
                 if ref.lower().startswith('figure'):
                     figure_num = ref.replace('Figure ', '').replace('figure ', '')

{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.5
+Version: 0.5.1
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

debase-0.5.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=iDuv12GVbaAFXFufv3yqwn-3Hwv9Kua4nJZQ-gUNJXw,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
+debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
+debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
+debase/reaction_info_extractor.py,sha256=9QXbtp0RSP6QMqQ_azBWDceGIqiw2JPCg3eJ0Ba_lxA,167849
+debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
+debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
+debase-0.5.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.5.1.dist-info/METADATA,sha256=GoaBFl0kdh8dtrApBTMoLWH6fe5GYLiSYC5JrohbPcI,4047
+debase-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.5.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.5.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.5.1.dist-info/RECORD,,

debase-0.4.5.dist-info/RECORD DELETED Viewed

@@ -1,16 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=aQmjMn3LxbvC1lgsl7QAKTZYk9rZlRbUZ72_LxKEuIM,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
-debase/enzyme_lineage_extractor.py,sha256=hPA3r9kEQ0vy4ia9t4lj5m63jJtkslAM-ySsW4WgIVs,170770
-debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
-debase/reaction_info_extractor.py,sha256=bnAbPtVr52H_GZg0NVdCksHZfAtYuh4WD3RCAhRgU7Y,160833
-debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
-debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
-debase-0.4.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.4.5.dist-info/METADATA,sha256=PaDILdF_IA8qJAF4WHVu0sz1V9ihL_6pJUdoMFa9nRg,4047
-debase-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.4.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.4.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.4.5.dist-info/RECORD,,

{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.4.5.dist-info → debase-0.5.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl

debase 0.4.5 (py3-none-any.whl) → 0.5.1 (py3-none-any.whl)