PyPI - debase - Versions diffs - 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

debase-0.6.2-py3-none-any.whl → debase-0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

debase/_version.py +1 -1
debase/enzyme_lineage_extractor.py +278 -163
debase/lineage_format.py +11 -5
debase/reaction_info_extractor.py +209 -36
debase/substrate_scope_extractor.py +157 -56
{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/METADATA +1 -1
debase-0.7.0.dist-info/RECORD +18 -0
debase-0.6.2.dist-info/RECORD +0 -18
{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/WHEEL +0 -0
{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/entry_points.txt +0 -0
{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE +0 -0
{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/top_level.txt +0 -0

debase/substrate_scope_extractor.py CHANGED Viewed

@@ -321,51 +321,81 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: O
     else:
         base_figure_ref = figure_ref
-    # Determine search order based on document hint
-    search_paths = list(pdf_paths)  # Create a copy
+    # Determine which PDFs to search based on document hint
     if document_hint and len(pdf_paths) > 1:
         if document_hint.lower() == "manuscript":
-            # Prioritize manuscript (first PDF)
-            search_paths = [pdf_paths[0]] + pdf_paths[1:]
-            log.info("Prioritizing manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
+            # ONLY search in manuscript (first PDF)
+            search_paths = [pdf_paths[0]]
+            log.info("Searching ONLY in manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
         elif document_hint.lower() == "supplementary":
-            # Prioritize SI (second PDF if available)
-            search_paths = [pdf_paths[1], pdf_paths[0]] if len(pdf_paths) > 1 else pdf_paths
-            log.info("Prioritizing supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
+            # ONLY search in SI (second PDF if available)
+            search_paths = [pdf_paths[1]] if len(pdf_paths) > 1 else pdf_paths
+            log.info("Searching ONLY in supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
+        else:
+            # No specific hint, search all PDFs
+            search_paths = list(pdf_paths)
+    else:
+        # No hint or single PDF, search all available
+        search_paths = list(pdf_paths)
-    for pdf_path in search_paths:
+    # Extract figure number BEFORE the loop (e.g., "Figure 3" -> "3", "Figure S3" -> "S3", "Fig. 3" -> "3")
+    figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '').replace('Fig. ', '').replace('fig. ', '').replace('Fig ', '').replace('fig ', '')
+    for pdf_idx, pdf_path in enumerate(search_paths):
         doc = _open_doc(pdf_path)
         try:
+            log.debug("Searching %s (document %d/%d) with %d pages for figure '%s'",
+                     pdf_path.name, pdf_idx + 1, len(search_paths), doc.page_count, figure_num)
             for page_num in range(doc.page_count):
                 page = doc.load_page(page_num)
                 page_text = page.get_text()
+                # Debug: Check if page contains any mention of the figure
+                if figure_num.lower() in page_text.lower():
+                    log.debug("Page %d contains mention of figure number '%s'", page_num + 1, figure_num)
                 # Check if this page contains the figure caption
                 found = False
                 caption_rect = None
-                # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
-                figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
                 # First try to find using caption hint if provided
                 if caption_hint and len(caption_hint) > 10:
-                    # Try to find the exact caption text
-                    caption_snippet = caption_hint[:100]  # Use first 100 chars
-                    if caption_snippet in page_text:
-                        caption_instances = page.search_for(caption_snippet, quads=False)
-                        if caption_instances:
-                            caption_rect = caption_instances[0]
-                            found = True
-                            log.info("Found figure using caption hint on page %d", page_num + 1)
+                    # Try to find a unique portion of the caption
+                    # Start with longer snippets and work down to shorter ones
+                    caption_lengths = [100, 50, 30, 20]
+                    for length in caption_lengths:
+                        if len(caption_hint) >= length:
+                            snippet = caption_hint[:length]
+                            # Clean up the snippet to avoid partial words
+                            if length < len(caption_hint):
+                                # Try to end at a word boundary
+                                last_space = snippet.rfind(' ')
+                                if last_space > length * 0.6:  # Don't trim too much
+                                    snippet = snippet[:last_space]
+                            if snippet in page_text:
+                                caption_instances = page.search_for(snippet, quads=False)
+                                if caption_instances:
+                                    caption_rect = caption_instances[0]
+                                    found = True
+                                    log.info("Found figure using caption snippet (%d chars) on page %d", len(snippet), page_num + 1)
+                                    break
                 # If not found with hint, look for actual figure captions using regex patterns
                 if not found:
                     caption_patterns = [
-                        rf"^Figure\s+{re.escape(figure_num)}\.",  # "Figure 3." at start of line
-                        rf"^Figure\s+{re.escape(figure_num)}:",   # "Figure 3:" at start of line
+                        # More flexible patterns to match various formats
+                        rf"^Figure\s+{re.escape(figure_num)}[\s\.\:]",  # "Figure 3." or "Figure 3:" or "Figure 3 " at start
+                        rf"^Fig\.?\s*{re.escape(figure_num)}[\s\.\:]",  # "Fig. 3." or "Fig 3:" at start
+                        rf"Figure\s+{re.escape(figure_num)}[\s\.\:]",  # "Figure 3." anywhere
+                        rf"Fig\.?\s*{re.escape(figure_num)}[\s\.\:]",  # "Fig. 3." anywhere
                         rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]",  # "Figure 3 Substrate scope"
-                        rf"Figure\s+{re.escape(figure_num)}\s*\.",  # "Figure 3." anywhere
-                        rf"Figure\s+{re.escape(figure_num)}\s*:",  # "Figure 3:" anywhere
+                        rf"^Fig\.?\s*{re.escape(figure_num)}\s+[A-Z]",  # "Fig. 3 Substrate"
+                        # Special patterns for edge cases
+                        rf"Fig\.\s*{re.escape(figure_num)}\s*\|",  # "Fig. 3 |"
+                        rf"^\s*{re.escape(figure_num)}\.",  # Just "3." at line start (some formats)
                     ]
                     for pattern in caption_patterns:
@@ -381,6 +411,29 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: O
                                 break
                 if not found:
+                    # Try a fuzzy search for lines that look like figure captions
+                    lines = page_text.split('\n')
+                    for i, line in enumerate(lines):
+                        line_stripped = line.strip()
+                        line_lower = line_stripped.lower()
+                        # Check if this looks like a figure caption (starts with fig/figure)
+                        # and NOT an inline reference (which would have text before it)
+                        if (line_lower.startswith(('fig.', 'fig ', 'figure')) and
+                            figure_num.lower() in line_lower and
+                            len(line_stripped) < 200 and
+                            not line_lower.endswith(')')):  # Exclude inline refs like "(Fig. 2)"
+                            # Found a potential caption line
+                            caption_instances = page.search_for(line_stripped[:50], quads=False)
+                            if caption_instances:
+                                caption_rect = caption_instances[0]
+                                found = True
+                                log.info("Found figure via fuzzy search: '%s' on page %d", line_stripped[:50], page_num + 1)
+                                break
+                if not found:
+                    # Skip this page if we didn't find the actual figure caption
                     continue
                 log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
@@ -562,15 +615,29 @@ def _build_caption_index(pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]:
     return caption_index
-def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
+def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000, document_hint: Optional[str] = None) -> str:
     """Extract text around a specific reference using caption index."""
     import re
-    # Build caption index if not already built
-    if not hasattr(_extract_text_around_reference, '_caption_index'):
-        _extract_text_around_reference._caption_index = _build_caption_index(pdf_paths)
+    # Filter PDFs based on document hint BEFORE building caption index
+    search_paths = pdf_paths
+    if document_hint and len(pdf_paths) > 1:
+        if document_hint.lower() == "manuscript":
+            # ONLY search in manuscript (first PDF)
+            search_paths = [pdf_paths[0]]
+            log.info("Text extraction: Searching ONLY in manuscript document for '%s'", ref)
+        elif document_hint.lower() == "supplementary":
+            # ONLY search in SI (second PDF if available)
+            search_paths = [pdf_paths[1]] if len(pdf_paths) > 1 else pdf_paths
+            log.info("Text extraction: Searching ONLY in supplementary document for '%s'", ref)
+    # Build caption index from filtered paths
+    # Use a cache key that includes the document hint
+    cache_key = f"_caption_index_{document_hint or 'all'}"
+    if not hasattr(_extract_text_around_reference, cache_key):
+        setattr(_extract_text_around_reference, cache_key, _build_caption_index(search_paths))
-    caption_index = _extract_text_around_reference._caption_index
+    caption_index = getattr(_extract_text_around_reference, cache_key)
     ref_lower = ref.lower().strip()
     # Try multiple matching strategies
@@ -607,6 +674,8 @@ def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_char
                 if info not in matches:
                     matches.append(info)
+    # No need to filter by document hint here since we already filtered the PDFs
     # Extract text from matches
     extracted_sections = []
     for match in matches:
@@ -1206,16 +1275,34 @@ def identify_iupac_sections(
     debug_dir: str | Path | None = None,
 ) -> List[dict]:
     """Identify sections containing IUPAC names from SI table of contents."""
-    # Extract only SI TOC pages (first 5 pages of SI)
+    # Extract SI TOC pages and scan for compound synthesis sections
     si_toc_text = ""
     if pdf_paths and len(pdf_paths) > 1:
         si_pdf = pdf_paths[1]  # Second PDF is SI
         doc = _open_doc(si_pdf)
         try:
-            for page_num in range(min(5, doc.page_count)):
+            # First get TOC from first 10 pages
+            for page_num in range(min(10, doc.page_count)):
                 page = doc.load_page(page_num)
                 page_text = page.get_text()
                 si_toc_text += f"\n[SI Page {page_num + 1}]\n{page_text}"
+            # Also scan for pages containing "synthesis of" or "characterization" patterns
+            compound_section_pages = []
+            for page_num in range(doc.page_count):
+                page = doc.load_page(page_num)
+                page_text = page.get_text().lower()
+                if any(pattern in page_text for pattern in [
+                    "synthesis of 1", "synthesis of 3", "compound 1", "compound 3",
+                    "characterization data", "nmr data", "1h nmr", "13c nmr"
+                ]):
+                    compound_section_pages.append(page_num + 1)
+            if compound_section_pages:
+                log.info("Found potential compound characterization on SI pages: %s", compound_section_pages[:10])
+                # Add a hint about these pages
+                si_toc_text += f"\n\n[Additional compound characterization detected on pages: {compound_section_pages[:10]}]"
         finally:
             doc.close()
@@ -1685,17 +1772,17 @@ def _extract_text_for_compound_mapping(
                 if page_text:
                     extraction_text += f"\n\n=== Section: '{section_title}' starting from page {page_range} ===\n{page_text}"
             else:
-                # Try title-based extraction as fallback
-                section_text = _extract_sections_by_title(pdf_paths, [section_title], max_chars_per_section=10000)
+                # Try title-based extraction as fallback with more content
+                section_text = _extract_sections_by_title(pdf_paths, [section_title], max_chars_per_section=30000)
                 if section_text:
                     extraction_text += section_text
         if not extraction_text:
-            log.warning("No text extracted from IUPAC sections, falling back to limited text")
-            extraction_text = text_fallback[:30_000]
+            log.warning("No text extracted from IUPAC sections, using full text")
+            extraction_text = text_fallback
     else:
-        # Fallback to limited text
-        extraction_text = text_fallback[:30_000]
+        # Fallback to full text - no limit
+        extraction_text = text_fallback
     return extraction_text
@@ -1739,24 +1826,19 @@ def extract_compound_mappings(
                  len(missing_compounds), sorted(missing_compounds))
         log.info("Expanding search to additional sections...")
-        # Define additional sections that might contain compound definitions
-        additional_sections = [
-            "Engineering strategy",
-            "Screening for benzyl acrylate cyclopropanation",
-            "Evolution campaign",
-            "General procedure",
-            "Experimental procedures",
-            "Materials and methods",
-            "Substrate synthesis"
-        ]
+        # For expanded search, use full manuscript and SI text
+        log.info("Using full manuscript and SI text for comprehensive compound search")
-        # Extract text from additional sections
-        additional_text = _extract_sections_by_title(
-            pdf_paths, additional_sections, max_chars_per_section=5000
-        )
+        # Extract complete text from both PDFs
+        additional_text = ""
+        for i, pdf_path in enumerate(pdf_paths):
+            doc_type = "manuscript" if i == 0 else "supplementary"
+            doc_text = extract_text(pdf_path)
+            additional_text += f"\n\n=== FULL {doc_type.upper()} TEXT ===\n{doc_text}"
+            log.info("Added %d chars from %s", len(doc_text), doc_type)
         if additional_text:
-            log.info("Extracted %d chars from additional sections", len(additional_text))
+            log.info("Total expanded text: %d chars", len(additional_text))
             # Second extraction attempt with expanded text
             expanded_mappings = _extract_compound_mappings_from_text(
@@ -1922,16 +2004,35 @@ def extract_substrate_scope_entries_for_campaign(
     for ref in all_refs:
         if ref and pdf_paths:
-            ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
+            # Get document hint for this reference
+            document_hint = location_map.get(ref, {}).get('document', '')
+            ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000, document_hint=document_hint)
             if ref_text:
                 extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
                 # Extract figure images for this reference (crop page around figure)
                 try:
-                    # Get caption and document hints if available
+                    # Get caption hint if available
                     caption_hint = location_map.get(ref, {}).get('caption', '')
-                    document_hint = location_map.get(ref, {}).get('document', '')
-                    fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
+                    # If we have a good caption, try to extract based on caption pattern
+                    if caption_hint and len(caption_hint) > 20:
+                        # Extract the figure reference from the caption itself
+                        import re
+                        caption_fig_match = re.match(r'((?:Figure|Fig\.?)\s*\d+[a-zA-Z]?)', caption_hint, re.IGNORECASE)
+                        if caption_fig_match:
+                            # Use the figure reference from the caption for more accurate matching
+                            fig_ref_from_caption = caption_fig_match.group(1)
+                            log.info("Campaign %s - using figure reference from caption: '%s' (original: '%s')",
+                                    campaign_id, fig_ref_from_caption, ref)
+                            fig_base64 = extract_figure_image(pdf_paths, fig_ref_from_caption, caption_hint=caption_hint, document_hint=document_hint)
+                        else:
+                            # Fallback to original reference
+                            fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
+                    else:
+                        # No caption hint, use original reference
+                        fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
                     if fig_base64:
                         figure_images[ref] = fig_base64
                         log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)

{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.6.2
+Version: 0.7.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

debase-0.7.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=eraKmkwzA-nS1rqm0gvetysX_jPj7RUdCTZBi3t9g6g,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
+debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
+debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
+debase/enzyme_lineage_extractor.py,sha256=7BGkjp7Pl31etWrkjBw2f2KO3fd_1IJLjUnxvUswXuw,194404
+debase/lineage_format.py,sha256=NhpkeBFE94t50uLAuE47ZP3Ti12XvzPHYt-k-_d7OLA,59730
+debase/reaction_info_extractor.py,sha256=tmL-VBMtRKVy36QMTTLSCy9PU4RLpwaJyPRfc8jIalU,198299
+debase/substrate_scope_extractor.py,sha256=AQFJ-6wvKFVZJ5vTwvYMez_v02W2ic95HQF5shidGFY,129282
+debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
+debase-0.7.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.7.0.dist-info/METADATA,sha256=Wsz_cofQy3NaS_icwm6p0unDOlBknaggce2JQc-4k2c,4047
+debase-0.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.7.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.7.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.7.0.dist-info/RECORD,,

debase-0.6.2.dist-info/RECORD DELETED Viewed

@@ -1,18 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=t771GcmZTaJJGrIex6Ea6Q5pcMqVPIihCdRFRA1dMAM,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
-debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
-debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
-debase/enzyme_lineage_extractor.py,sha256=OXO2jUqAqF0pXrw17oIQERnek1uZ5gsFIuKRz4NMS1o,188556
-debase/lineage_format.py,sha256=YWAP9OhFN3MQWbqk5gguX0C2cCwGvKJAtMq9pG5TJp8,59515
-debase/reaction_info_extractor.py,sha256=kQBxPpzurjHXsHFWE_WM84ArSnc3E8f6xPMJpyTIGnU,188246
-debase/substrate_scope_extractor.py,sha256=hRlt8iWOURmgW4SJHB1Svoh3TTa4fa9YIE8qVUZPnY0,122621
-debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
-debase-0.6.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.6.2.dist-info/METADATA,sha256=gnPvTWvazrsdGrIKX8tA4Wwt8yKYph87POVKF25rkkg,4047
-debase-0.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.6.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.6.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.6.2.dist-info/RECORD,,

{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.6.2.dist-info → debase-0.7.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

debase-0.6.2-py3-none-any.whl → debase-0.7.0-py3-none-any.whl