PyPI - debase - Versions diffs - 0.4.0__tar.gz → 0.4.1__tar.gz - Mend

debase 0.4.0.tar.gz → 0.4.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

{debase-0.4.0/src/debase.egg-info → debase-0.4.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.0
+Version: 0.4.1
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.4.0 → debase-0.4.1}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.4.0"
+__version__ = "0.4.1"

{debase-0.4.0 → debase-0.4.1}/src/debase/substrate_scope_extractor.py RENAMED Viewed

@@ -309,68 +309,28 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
                 log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
-                # Extract multi-page region including the figure and content below
-                # The figure should be between the top of the viewable area and extend to subsequent pages
+                # Extract just the figure with its caption, avoiding excessive white space
                 page_rect = page.rect
-                # Define the region to extract
-                # Extract everything above the caption plus additional content from subsequent pages
-                top_margin = 0  # Start from the very top of the page
-                additional_pages = 2  # Number of additional pages to include
-                left_margin = 0  # Use full page width
-                right_margin = 0
-                # Calculate the figure region for the first page
-                fig_top = top_margin
-                fig_bottom = max(caption_rect.y0 + 150, page_rect.height)  # At least 150px below caption or full page
-                fig_left = left_margin
-                fig_right = page_rect.width - right_margin
-                # Create list to store all page images
-                page_images = []
+                # Calculate the figure region on current page only
+                # Extract from top of page to just below the caption
+                fig_top = 0  # Start from top of page
+                fig_bottom = min(caption_rect.y0 + 200, page_rect.height)  # 200px below caption, but not more than page height
+                fig_left = 0  # Full width
+                fig_right = page_rect.width
-                # Extract first page (from top to bottom)
+                # Extract only the figure region (no additional pages to avoid white space)
                 clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
                 mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
                 pix = page.get_pixmap(clip=clip_rect, matrix=mat)
-                page_images.append(pix)
-                # Extract additional pages if they exist
-                for additional_page_offset in range(1, additional_pages + 1):
-                    next_page_num = page_num + additional_page_offset
-                    if next_page_num < doc.page_count:
-                        next_page = doc.load_page(next_page_num)
-                        next_page_rect = next_page.rect
-                        # Extract full page for additional pages
-                        next_clip_rect = fitz.Rect(0, 0, next_page_rect.width, next_page_rect.height)
-                        next_pix = next_page.get_pixmap(clip=next_clip_rect, matrix=mat)
-                        page_images.append(next_pix)
-                        log.info("Added page %d to multi-page extraction", next_page_num + 1)
-                # Combine all page images vertically
-                if len(page_images) == 1:
-                    # Single page extraction
-                    combined_pix = page_images[0]
-                else:
-                    # Multi-page extraction - combine vertically
-                    total_width = max(pix.width for pix in page_images)
-                    total_height = sum(pix.height for pix in page_images)
-                    # Create a new pixmap to hold the combined image
-                    combined_pix = fitz.Pixmap(fitz.csRGB, fitz.IRect(0, 0, total_width, total_height))
-                    combined_pix.clear_with(255)  # White background
-                    current_y = 0
-                    for pix in page_images:
-                        # Copy each page image to the combined image
-                        combined_pix.copy(pix, fitz.IRect(0, current_y, pix.width, current_y + pix.height))
-                        current_y += pix.height
+                log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
+                         pix.width, pix.height, page_num + 1)
                 # Convert to PNG
-                img_bytes = combined_pix.tobytes("png")
-                log.info("Extracted multi-page figure region: %dx%d pixels from %d pages starting at page %d",
-                         combined_pix.width, combined_pix.height, len(page_images), page_num + 1)
+                img_bytes = pix.tobytes("png")
+                log.info("Extracted figure region: %dx%d pixels from page %d",
+                         pix.width, pix.height, page_num + 1)
                 return b64encode(img_bytes).decode()

{debase-0.4.0 → debase-0.4.1/src/debase.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.0
+Version: 0.4.1
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team