debase 0.4.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {debase-0.4.0/src/debase.egg-info → debase-0.4.1}/PKG-INFO +1 -1
- {debase-0.4.0 → debase-0.4.1}/src/debase/_version.py +1 -1
- {debase-0.4.0 → debase-0.4.1}/src/debase/substrate_scope_extractor.py +13 -53
- {debase-0.4.0 → debase-0.4.1/src/debase.egg-info}/PKG-INFO +1 -1
- {debase-0.4.0 → debase-0.4.1}/LICENSE +0 -0
- {debase-0.4.0 → debase-0.4.1}/MANIFEST.in +0 -0
- {debase-0.4.0 → debase-0.4.1}/README.md +0 -0
- {debase-0.4.0 → debase-0.4.1}/pyproject.toml +0 -0
- {debase-0.4.0 → debase-0.4.1}/setup.cfg +0 -0
- {debase-0.4.0 → debase-0.4.1}/setup.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase/__init__.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase/__main__.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase/build_db.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase/enzyme_lineage_extractor.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase/lineage_format.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase/reaction_info_extractor.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase/wrapper.py +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase.egg-info/SOURCES.txt +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.4.0 → debase-0.4.1}/src/debase.egg-info/top_level.txt +0 -0
@@ -309,68 +309,28 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
|
|
309
309
|
|
310
310
|
log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
|
311
311
|
|
312
|
-
# Extract
|
313
|
-
# The figure should be between the top of the viewable area and extend to subsequent pages
|
312
|
+
# Extract just the figure with its caption, avoiding excessive white space
|
314
313
|
page_rect = page.rect
|
315
314
|
|
316
|
-
#
|
317
|
-
# Extract
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
# Calculate the figure region for the first page
|
324
|
-
fig_top = top_margin
|
325
|
-
fig_bottom = max(caption_rect.y0 + 150, page_rect.height) # At least 150px below caption or full page
|
326
|
-
fig_left = left_margin
|
327
|
-
fig_right = page_rect.width - right_margin
|
328
|
-
|
329
|
-
# Create list to store all page images
|
330
|
-
page_images = []
|
315
|
+
# Calculate the figure region on current page only
|
316
|
+
# Extract from top of page to just below the caption
|
317
|
+
fig_top = 0 # Start from top of page
|
318
|
+
fig_bottom = min(caption_rect.y0 + 200, page_rect.height) # 200px below caption, but not more than page height
|
319
|
+
fig_left = 0 # Full width
|
320
|
+
fig_right = page_rect.width
|
331
321
|
|
332
|
-
# Extract
|
322
|
+
# Extract only the figure region (no additional pages to avoid white space)
|
333
323
|
clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
|
334
324
|
mat = fitz.Matrix(2, 2) # 2x zoom for better quality
|
335
325
|
pix = page.get_pixmap(clip=clip_rect, matrix=mat)
|
336
|
-
page_images.append(pix)
|
337
326
|
|
338
|
-
|
339
|
-
|
340
|
-
next_page_num = page_num + additional_page_offset
|
341
|
-
if next_page_num < doc.page_count:
|
342
|
-
next_page = doc.load_page(next_page_num)
|
343
|
-
next_page_rect = next_page.rect
|
344
|
-
|
345
|
-
# Extract full page for additional pages
|
346
|
-
next_clip_rect = fitz.Rect(0, 0, next_page_rect.width, next_page_rect.height)
|
347
|
-
next_pix = next_page.get_pixmap(clip=next_clip_rect, matrix=mat)
|
348
|
-
page_images.append(next_pix)
|
349
|
-
log.info("Added page %d to multi-page extraction", next_page_num + 1)
|
350
|
-
|
351
|
-
# Combine all page images vertically
|
352
|
-
if len(page_images) == 1:
|
353
|
-
# Single page extraction
|
354
|
-
combined_pix = page_images[0]
|
355
|
-
else:
|
356
|
-
# Multi-page extraction - combine vertically
|
357
|
-
total_width = max(pix.width for pix in page_images)
|
358
|
-
total_height = sum(pix.height for pix in page_images)
|
359
|
-
|
360
|
-
# Create a new pixmap to hold the combined image
|
361
|
-
combined_pix = fitz.Pixmap(fitz.csRGB, fitz.IRect(0, 0, total_width, total_height))
|
362
|
-
combined_pix.clear_with(255) # White background
|
363
|
-
|
364
|
-
current_y = 0
|
365
|
-
for pix in page_images:
|
366
|
-
# Copy each page image to the combined image
|
367
|
-
combined_pix.copy(pix, fitz.IRect(0, current_y, pix.width, current_y + pix.height))
|
368
|
-
current_y += pix.height
|
327
|
+
log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
|
328
|
+
pix.width, pix.height, page_num + 1)
|
369
329
|
|
370
330
|
# Convert to PNG
|
371
|
-
img_bytes =
|
372
|
-
log.info("Extracted
|
373
|
-
|
331
|
+
img_bytes = pix.tobytes("png")
|
332
|
+
log.info("Extracted figure region: %dx%d pixels from page %d",
|
333
|
+
pix.width, pix.height, page_num + 1)
|
374
334
|
|
375
335
|
return b64encode(img_bytes).decode()
|
376
336
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|