debase 0.5.0__tar.gz → 0.5.1__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
Files changed (26)
  1. {debase-0.5.0/src/debase.egg-info → debase-0.5.1}/PKG-INFO +1 -1
  2. {debase-0.5.0 → debase-0.5.1}/src/debase/_version.py +1 -1
  3. {debase-0.5.0 → debase-0.5.1}/src/debase/reaction_info_extractor.py +45 -5
  4. {debase-0.5.0 → debase-0.5.1/src/debase.egg-info}/PKG-INFO +1 -1
  5. {debase-0.5.0 → debase-0.5.1}/.gitignore +0 -0
  6. {debase-0.5.0 → debase-0.5.1}/LICENSE +0 -0
  7. {debase-0.5.0 → debase-0.5.1}/MANIFEST.in +0 -0
  8. {debase-0.5.0 → debase-0.5.1}/README.md +0 -0
  9. {debase-0.5.0 → debase-0.5.1}/environment.yml +0 -0
  10. {debase-0.5.0 → debase-0.5.1}/pyproject.toml +0 -0
  11. {debase-0.5.0 → debase-0.5.1}/setup.cfg +0 -0
  12. {debase-0.5.0 → debase-0.5.1}/setup.py +0 -0
  13. {debase-0.5.0 → debase-0.5.1}/src/__init__.py +0 -0
  14. {debase-0.5.0 → debase-0.5.1}/src/debase/__init__.py +0 -0
  15. {debase-0.5.0 → debase-0.5.1}/src/debase/__main__.py +0 -0
  16. {debase-0.5.0 → debase-0.5.1}/src/debase/build_db.py +0 -0
  17. {debase-0.5.0 → debase-0.5.1}/src/debase/cleanup_sequence.py +0 -0
  18. {debase-0.5.0 → debase-0.5.1}/src/debase/enzyme_lineage_extractor.py +0 -0
  19. {debase-0.5.0 → debase-0.5.1}/src/debase/lineage_format.py +0 -0
  20. {debase-0.5.0 → debase-0.5.1}/src/debase/substrate_scope_extractor.py +0 -0
  21. {debase-0.5.0 → debase-0.5.1}/src/debase/wrapper.py +0 -0
  22. {debase-0.5.0 → debase-0.5.1}/src/debase.egg-info/SOURCES.txt +0 -0
  23. {debase-0.5.0 → debase-0.5.1}/src/debase.egg-info/dependency_links.txt +0 -0
  24. {debase-0.5.0 → debase-0.5.1}/src/debase.egg-info/entry_points.txt +0 -0
  25. {debase-0.5.0 → debase-0.5.1}/src/debase.egg-info/requires.txt +0 -0
  26. {debase-0.5.0 → debase-0.5.1}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.5.0"
3
+ __version__ = "0.5.1"
@@ -1195,7 +1195,8 @@ class ReactionExtractor:
1195
1195
  # Create a flexible pattern that handles various spacing and formatting
1196
1196
  # This pattern looks for "Figure" (case insensitive) followed by optional spaces
1197
1197
  # then the figure number, then any of: period, colon, space+capital letter, or end of line
1198
- flexible_pattern = rf"(?i)figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
1198
+ # Also match at the beginning of a line to catch captions
1199
+ flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
1199
1200
 
1200
1201
  LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
1201
1202
  main_figure_num, flexible_pattern)
@@ -1231,11 +1232,17 @@ class ReactionExtractor:
1231
1232
  continue
1232
1233
 
1233
1234
  # Check if the remaining text looks like a caption (contains descriptive words)
1235
+ # Expanded list of caption keywords to be more inclusive
1234
1236
  first_words = remaining_text[:50].lower()
1235
- if not any(word in first_words for word in ['detailed', 'representative', 'shows', 'comparison',
1236
- 'illustrates', 'demonstrates', 'results', 'data',
1237
- 'chromatogram', 'spectra', 'analysis', 'site-directed',
1238
- 'mutagenesis', 'mutants']):
1237
+ caption_keywords = ['detailed', 'representative', 'shows', 'comparison',
1238
+ 'illustrates', 'demonstrates', 'results', 'data',
1239
+ 'chromatogram', 'spectra', 'analysis', 'site-directed',
1240
+ 'mutagenesis', 'mutants', 'evolution', 'directed',
1241
+ 'screening', 'reaction', 'variant', 'enzyme', 'protein',
1242
+ 'activity', 'performance', 'yield', 'selectivity',
1243
+ 'characterization', 'optimization', 'development',
1244
+ 'structure', 'domain', 'crystal', 'model']
1245
+ if not any(word in first_words for word in caption_keywords):
1239
1246
  LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
1240
1247
  continue
1241
1248
 
@@ -1322,6 +1329,39 @@ class ReactionExtractor:
1322
1329
  self._figure_cache.put(cache_key, result)
1323
1330
  return result
1324
1331
 
1332
+ # Fallback: If no caption found, try to find any page that mentions this figure
1333
+ LOGGER.info("No figure caption found for '%s', trying fallback search", ref)
1334
+
1335
+ for doc_idx, doc in enumerate(docs):
1336
+ doc_name = "MS" if doc_idx == 0 else "SI"
1337
+ for page_number in range(doc.page_count):
1338
+ page = doc.load_page(page_number)
1339
+ page_text = page.get_text()
1340
+
1341
+ # Look for any mention of the figure reference
1342
+ if re.search(rf'\b{re.escape(ref)}\b', page_text, re.IGNORECASE):
1343
+ LOGGER.info("Found '%s' mentioned on page %d of %s document (fallback)",
1344
+ ref, page_number + 1, doc_name)
1345
+
1346
+ # Extract the entire page as the figure might be on this page
1347
+ mat = fitz.Matrix(5.0, 5.0) # 5x zoom for better quality
1348
+ pix = page.get_pixmap(matrix=mat)
1349
+ pix = self._ensure_rgb_pixmap(pix)
1350
+ img_bytes = pix.tobytes("png")
1351
+
1352
+ # Save PNG to debug directory if available
1353
+ if self.debug_dir:
1354
+ timestamp = int(time.time())
1355
+ png_file = self.debug_dir / f"fallback_{ref.replace(' ', '_')}_{timestamp}.png"
1356
+ with open(png_file, 'wb') as f:
1357
+ f.write(img_bytes)
1358
+ LOGGER.info("Saved fallback page image to: %s", png_file)
1359
+
1360
+ result = b64encode(img_bytes).decode()
1361
+ # Cache the result
1362
+ self._figure_cache.put(cache_key, result)
1363
+ return result
1364
+
1325
1365
  LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
1326
1366
  return None
1327
1367
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes