debase 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/reaction_info_extractor.py +45 -5
- {debase-0.5.0.dist-info → debase-0.5.1.dist-info}/METADATA +1 -1
- {debase-0.5.0.dist-info → debase-0.5.1.dist-info}/RECORD +8 -8
- {debase-0.5.0.dist-info → debase-0.5.1.dist-info}/WHEEL +0 -0
- {debase-0.5.0.dist-info → debase-0.5.1.dist-info}/entry_points.txt +0 -0
- {debase-0.5.0.dist-info → debase-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {debase-0.5.0.dist-info → debase-0.5.1.dist-info}/top_level.txt +0 -0
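debase/_version.py
CHANGED
(the +1 -1 hunk for this file is not included in this extract)
debase/reaction_info_extractor.py
CHANGED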
@@ -1195,7 +1195,8 @@ class ReactionExtractor:
|
|
1195
1195
|
# Create a flexible pattern that handles various spacing and formatting
|
1196
1196
|
# This pattern looks for "Figure" (case insensitive) followed by optional spaces
|
1197
1197
|
# then the figure number, then any of: period, colon, space+capital letter, or end of line
|
1198
|
-
|
1198
|
+
# Also match at the beginning of a line to catch captions
|
1199
|
+
flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
|
1199
1200
|
|
1200
1201
|
LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
|
1201
1202
|
main_figure_num, flexible_pattern)
|
@@ -1231,11 +1232,17 @@ class ReactionExtractor:
|
|
1231
1232
|
continue
|
1232
1233
|
|
1233
1234
|
# Check if the remaining text looks like a caption (contains descriptive words)
|
1235
|
+
# Expanded list of caption keywords to be more inclusive
|
1234
1236
|
first_words = remaining_text[:50].lower()
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1238
|
-
|
1237
|
+
caption_keywords = ['detailed', 'representative', 'shows', 'comparison',
|
1238
|
+
'illustrates', 'demonstrates', 'results', 'data',
|
1239
|
+
'chromatogram', 'spectra', 'analysis', 'site-directed',
|
1240
|
+
'mutagenesis', 'mutants', 'evolution', 'directed',
|
1241
|
+
'screening', 'reaction', 'variant', 'enzyme', 'protein',
|
1242
|
+
'activity', 'performance', 'yield', 'selectivity',
|
1243
|
+
'characterization', 'optimization', 'development',
|
1244
|
+
'structure', 'domain', 'crystal', 'model']
|
1245
|
+
if not any(word in first_words for word in caption_keywords):
|
1239
1246
|
LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
|
1240
1247
|
continue
|
1241
1248
|
|
@@ -1322,6 +1329,39 @@ class ReactionExtractor:
|
|
1322
1329
|
self._figure_cache.put(cache_key, result)
|
1323
1330
|
return result
|
1324
1331
|
|
1332
|
+
# Fallback: If no caption found, try to find any page that mentions this figure
|
1333
|
+
LOGGER.info("No figure caption found for '%s', trying fallback search", ref)
|
1334
|
+
|
1335
|
+
for doc_idx, doc in enumerate(docs):
|
1336
|
+
doc_name = "MS" if doc_idx == 0 else "SI"
|
1337
|
+
for page_number in range(doc.page_count):
|
1338
|
+
page = doc.load_page(page_number)
|
1339
|
+
page_text = page.get_text()
|
1340
|
+
|
1341
|
+
# Look for any mention of the figure reference
|
1342
|
+
if re.search(rf'\b{re.escape(ref)}\b', page_text, re.IGNORECASE):
|
1343
|
+
LOGGER.info("Found '%s' mentioned on page %d of %s document (fallback)",
|
1344
|
+
ref, page_number + 1, doc_name)
|
1345
|
+
|
1346
|
+
# Extract the entire page as the figure might be on this page
|
1347
|
+
mat = fitz.Matrix(5.0, 5.0) # 5x zoom for better quality
|
1348
|
+
pix = page.get_pixmap(matrix=mat)
|
1349
|
+
pix = self._ensure_rgb_pixmap(pix)
|
1350
|
+
img_bytes = pix.tobytes("png")
|
1351
|
+
|
1352
|
+
# Save PNG to debug directory if available
|
1353
|
+
if self.debug_dir:
|
1354
|
+
timestamp = int(time.time())
|
1355
|
+
png_file = self.debug_dir / f"fallback_{ref.replace(' ', '_')}_{timestamp}.png"
|
1356
|
+
with open(png_file, 'wb') as f:
|
1357
|
+
f.write(img_bytes)
|
1358
|
+
LOGGER.info("Saved fallback page image to: %s", png_file)
|
1359
|
+
|
1360
|
+
result = b64encode(img_bytes).decode()
|
1361
|
+
# Cache the result
|
1362
|
+
self._figure_cache.put(cache_key, result)
|
1363
|
+
return result
|
1364
|
+
|
1325
1365
|
LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
|
1326
1366
|
return None
|
1327
1367
|
|
{debase-0.5.0.dist-info → debase-0.5.1.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
2
|
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=
|
3
|
+
debase/_version.py,sha256=iDuv12GVbaAFXFufv3yqwn-3Hwv9Kua4nJZQ-gUNJXw,49
|
4
4
|
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
5
|
debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
|
6
6
|
debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
|
7
7
|
debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
|
8
|
-
debase/reaction_info_extractor.py,sha256=
|
8
|
+
debase/reaction_info_extractor.py,sha256=9QXbtp0RSP6QMqQ_azBWDceGIqiw2JPCg3eJ0Ba_lxA,167849
|
9
9
|
debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
|
10
10
|
debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
|
11
|
-
debase-0.5.
|
12
|
-
debase-0.5.
|
13
|
-
debase-0.5.
|
14
|
-
debase-0.5.
|
15
|
-
debase-0.5.
|
16
|
-
debase-0.5.
|
11
|
+
debase-0.5.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
+
debase-0.5.1.dist-info/METADATA,sha256=GoaBFl0kdh8dtrApBTMoLWH6fe5GYLiSYC5JrohbPcI,4047
|
13
|
+
debase-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
+
debase-0.5.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
+
debase-0.5.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
+
debase-0.5.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|