debase 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +123 -0
- debase/enzyme_lineage_extractor.py +243 -309
- debase/reaction_info_extractor.py +192 -68
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/METADATA +1 -1
- debase-0.5.1.dist-info/RECORD +16 -0
- debase-0.4.5.dist-info/RECORD +0 -16
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/WHEEL +0 -0
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/entry_points.txt +0 -0
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/top_level.txt +0 -0
@@ -54,11 +54,11 @@ class Config:
|
|
54
54
|
"""Centralised tunables so tests can override them easily."""
|
55
55
|
|
56
56
|
model_name: str = "gemini-2.5-flash"
|
57
|
-
location_temperature: float = 0.
|
57
|
+
location_temperature: float = 0.0
|
58
58
|
extract_temperature: float = 0.0
|
59
59
|
model_reaction_temperature: float = 0.0
|
60
60
|
top_p: float = 1.0
|
61
|
-
max_tokens: int = 12288
|
61
|
+
max_tokens: int = 12288
|
62
62
|
pdf_cache_size: int = 8
|
63
63
|
retries: int = 2
|
64
64
|
|
@@ -778,50 +778,62 @@ class ReactionExtractor:
|
|
778
778
|
# ------------------------------------------------------------------
|
779
779
|
|
780
780
|
def _collect_captions_and_titles(self) -> str:
|
781
|
-
# Pattern to match Table or Figure with optional leading whitespace
|
781
|
+
# Pattern to match Table or Figure with optional leading whitespace and page numbers
|
782
782
|
# This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
|
783
|
-
# Also handles cases where there's whitespace before the caption
|
784
|
-
cap_pattern = re.compile(r"
|
783
|
+
# Also handles cases where there's whitespace or page numbers before the caption
|
784
|
+
cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
|
785
785
|
captions: List[str] = []
|
786
786
|
|
787
|
-
#
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
#
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
# Include
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
787
|
+
# Process each page individually to avoid TOC entries
|
788
|
+
for page_idx, page_text in enumerate(self.all_pages):
|
789
|
+
# Skip if this looks like a TOC page
|
790
|
+
if self._is_toc_page(page_text):
|
791
|
+
LOGGER.debug("Skipping TOC page %d for caption collection", page_idx + 1)
|
792
|
+
continue
|
793
|
+
|
794
|
+
# Find all figure/table captions with more context
|
795
|
+
for match in cap_pattern.finditer(page_text):
|
796
|
+
caption_line = match.group(0).strip()
|
797
|
+
|
798
|
+
# Skip if this looks like a TOC entry (has page number at end or dots)
|
799
|
+
if re.search(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$', caption_line):
|
800
|
+
LOGGER.debug("Skipping TOC-style entry: %s", caption_line[:50])
|
801
|
+
continue
|
802
|
+
|
803
|
+
caption_start = match.start()
|
804
|
+
|
805
|
+
# For tables, include much more content after the caption to show actual table data
|
806
|
+
# For figures, include substantial content to show what the figure contains
|
807
|
+
is_table = 'table' in match.group(1).lower()
|
808
|
+
# Increase context for figures to ensure we capture descriptive text
|
809
|
+
max_chars = 8000 if is_table else 3000
|
810
|
+
|
811
|
+
# Get context including text before and after the caption
|
812
|
+
# Include some text before to help identify the location
|
813
|
+
context_before = max(0, caption_start - 200)
|
814
|
+
context_after = min(len(page_text), caption_start + max_chars)
|
815
|
+
|
816
|
+
# Extract the full context
|
817
|
+
full_context = page_text[context_before:context_after].strip()
|
818
|
+
|
819
|
+
# Find the actual caption text (not just the "Figure X" part)
|
820
|
+
# Look for text after the figure/table identifier that forms the caption
|
821
|
+
caption_text = page_text[caption_start:context_after]
|
822
|
+
|
823
|
+
# Try to find the end of the caption (usually ends with a period before next paragraph)
|
824
|
+
caption_end_match = re.search(r'^[^\n]+\.[^\n]*(?:\n\n|\n(?=[A-Z]))', caption_text)
|
825
|
+
if caption_end_match:
|
826
|
+
actual_caption = caption_text[:caption_end_match.end()].strip()
|
827
|
+
else:
|
828
|
+
# Fallback: take first few lines
|
829
|
+
lines = caption_text.split('\n')
|
830
|
+
actual_caption = '\n'.join(lines[:3]).strip()
|
831
|
+
|
832
|
+
# Ensure we have meaningful content, not just the figure number
|
833
|
+
if len(actual_caption) > 20: # More than just "Figure S23."
|
834
|
+
# For the prompt, include the full context to help identify what's in the figure
|
835
|
+
caption_with_context = f"{actual_caption}\n\n[Context around figure/table:]\n{full_context}"
|
836
|
+
captions.append(caption_with_context)
|
825
837
|
|
826
838
|
# Also look for SI section titles
|
827
839
|
si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
|
@@ -1058,6 +1070,39 @@ class ReactionExtractor:
|
|
1058
1070
|
# 6.2 Figure / Table context helpers
|
1059
1071
|
# ------------------------------------------------------------------
|
1060
1072
|
|
1073
|
+
def _is_toc_page(self, page_text: str) -> bool:
|
1074
|
+
"""Detect if a page is a Table of Contents page."""
|
1075
|
+
# Look for common TOC indicators
|
1076
|
+
toc_indicators = [
|
1077
|
+
"table of contents",
|
1078
|
+
"contents",
|
1079
|
+
r"\.{5,}", # Multiple dots (common in TOCs)
|
1080
|
+
r"\d+\s*\n\s*\d+\s*\n\s*\d+", # Multiple page numbers in sequence
|
1081
|
+
]
|
1082
|
+
|
1083
|
+
# Count how many TOC-like patterns we find
|
1084
|
+
toc_score = 0
|
1085
|
+
text_lower = page_text.lower()
|
1086
|
+
|
1087
|
+
# Check for explicit TOC title
|
1088
|
+
if "table of contents" in text_lower or (
|
1089
|
+
"contents" in text_lower and text_lower.index("contents") < 200
|
1090
|
+
):
|
1091
|
+
toc_score += 3
|
1092
|
+
|
1093
|
+
# Check for multiple figure/table references with page numbers
|
1094
|
+
figure_with_page = re.findall(r'figure\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
|
1095
|
+
table_with_page = re.findall(r'table\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
|
1096
|
+
|
1097
|
+
if len(figure_with_page) + len(table_with_page) > 5:
|
1098
|
+
toc_score += 2
|
1099
|
+
|
1100
|
+
# Check for many dotted lines
|
1101
|
+
if len(re.findall(r'\.{5,}', page_text)) > 3:
|
1102
|
+
toc_score += 1
|
1103
|
+
|
1104
|
+
return toc_score >= 2
|
1105
|
+
|
1061
1106
|
def _page_with_reference(self, ref_id: str) -> Optional[str]:
|
1062
1107
|
for page in self.all_pages:
|
1063
1108
|
if ref_id.lower() in page.lower():
|
@@ -1131,9 +1176,14 @@ class ReactionExtractor:
|
|
1131
1176
|
LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
|
1132
1177
|
page_number + 1, doc_name, len(page_text))
|
1133
1178
|
|
1134
|
-
#
|
1135
|
-
|
1136
|
-
|
1179
|
+
# Skip Table of Contents pages
|
1180
|
+
if self._is_toc_page(page_text):
|
1181
|
+
LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
|
1182
|
+
continue
|
1183
|
+
|
1184
|
+
# Look for figure caption pattern more flexibly
|
1185
|
+
# Normalize the reference to handle variations
|
1186
|
+
figure_num = ref.replace('Figure', '').replace('figure', '').strip()
|
1137
1187
|
|
1138
1188
|
# Extract main figure number from subfigure (e.g., "1C" -> "1")
|
1139
1189
|
main_figure_num = re.match(r'^(\d+)', figure_num)
|
@@ -1142,33 +1192,69 @@ class ReactionExtractor:
|
|
1142
1192
|
else:
|
1143
1193
|
main_figure_num = figure_num
|
1144
1194
|
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
rf"Figure\s+{re.escape(main_figure_num)}\s*\.", # "Figure 1." anywhere in line
|
1151
|
-
rf"Figure\s+{re.escape(main_figure_num)}\s*:", # "Figure 1:" anywhere in line
|
1152
|
-
]
|
1195
|
+
# Create a flexible pattern that handles various spacing and formatting
|
1196
|
+
# This pattern looks for "Figure" (case insensitive) followed by optional spaces
|
1197
|
+
# then the figure number, then any of: period, colon, space+capital letter, or end of line
|
1198
|
+
# Also match at the beginning of a line to catch captions
|
1199
|
+
flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
|
1153
1200
|
|
1154
|
-
LOGGER.debug("Looking for
|
1155
|
-
main_figure_num,
|
1201
|
+
LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
|
1202
|
+
main_figure_num, flexible_pattern)
|
1156
1203
|
|
1157
1204
|
caption_found = False
|
1158
1205
|
cap_rect = None
|
1159
1206
|
|
1160
|
-
for
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1207
|
+
# Search for all matches of the flexible pattern
|
1208
|
+
for match in re.finditer(flexible_pattern, page_text, re.MULTILINE):
|
1209
|
+
LOGGER.debug("Found potential figure caption: %s at position %d", match.group(0), match.start())
|
1210
|
+
# Check if this is likely an actual caption (not just a reference)
|
1211
|
+
match_start = match.start()
|
1212
|
+
match_end = match.end()
|
1213
|
+
|
1214
|
+
# Get surrounding context
|
1215
|
+
context_start = max(0, match_start - 50)
|
1216
|
+
context_end = min(len(page_text), match_end + 100)
|
1217
|
+
context = page_text[context_start:context_end]
|
1218
|
+
|
1219
|
+
# Check if this looks like a real caption (not just a reference)
|
1220
|
+
# Look for words that typically precede figure references
|
1221
|
+
preceding_text = page_text[max(0, match_start-20):match_start].lower()
|
1222
|
+
if any(word in preceding_text for word in ['see ', 'in ', 'from ', 'shown in ', 'refer to ']):
|
1223
|
+
LOGGER.debug("Skipping reference preceded by: %s", preceding_text.strip())
|
1224
|
+
continue
|
1225
|
+
|
1226
|
+
# Check if there's descriptive text after the figure number
|
1227
|
+
remaining_text = page_text[match_end:match_end+100].strip()
|
1228
|
+
|
1229
|
+
# For actual captions, there should be substantial descriptive text
|
1230
|
+
if len(remaining_text) < 20:
|
1231
|
+
LOGGER.debug("Skipping potential reference: insufficient text after (%d chars)", len(remaining_text))
|
1232
|
+
continue
|
1233
|
+
|
1234
|
+
# Check if the remaining text looks like a caption (contains descriptive words)
|
1235
|
+
# Expanded list of caption keywords to be more inclusive
|
1236
|
+
first_words = remaining_text[:50].lower()
|
1237
|
+
caption_keywords = ['detailed', 'representative', 'shows', 'comparison',
|
1238
|
+
'illustrates', 'demonstrates', 'results', 'data',
|
1239
|
+
'chromatogram', 'spectra', 'analysis', 'site-directed',
|
1240
|
+
'mutagenesis', 'mutants', 'evolution', 'directed',
|
1241
|
+
'screening', 'reaction', 'variant', 'enzyme', 'protein',
|
1242
|
+
'activity', 'performance', 'yield', 'selectivity',
|
1243
|
+
'characterization', 'optimization', 'development',
|
1244
|
+
'structure', 'domain', 'crystal', 'model']
|
1245
|
+
if not any(word in first_words for word in caption_keywords):
|
1246
|
+
LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
|
1247
|
+
continue
|
1248
|
+
|
1249
|
+
# Found actual figure caption, get its position
|
1250
|
+
caption_text = match.group(0)
|
1251
|
+
text_instances = page.search_for(caption_text, quads=False)
|
1252
|
+
if text_instances:
|
1253
|
+
cap_rect = text_instances[0]
|
1254
|
+
caption_found = True
|
1255
|
+
LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
|
1256
|
+
ref, caption_text, remaining_text[:50])
|
1257
|
+
break
|
1172
1258
|
|
1173
1259
|
if not caption_found:
|
1174
1260
|
# Debug: show what figure-related text is actually on this page
|
@@ -1243,6 +1329,39 @@ class ReactionExtractor:
|
|
1243
1329
|
self._figure_cache.put(cache_key, result)
|
1244
1330
|
return result
|
1245
1331
|
|
1332
|
+
# Fallback: If no caption found, try to find any page that mentions this figure
|
1333
|
+
LOGGER.info("No figure caption found for '%s', trying fallback search", ref)
|
1334
|
+
|
1335
|
+
for doc_idx, doc in enumerate(docs):
|
1336
|
+
doc_name = "MS" if doc_idx == 0 else "SI"
|
1337
|
+
for page_number in range(doc.page_count):
|
1338
|
+
page = doc.load_page(page_number)
|
1339
|
+
page_text = page.get_text()
|
1340
|
+
|
1341
|
+
# Look for any mention of the figure reference
|
1342
|
+
if re.search(rf'\b{re.escape(ref)}\b', page_text, re.IGNORECASE):
|
1343
|
+
LOGGER.info("Found '%s' mentioned on page %d of %s document (fallback)",
|
1344
|
+
ref, page_number + 1, doc_name)
|
1345
|
+
|
1346
|
+
# Extract the entire page as the figure might be on this page
|
1347
|
+
mat = fitz.Matrix(5.0, 5.0) # 5x zoom for better quality
|
1348
|
+
pix = page.get_pixmap(matrix=mat)
|
1349
|
+
pix = self._ensure_rgb_pixmap(pix)
|
1350
|
+
img_bytes = pix.tobytes("png")
|
1351
|
+
|
1352
|
+
# Save PNG to debug directory if available
|
1353
|
+
if self.debug_dir:
|
1354
|
+
timestamp = int(time.time())
|
1355
|
+
png_file = self.debug_dir / f"fallback_{ref.replace(' ', '_')}_{timestamp}.png"
|
1356
|
+
with open(png_file, 'wb') as f:
|
1357
|
+
f.write(img_bytes)
|
1358
|
+
LOGGER.info("Saved fallback page image to: %s", png_file)
|
1359
|
+
|
1360
|
+
result = b64encode(img_bytes).decode()
|
1361
|
+
# Cache the result
|
1362
|
+
self._figure_cache.put(cache_key, result)
|
1363
|
+
return result
|
1364
|
+
|
1246
1365
|
LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
|
1247
1366
|
return None
|
1248
1367
|
|
@@ -1258,6 +1377,11 @@ class ReactionExtractor:
|
|
1258
1377
|
page = doc.load_page(page_number)
|
1259
1378
|
page_text = page.get_text()
|
1260
1379
|
|
1380
|
+
# Skip Table of Contents pages
|
1381
|
+
if self._is_toc_page(page_text):
|
1382
|
+
LOGGER.debug("Skipping TOC page %d in _find_pages_with_reference", page_number + 1)
|
1383
|
+
continue
|
1384
|
+
|
1261
1385
|
# Check for actual figure caption first
|
1262
1386
|
if ref.lower().startswith('figure'):
|
1263
1387
|
figure_num = ref.replace('Figure ', '').replace('figure ', '')
|
@@ -0,0 +1,16 @@
|
|
1
|
+
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
+
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
+
debase/_version.py,sha256=iDuv12GVbaAFXFufv3yqwn-3Hwv9Kua4nJZQ-gUNJXw,49
|
4
|
+
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
+
debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
|
6
|
+
debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
|
7
|
+
debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
|
8
|
+
debase/reaction_info_extractor.py,sha256=9QXbtp0RSP6QMqQ_azBWDceGIqiw2JPCg3eJ0Ba_lxA,167849
|
9
|
+
debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
|
10
|
+
debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
|
11
|
+
debase-0.5.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
+
debase-0.5.1.dist-info/METADATA,sha256=GoaBFl0kdh8dtrApBTMoLWH6fe5GYLiSYC5JrohbPcI,4047
|
13
|
+
debase-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
+
debase-0.5.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
+
debase-0.5.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
+
debase-0.5.1.dist-info/RECORD,,
|
debase-0.4.5.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
-
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=aQmjMn3LxbvC1lgsl7QAKTZYk9rZlRbUZ72_LxKEuIM,49
|
4
|
-
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
-
debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
|
6
|
-
debase/enzyme_lineage_extractor.py,sha256=hPA3r9kEQ0vy4ia9t4lj5m63jJtkslAM-ySsW4WgIVs,170770
|
7
|
-
debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
|
8
|
-
debase/reaction_info_extractor.py,sha256=bnAbPtVr52H_GZg0NVdCksHZfAtYuh4WD3RCAhRgU7Y,160833
|
9
|
-
debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
|
10
|
-
debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
|
11
|
-
debase-0.4.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
-
debase-0.4.5.dist-info/METADATA,sha256=PaDILdF_IA8qJAF4WHVu0sz1V9ihL_6pJUdoMFa9nRg,4047
|
13
|
-
debase-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
-
debase-0.4.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
-
debase-0.4.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
-
debase-0.4.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|