debase 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,11 +54,11 @@ class Config:
54
54
  """Centralised tunables so tests can override them easily."""
55
55
 
56
56
  model_name: str = "gemini-2.5-flash"
57
- location_temperature: float = 0.2
57
+ location_temperature: float = 0.0
58
58
  extract_temperature: float = 0.0
59
59
  model_reaction_temperature: float = 0.0
60
60
  top_p: float = 1.0
61
- max_tokens: int = 12288 # Increased 3x from 4096
61
+ max_tokens: int = 12288
62
62
  pdf_cache_size: int = 8
63
63
  retries: int = 2
64
64
 
@@ -778,50 +778,62 @@ class ReactionExtractor:
778
778
  # ------------------------------------------------------------------
779
779
 
780
780
  def _collect_captions_and_titles(self) -> str:
781
- # Pattern to match Table or Figure with optional leading whitespace
781
+ # Pattern to match Table or Figure with optional leading whitespace and page numbers
782
782
  # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
783
- # Also handles cases where there's whitespace before the caption
784
- cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
783
+ # Also handles cases where there's whitespace or page numbers before the caption
784
+ cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
785
785
  captions: List[str] = []
786
786
 
787
- # Collect from all pages
788
- all_text = "\n".join(self.all_pages)
789
-
790
- # Find all figure/table captions with more context
791
- for match in cap_pattern.finditer(all_text):
792
- caption_start = match.start()
793
-
794
- # Include some context before the caption (up to 200 chars)
795
- context_start = max(0, caption_start - 200)
796
- # Find the start of the sentence/paragraph before the caption
797
- context_text = all_text[context_start:caption_start]
798
- last_period = context_text.rfind('.')
799
- if last_period != -1:
800
- context_start = context_start + last_period + 1
801
-
802
- # For tables, include much more content after the caption to show actual table data
803
- # For figures, include more content to ensure complete captions
804
- is_table = 'table' in match.group(1).lower()
805
- max_chars = 8000 if is_table else 5000
806
-
807
- # Get up to max_chars or until double newline (but ensure we get complete caption)
808
- # First, try to find the end of the caption sentence
809
- caption_end = caption_start
810
- period_pos = all_text.find('. ', caption_start)
811
- if period_pos != -1 and period_pos < caption_start + 1000:
812
- # Include at least to the end of the caption sentence
813
- caption_end = period_pos + 1
814
-
815
- # Then extend to include more context or until double newline
816
- double_newline_pos = all_text.find("\n\n", caption_end)
817
- if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
818
- caption_end = caption_start + max_chars
819
- else:
820
- caption_end = double_newline_pos
821
-
822
- # Include the context and full caption with table content
823
- full_caption = all_text[context_start:caption_end].strip()
824
- captions.append(full_caption)
787
+ # Process each page individually to avoid TOC entries
788
+ for page_idx, page_text in enumerate(self.all_pages):
789
+ # Skip if this looks like a TOC page
790
+ if self._is_toc_page(page_text):
791
+ LOGGER.debug("Skipping TOC page %d for caption collection", page_idx + 1)
792
+ continue
793
+
794
+ # Find all figure/table captions with more context
795
+ for match in cap_pattern.finditer(page_text):
796
+ caption_line = match.group(0).strip()
797
+
798
+ # Skip if this looks like a TOC entry (has page number at end or dots)
799
+ if re.search(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$', caption_line):
800
+ LOGGER.debug("Skipping TOC-style entry: %s", caption_line[:50])
801
+ continue
802
+
803
+ caption_start = match.start()
804
+
805
+ # For tables, include much more content after the caption to show actual table data
806
+ # For figures, include substantial content to show what the figure contains
807
+ is_table = 'table' in match.group(1).lower()
808
+ # Increase context for figures to ensure we capture descriptive text
809
+ max_chars = 8000 if is_table else 3000
810
+
811
+ # Get context including text before and after the caption
812
+ # Include some text before to help identify the location
813
+ context_before = max(0, caption_start - 200)
814
+ context_after = min(len(page_text), caption_start + max_chars)
815
+
816
+ # Extract the full context
817
+ full_context = page_text[context_before:context_after].strip()
818
+
819
+ # Find the actual caption text (not just the "Figure X" part)
820
+ # Look for text after the figure/table identifier that forms the caption
821
+ caption_text = page_text[caption_start:context_after]
822
+
823
+ # Try to find the end of the caption (usually ends with a period before next paragraph)
824
+ caption_end_match = re.search(r'^[^\n]+\.[^\n]*(?:\n\n|\n(?=[A-Z]))', caption_text)
825
+ if caption_end_match:
826
+ actual_caption = caption_text[:caption_end_match.end()].strip()
827
+ else:
828
+ # Fallback: take first few lines
829
+ lines = caption_text.split('\n')
830
+ actual_caption = '\n'.join(lines[:3]).strip()
831
+
832
+ # Ensure we have meaningful content, not just the figure number
833
+ if len(actual_caption) > 20: # More than just "Figure S23."
834
+ # For the prompt, include the full context to help identify what's in the figure
835
+ caption_with_context = f"{actual_caption}\n\n[Context around figure/table:]\n{full_context}"
836
+ captions.append(caption_with_context)
825
837
 
826
838
  # Also look for SI section titles
827
839
  si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
@@ -1058,6 +1070,39 @@ class ReactionExtractor:
1058
1070
  # 6.2 Figure / Table context helpers
1059
1071
  # ------------------------------------------------------------------
1060
1072
 
1073
def _is_toc_page(self, page_text: str) -> bool:
    """Return True when ``page_text`` looks like a Table of Contents page.

    The page is scored with three independent heuristics and is treated as
    a TOC page when the combined score reaches 2:

    * +3 -- the phrase "table of contents" appears anywhere on the page, or
      a line consisting solely of "contents" (optionally followed by a
      colon) starts within the first 200 characters.  A standalone heading
      line is required so that running text such as "Contents lists
      available at ..." or "the contents of the flask" near the top of a
      regular page does not get the page discarded.
    * +2 -- more than five "Figure N .... 12" / "Table N .... 12" style
      entries (identifier, dot leader, trailing number) are present.
    * +1 -- more than three dot leaders of five or more dots are present.

    Args:
        page_text: Plain text of a single page.  Empty or ``None`` input
            is accepted and is never classified as a TOC page.

    Returns:
        ``True`` if the accumulated score is at least 2, else ``False``.
    """
    # Nothing to score on a blank page.
    if not page_text:
        return False

    text_lower = page_text.lower()
    toc_score = 0

    # Explicit TOC title.  A bare substring test for "contents" is too
    # permissive, so only a heading-like line near the top of the page counts.
    contents_heading = re.search(
        r"^[ \t]*(contents)[ \t]*:?[ \t\r]*$", text_lower, re.MULTILINE
    )
    has_contents_heading = (
        contents_heading is not None and contents_heading.start(1) < 200
    )
    if "table of contents" in text_lower or has_contents_heading:
        toc_score += 3

    # Multiple figure/table entries followed by a dot leader and a number.
    figure_with_page = re.findall(r'figure\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
    table_with_page = re.findall(r'table\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
    if len(figure_with_page) + len(table_with_page) > 5:
        toc_score += 2

    # Many dotted leader lines.
    if len(re.findall(r'\.{5,}', page_text)) > 3:
        toc_score += 1

    return toc_score >= 2
1105
+
1061
1106
  def _page_with_reference(self, ref_id: str) -> Optional[str]:
1062
1107
  for page in self.all_pages:
1063
1108
  if ref_id.lower() in page.lower():
@@ -1131,9 +1176,14 @@ class ReactionExtractor:
1131
1176
  LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
1132
1177
  page_number + 1, doc_name, len(page_text))
1133
1178
 
1134
- # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
1135
- # For subfigures like "Figure 1C", extract the main figure "Figure 1"
1136
- figure_num = ref.replace('Figure ', '').replace('figure ', '')
1179
+ # Skip Table of Contents pages
1180
+ if self._is_toc_page(page_text):
1181
+ LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
1182
+ continue
1183
+
1184
+ # Look for figure caption pattern more flexibly
1185
+ # Normalize the reference to handle variations
1186
+ figure_num = ref.replace('Figure', '').replace('figure', '').strip()
1137
1187
 
1138
1188
  # Extract main figure number from subfigure (e.g., "1C" -> "1")
1139
1189
  main_figure_num = re.match(r'^(\d+)', figure_num)
@@ -1142,33 +1192,69 @@ class ReactionExtractor:
1142
1192
  else:
1143
1193
  main_figure_num = figure_num
1144
1194
 
1145
- caption_patterns = [
1146
- rf"^Figure\s+{re.escape(main_figure_num)}\.", # "Figure 1."
1147
- rf"^Figure\s+{re.escape(main_figure_num)}:", # "Figure 1:"
1148
- rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]", # "Figure 1 Performance"
1149
- rf"^Figure\s+{re.escape(main_figure_num)}\s*$", # "Figure 1" at end of line
1150
- rf"Figure\s+{re.escape(main_figure_num)}\s*\.", # "Figure 1." anywhere in line
1151
- rf"Figure\s+{re.escape(main_figure_num)}\s*:", # "Figure 1:" anywhere in line
1152
- ]
1195
+ # Create a flexible pattern that handles various spacing and formatting
1196
+ # This pattern looks for "Figure" (case insensitive) followed by optional spaces
1197
+ # then the figure number, then any of: period, colon, space+capital letter, or end of line
1198
+ # Also match at the beginning of a line to catch captions
1199
+ flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
1153
1200
 
1154
- LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
1155
- main_figure_num, ref, caption_patterns)
1201
+ LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
1202
+ main_figure_num, flexible_pattern)
1156
1203
 
1157
1204
  caption_found = False
1158
1205
  cap_rect = None
1159
1206
 
1160
- for pattern in caption_patterns:
1161
- matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
1162
- if matches:
1163
- LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
1164
- # Found actual figure caption, get its position
1165
- caption_text = matches.group(0)
1166
- text_instances = page.search_for(caption_text, quads=False)
1167
- if text_instances:
1168
- cap_rect = text_instances[0]
1169
- caption_found = True
1170
- LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
1171
- break
1207
+ # Search for all matches of the flexible pattern
1208
+ for match in re.finditer(flexible_pattern, page_text, re.MULTILINE):
1209
+ LOGGER.debug("Found potential figure caption: %s at position %d", match.group(0), match.start())
1210
+ # Check if this is likely an actual caption (not just a reference)
1211
+ match_start = match.start()
1212
+ match_end = match.end()
1213
+
1214
+ # Get surrounding context
1215
+ context_start = max(0, match_start - 50)
1216
+ context_end = min(len(page_text), match_end + 100)
1217
+ context = page_text[context_start:context_end]
1218
+
1219
+ # Check if this looks like a real caption (not just a reference)
1220
+ # Look for words that typically precede figure references
1221
+ preceding_text = page_text[max(0, match_start-20):match_start].lower()
1222
+ if any(word in preceding_text for word in ['see ', 'in ', 'from ', 'shown in ', 'refer to ']):
1223
+ LOGGER.debug("Skipping reference preceded by: %s", preceding_text.strip())
1224
+ continue
1225
+
1226
+ # Check if there's descriptive text after the figure number
1227
+ remaining_text = page_text[match_end:match_end+100].strip()
1228
+
1229
+ # For actual captions, there should be substantial descriptive text
1230
+ if len(remaining_text) < 20:
1231
+ LOGGER.debug("Skipping potential reference: insufficient text after (%d chars)", len(remaining_text))
1232
+ continue
1233
+
1234
+ # Check if the remaining text looks like a caption (contains descriptive words)
1235
+ # Expanded list of caption keywords to be more inclusive
1236
+ first_words = remaining_text[:50].lower()
1237
+ caption_keywords = ['detailed', 'representative', 'shows', 'comparison',
1238
+ 'illustrates', 'demonstrates', 'results', 'data',
1239
+ 'chromatogram', 'spectra', 'analysis', 'site-directed',
1240
+ 'mutagenesis', 'mutants', 'evolution', 'directed',
1241
+ 'screening', 'reaction', 'variant', 'enzyme', 'protein',
1242
+ 'activity', 'performance', 'yield', 'selectivity',
1243
+ 'characterization', 'optimization', 'development',
1244
+ 'structure', 'domain', 'crystal', 'model']
1245
+ if not any(word in first_words for word in caption_keywords):
1246
+ LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
1247
+ continue
1248
+
1249
+ # Found actual figure caption, get its position
1250
+ caption_text = match.group(0)
1251
+ text_instances = page.search_for(caption_text, quads=False)
1252
+ if text_instances:
1253
+ cap_rect = text_instances[0]
1254
+ caption_found = True
1255
+ LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
1256
+ ref, caption_text, remaining_text[:50])
1257
+ break
1172
1258
 
1173
1259
  if not caption_found:
1174
1260
  # Debug: show what figure-related text is actually on this page
@@ -1243,6 +1329,39 @@ class ReactionExtractor:
1243
1329
  self._figure_cache.put(cache_key, result)
1244
1330
  return result
1245
1331
 
1332
+ # Fallback: If no caption found, try to find any page that mentions this figure
1333
+ LOGGER.info("No figure caption found for '%s', trying fallback search", ref)
1334
+
1335
+ for doc_idx, doc in enumerate(docs):
1336
+ doc_name = "MS" if doc_idx == 0 else "SI"
1337
+ for page_number in range(doc.page_count):
1338
+ page = doc.load_page(page_number)
1339
+ page_text = page.get_text()
1340
+
1341
+ # Look for any mention of the figure reference
1342
+ if re.search(rf'\b{re.escape(ref)}\b', page_text, re.IGNORECASE):
1343
+ LOGGER.info("Found '%s' mentioned on page %d of %s document (fallback)",
1344
+ ref, page_number + 1, doc_name)
1345
+
1346
+ # Extract the entire page as the figure might be on this page
1347
+ mat = fitz.Matrix(5.0, 5.0) # 5x zoom for better quality
1348
+ pix = page.get_pixmap(matrix=mat)
1349
+ pix = self._ensure_rgb_pixmap(pix)
1350
+ img_bytes = pix.tobytes("png")
1351
+
1352
+ # Save PNG to debug directory if available
1353
+ if self.debug_dir:
1354
+ timestamp = int(time.time())
1355
+ png_file = self.debug_dir / f"fallback_{ref.replace(' ', '_')}_{timestamp}.png"
1356
+ with open(png_file, 'wb') as f:
1357
+ f.write(img_bytes)
1358
+ LOGGER.info("Saved fallback page image to: %s", png_file)
1359
+
1360
+ result = b64encode(img_bytes).decode()
1361
+ # Cache the result
1362
+ self._figure_cache.put(cache_key, result)
1363
+ return result
1364
+
1246
1365
  LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
1247
1366
  return None
1248
1367
 
@@ -1258,6 +1377,11 @@ class ReactionExtractor:
1258
1377
  page = doc.load_page(page_number)
1259
1378
  page_text = page.get_text()
1260
1379
 
1380
+ # Skip Table of Contents pages
1381
+ if self._is_toc_page(page_text):
1382
+ LOGGER.debug("Skipping TOC page %d in _find_pages_with_reference", page_number + 1)
1383
+ continue
1384
+
1261
1385
  # Check for actual figure caption first
1262
1386
  if ref.lower().startswith('figure'):
1263
1387
  figure_num = ref.replace('Figure ', '').replace('figure ', '')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.4.5
3
+ Version: 0.5.1
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,16 @@
1
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
+ debase/_version.py,sha256=iDuv12GVbaAFXFufv3yqwn-3Hwv9Kua4nJZQ-gUNJXw,49
4
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
+ debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
6
+ debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
7
+ debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
8
+ debase/reaction_info_extractor.py,sha256=9QXbtp0RSP6QMqQ_azBWDceGIqiw2JPCg3eJ0Ba_lxA,167849
9
+ debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
10
+ debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
11
+ debase-0.5.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
12
+ debase-0.5.1.dist-info/METADATA,sha256=GoaBFl0kdh8dtrApBTMoLWH6fe5GYLiSYC5JrohbPcI,4047
13
+ debase-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ debase-0.5.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
15
+ debase-0.5.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
16
+ debase-0.5.1.dist-info/RECORD,,
@@ -1,16 +0,0 @@
1
- debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
- debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
- debase/_version.py,sha256=aQmjMn3LxbvC1lgsl7QAKTZYk9rZlRbUZ72_LxKEuIM,49
4
- debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
- debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
6
- debase/enzyme_lineage_extractor.py,sha256=hPA3r9kEQ0vy4ia9t4lj5m63jJtkslAM-ySsW4WgIVs,170770
7
- debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
8
- debase/reaction_info_extractor.py,sha256=bnAbPtVr52H_GZg0NVdCksHZfAtYuh4WD3RCAhRgU7Y,160833
9
- debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
10
- debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
11
- debase-0.4.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
12
- debase-0.4.5.dist-info/METADATA,sha256=PaDILdF_IA8qJAF4WHVu0sz1V9ihL_6pJUdoMFa9nRg,4047
13
- debase-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- debase-0.4.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
15
- debase-0.4.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
16
- debase-0.4.5.dist-info/RECORD,,
File without changes