debase 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.4"
3
+ __version__ = "0.1.5"
@@ -1024,6 +1024,38 @@ IMPORTANT: Only extract variants that belong to this specific campaign.
1024
1024
 
1025
1025
  # ---- 6.3 Helper for location-based extraction -----------------------------
1026
1026
 
1027
# A line ending in ". <digits>" (e.g. "Section 3. 17") looks like a TOC entry
# whose title is followed by its page number.
_TOC_TRAILING_PAGE_RE = re.compile(r'\.\s*\d+\s*$')
# A line such as "Table S12: Some title 45": label, colon, title, page number.
_TOC_LABELLED_ENTRY_RE = re.compile(r':\s*[^:]+\s+\d+\s*$')
# Number of characters before the match that are scanned for a "contents" heading.
_TOC_LOOKBACK_CHARS = 500


def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
    """Check if a found pattern is likely a table of contents entry.

    Two heuristics are applied, in order:

    1. The line containing ``position`` looks like a TOC line: it contains
       ``...`` (dot leaders), ends with ``. <digits>``, or has the shape
       ``<label>: <title> <digits>``.
    2. The word "contents" (case-insensitive, which also covers
       "Table of Contents") occurs in the 500 characters preceding
       ``position``.

    Args:
        text: Full document text that was searched.
        position: Index in ``text`` at which the match starts.
        pattern: The pattern that was matched. It is not consulted by the
            heuristics; the parameter is kept so existing callers keep working.

    Returns:
        True if either heuristic fires, otherwise False.
    """
    # Isolate the line that contains ``position``. ``rfind`` returns -1 when
    # there is no earlier newline, so adding 1 yields 0 (start of text) in
    # that case and the character after the newline otherwise.
    line_start = text.rfind('\n', 0, position) + 1
    line_end = text.find('\n', position)
    if line_end == -1:
        # Last line of the text has no terminating newline.
        line_end = len(text)
    line = text[line_start:line_end]

    # Heuristic 1: the line itself is shaped like a TOC entry.
    if (
        '...' in line
        or _TOC_TRAILING_PAGE_RE.search(line)
        or _TOC_LABELLED_ENTRY_RE.search(line)
    ):
        return True

    # Heuristic 2: a contents heading appears shortly before the match.
    # Checking for 'contents' alone is sufficient: every string containing
    # 'table of contents' also contains 'contents'.
    context_start = max(0, position - _TOC_LOOKBACK_CHARS)
    context = text[context_start:position].lower()
    return 'contents' in context
1058
+
1027
1059
  def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 5000, validate_sequences: bool = False) -> str:
1028
1060
  """Extract text around identified locations."""
1029
1061
  if not locations:
@@ -1082,11 +1114,25 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
1082
1114
  pos = -1
1083
1115
  used_pattern = None
1084
1116
  for pattern in page_patterns:
1085
- temp_pos = text_lower.find(pattern.lower())
1086
- if temp_pos != -1:
1117
+ search_pos = 0
1118
+ while search_pos < len(text_lower):
1119
+ temp_pos = text_lower.find(pattern.lower(), search_pos)
1120
+ if temp_pos == -1:
1121
+ break
1122
+
1123
+ # Check if this is a TOC entry
1124
+ if _is_toc_entry(text, temp_pos, pattern):
1125
+ log.debug("Skipping TOC entry for pattern '%s' at position %d", pattern, temp_pos)
1126
+ search_pos = temp_pos + len(pattern)
1127
+ continue
1128
+
1129
+ # Found non-TOC entry
1087
1130
  pos = temp_pos
1088
1131
  used_pattern = pattern
1089
- log.debug("Found pattern '%s' at position %d", pattern, pos)
1132
+ log.debug("Found pattern '%s' at position %d (not TOC)", pattern, pos)
1133
+ break
1134
+
1135
+ if pos != -1:
1090
1136
  break
1091
1137
 
1092
1138
  if pos != -1:
@@ -1275,7 +1321,9 @@ def get_lineage(
1275
1321
 
1276
1322
  # Use text-based extraction (works for tables and text sections)
1277
1323
  # Extract from full text, not caption text - use only primary location
1278
- focused_text = _extract_text_at_locations(full_text, [primary_location])
1324
+ # Use more context for tables since they often span multiple pages
1325
+ context_size = 15000 if location_type == 'table' else 5000
1326
+ focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
1279
1327
  log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
1280
1328
  len(full_text), len(focused_text),
1281
1329
  primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,17 +1,17 @@
1
1
  debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
2
  debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
3
  debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=mcDHWqAxAKwMNAAyHmpWVDTK-zafQ1kQjmiwnsZbUD4,49
4
+ debase/_version.py,sha256=Bj9n2sI-8fEKj8LGa2ZU_dV7G5OnubUV9yK63_ZmeUU,49
5
5
  debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
6
  debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
- debase/enzyme_lineage_extractor.py,sha256=s1kPOomvJjfMSN5odxeyXNmxiaOzXyOZICr4YUWU6j8,89288
7
+ debase/enzyme_lineage_extractor.py,sha256=kn3pfPWctiaWC-oaynEOike9MQ-63ApAK1cmoHbTPzU,91159
8
8
  debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
9
9
  debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
10
10
  debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
11
11
  debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
12
- debase-0.1.4.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.4.dist-info/METADATA,sha256=fZwXCP1i1s0VNq7Ds5bd2ys3pONgaV1XCe_edUkQdRU,10789
14
- debase-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.4.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.4.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.4.dist-info/RECORD,,
12
+ debase-0.1.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
+ debase-0.1.5.dist-info/METADATA,sha256=1vMeMX3yGLXnnvnp9lN3mTRDCsdqFklE0puYlBemfyE,10789
14
+ debase-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ debase-0.1.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
+ debase-0.1.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
+ debase-0.1.5.dist-info/RECORD,,
File without changes