debase 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +52 -4
- {debase-0.1.4.dist-info → debase-0.1.5.dist-info}/METADATA +1 -1
- {debase-0.1.4.dist-info → debase-0.1.5.dist-info}/RECORD +8 -8
- {debase-0.1.4.dist-info → debase-0.1.5.dist-info}/WHEEL +0 -0
- {debase-0.1.4.dist-info → debase-0.1.5.dist-info}/entry_points.txt +0 -0
- {debase-0.1.4.dist-info → debase-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.4.dist-info → debase-0.1.5.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED
debase/enzyme_lineage_extractor.py
CHANGED
@@ -1024,6 +1024,38 @@ IMPORTANT: Only extract variants that belong to this specific campaign.
|
|
1024
1024
|
|
1025
1025
|
# ---- 6.3 Helper for location-based extraction -----------------------------
|
1026
1026
|
|
1027
|
+
def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
|
1028
|
+
"""Check if a found pattern is likely a table of contents entry."""
|
1029
|
+
# Find the line containing this position
|
1030
|
+
line_start = text.rfind('\n', 0, position)
|
1031
|
+
line_end = text.find('\n', position)
|
1032
|
+
|
1033
|
+
if line_start == -1:
|
1034
|
+
line_start = 0
|
1035
|
+
else:
|
1036
|
+
line_start += 1
|
1037
|
+
|
1038
|
+
if line_end == -1:
|
1039
|
+
line_end = len(text)
|
1040
|
+
|
1041
|
+
line = text[line_start:line_end]
|
1042
|
+
|
1043
|
+
# TOC indicators:
|
1044
|
+
# 1. Line contains dots (...) followed by page number
|
1045
|
+
# 2. Line ends with just a page number
|
1046
|
+
# 3. Line has "Table S12:" or similar followed by title and page
|
1047
|
+
if '...' in line or re.search(r'\.\s*\d+\s*$', line) or re.search(r':\s*[^:]+\s+\d+\s*$', line):
|
1048
|
+
return True
|
1049
|
+
|
1050
|
+
# Check if this is in a contents/TOC section
|
1051
|
+
# Look backwards up to 500 chars for "Contents" or "Table of Contents"
|
1052
|
+
context_start = max(0, position - 500)
|
1053
|
+
context = text[context_start:position].lower()
|
1054
|
+
if 'contents' in context or 'table of contents' in context:
|
1055
|
+
return True
|
1056
|
+
|
1057
|
+
return False
|
1058
|
+
|
1027
1059
|
def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 5000, validate_sequences: bool = False) -> str:
|
1028
1060
|
"""Extract text around identified locations."""
|
1029
1061
|
if not locations:
|
@@ -1082,11 +1114,25 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
|
|
1082
1114
|
pos = -1
|
1083
1115
|
used_pattern = None
|
1084
1116
|
for pattern in page_patterns:
|
1085
|
-
|
1086
|
-
|
1117
|
+
search_pos = 0
|
1118
|
+
while search_pos < len(text_lower):
|
1119
|
+
temp_pos = text_lower.find(pattern.lower(), search_pos)
|
1120
|
+
if temp_pos == -1:
|
1121
|
+
break
|
1122
|
+
|
1123
|
+
# Check if this is a TOC entry
|
1124
|
+
if _is_toc_entry(text, temp_pos, pattern):
|
1125
|
+
log.debug("Skipping TOC entry for pattern '%s' at position %d", pattern, temp_pos)
|
1126
|
+
search_pos = temp_pos + len(pattern)
|
1127
|
+
continue
|
1128
|
+
|
1129
|
+
# Found non-TOC entry
|
1087
1130
|
pos = temp_pos
|
1088
1131
|
used_pattern = pattern
|
1089
|
-
log.debug("Found pattern '%s' at position %d", pattern, pos)
|
1132
|
+
log.debug("Found pattern '%s' at position %d (not TOC)", pattern, pos)
|
1133
|
+
break
|
1134
|
+
|
1135
|
+
if pos != -1:
|
1090
1136
|
break
|
1091
1137
|
|
1092
1138
|
if pos != -1:
|
@@ -1275,7 +1321,9 @@ def get_lineage(
|
|
1275
1321
|
|
1276
1322
|
# Use text-based extraction (works for tables and text sections)
|
1277
1323
|
# Extract from full text, not caption text - use only primary location
|
1278
|
-
|
1324
|
+
# Use more context for tables since they often span multiple pages
|
1325
|
+
context_size = 15000 if location_type == 'table' else 5000
|
1326
|
+
focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
|
1279
1327
|
log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
|
1280
1328
|
len(full_text), len(focused_text),
|
1281
1329
|
primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
|
@@ -1,17 +1,17 @@
|
|
1
1
|
debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
|
2
2
|
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
3
3
|
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
4
|
-
debase/_version.py,sha256=
|
4
|
+
debase/_version.py,sha256=Bj9n2sI-8fEKj8LGa2ZU_dV7G5OnubUV9yK63_ZmeUU,49
|
5
5
|
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
6
6
|
debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
|
7
|
-
debase/enzyme_lineage_extractor.py,sha256=
|
7
|
+
debase/enzyme_lineage_extractor.py,sha256=kn3pfPWctiaWC-oaynEOike9MQ-63ApAK1cmoHbTPzU,91159
|
8
8
|
debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
|
9
9
|
debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
|
10
10
|
debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
|
11
11
|
debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
|
12
|
-
debase-0.1.
|
13
|
-
debase-0.1.
|
14
|
-
debase-0.1.
|
15
|
-
debase-0.1.
|
16
|
-
debase-0.1.
|
17
|
-
debase-0.1.
|
12
|
+
debase-0.1.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
13
|
+
debase-0.1.5.dist-info/METADATA,sha256=1vMeMX3yGLXnnvnp9lN3mTRDCsdqFklE0puYlBemfyE,10789
|
14
|
+
debase-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
debase-0.1.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
16
|
+
debase-0.1.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
17
|
+
debase-0.1.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|