debase 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +11 -6
- debase/lineage_format.py +22 -18
- debase/reaction_info_extractor.py +45 -11
- debase/substrate_scope_extractor.py +3 -2
- {debase-0.4.4.dist-info → debase-0.4.5.dist-info}/METADATA +1 -1
- debase-0.4.5.dist-info/RECORD +16 -0
- debase-0.4.4.dist-info/RECORD +0 -16
- {debase-0.4.4.dist-info → debase-0.4.5.dist-info}/WHEEL +0 -0
- {debase-0.4.4.dist-info → debase-0.4.5.dist-info}/entry_points.txt +0 -0
- {debase-0.4.4.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.4.dist-info → debase-0.4.5.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED
debase/enzyme_lineage_extractor.py
CHANGED
@@ -2943,12 +2943,15 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
|
|
2943
2943
|
validate_sequences=True
|
2944
2944
|
)
|
2945
2945
|
|
2946
|
-
if
|
2947
|
-
|
2948
|
-
|
2949
|
-
|
2950
|
-
|
2951
|
-
|
2946
|
+
# Use focused text if we got any content, regardless of size
|
2947
|
+
if focused_text:
|
2948
|
+
if len(focused_text) < len(text):
|
2949
|
+
log.info("Reduced text from %d to %d chars using validated location",
|
2950
|
+
len(text), len(focused_text))
|
2951
|
+
else:
|
2952
|
+
log.info("Extracted focused text (%d chars) from validated location (full text: %d chars)",
|
2953
|
+
len(focused_text), len(text))
|
2954
|
+
|
2952
2955
|
# Build lineage context if available
|
2953
2956
|
lineage_context = None
|
2954
2957
|
if lineage_variants:
|
@@ -2961,6 +2964,8 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
|
|
2961
2964
|
lineage_context = "\n".join(variant_info)
|
2962
2965
|
|
2963
2966
|
return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
|
2967
|
+
else:
|
2968
|
+
log.warning("Failed to extract focused text from validated location, will use full text")
|
2964
2969
|
else:
|
2965
2970
|
log.warning("Location validation failed or returned invalid location: %s",
|
2966
2971
|
validation.get("reason", "Unknown"))
|
debase/lineage_format.py
CHANGED
@@ -1183,15 +1183,33 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
1183
1183
|
if generation != "0":
|
1184
1184
|
for cid, cmap in campaign_idmap.items():
|
1185
1185
|
if cid == campaign_id:
|
1186
|
+
# First try to find generation 0
|
1186
1187
|
for enzyme_id, enzyme_row in cmap.items():
|
1187
1188
|
enzyme_gen = str(enzyme_row.get("generation", "")).strip()
|
1188
1189
|
if enzyme_gen == "0" or enzyme_gen == "0.0":
|
1189
1190
|
reference_row = enzyme_row
|
1190
1191
|
log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
|
1191
1192
|
break
|
1193
|
+
|
1194
|
+
# If no generation 0 found, find the earliest generation
|
1195
|
+
if not reference_row:
|
1196
|
+
earliest_gen = float('inf')
|
1197
|
+
earliest_enzyme = None
|
1198
|
+
for enzyme_id, enzyme_row in cmap.items():
|
1199
|
+
try:
|
1200
|
+
enzyme_gen = float(str(enzyme_row.get("generation", "")).strip())
|
1201
|
+
if enzyme_gen < earliest_gen and enzyme_gen < float(generation):
|
1202
|
+
earliest_gen = enzyme_gen
|
1203
|
+
earliest_enzyme = enzyme_id
|
1204
|
+
reference_row = enzyme_row
|
1205
|
+
except (ValueError, AttributeError):
|
1206
|
+
continue
|
1207
|
+
|
1208
|
+
if reference_row:
|
1209
|
+
log.info(f"No generation 0 found in campaign {campaign_id}, using generation {earliest_gen} enzyme {earliest_enzyme} as reference for {eid}")
|
1210
|
+
else:
|
1211
|
+
log.warning(f"No suitable reference enzyme found in campaign {campaign_id} for {eid}")
|
1192
1212
|
break
|
1193
|
-
if not reference_row:
|
1194
|
-
log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")
|
1195
1213
|
|
1196
1214
|
reference_aa = ""
|
1197
1215
|
reference_nt = ""
|
@@ -1332,24 +1350,10 @@ def run_pipeline(reaction_csv: str | Path | None = None,
|
|
1332
1350
|
if not dfs:
|
1333
1351
|
raise ValueError("At least one input CSV must be provided")
|
1334
1352
|
|
1335
|
-
# Combine dataframes
|
1353
|
+
# Combine dataframes without deduplication
|
1336
1354
|
if len(dfs) > 1:
|
1337
1355
|
df_in = pd.concat(dfs, ignore_index=True)
|
1338
|
-
log.info("Combined data: %d total entries
|
1339
|
-
|
1340
|
-
# Deduplicate based on unique combination of campaign, variant, fitness, and product
|
1341
|
-
# Define the key columns that should be unique
|
1342
|
-
unique_cols = ['campaign_id', 'enzyme_id', 'product_list']
|
1343
|
-
|
1344
|
-
# Check if we have these columns
|
1345
|
-
available_cols = [col for col in unique_cols if col in df_in.columns]
|
1346
|
-
|
1347
|
-
if len(available_cols) >= 2: # Need at least campaign_id and enzyme_id
|
1348
|
-
# Keep the first occurrence of each unique combination
|
1349
|
-
df_in = df_in.drop_duplicates(subset=available_cols, keep='first')
|
1350
|
-
log.info("After deduplication on %s: %d entries", available_cols, len(df_in))
|
1351
|
-
else:
|
1352
|
-
log.warning("Could not deduplicate - missing required columns: %s", unique_cols)
|
1356
|
+
log.info("Combined data: %d total entries", len(df_in))
|
1353
1357
|
else:
|
1354
1358
|
df_in = dfs[0]
|
1355
1359
|
|
debase/reaction_info_extractor.py
CHANGED
@@ -730,8 +730,8 @@ Return as JSON:
|
|
730
730
|
###############################################################################
|
731
731
|
|
732
732
|
class ReactionExtractor:
|
733
|
-
_FIG_RE = re.compile(r"fig(?:ure)?\s+s?\d+[a-z]?", re.I)
|
734
|
-
_TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)
|
733
|
+
_FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
|
734
|
+
_TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
|
735
735
|
|
736
736
|
def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
|
737
737
|
campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
|
@@ -800,14 +800,24 @@ class ReactionExtractor:
|
|
800
800
|
context_start = context_start + last_period + 1
|
801
801
|
|
802
802
|
# For tables, include much more content after the caption to show actual table data
|
803
|
-
# For figures,
|
804
|
-
is_table = match.group(1).lower()
|
805
|
-
max_chars =
|
806
|
-
|
807
|
-
# Get up to max_chars or until double newline
|
808
|
-
|
809
|
-
|
803
|
+
# For figures, include more content to ensure complete captions
|
804
|
+
is_table = 'table' in match.group(1).lower()
|
805
|
+
max_chars = 8000 if is_table else 5000
|
806
|
+
|
807
|
+
# Get up to max_chars or until double newline (but ensure we get complete caption)
|
808
|
+
# First, try to find the end of the caption sentence
|
809
|
+
caption_end = caption_start
|
810
|
+
period_pos = all_text.find('. ', caption_start)
|
811
|
+
if period_pos != -1 and period_pos < caption_start + 1000:
|
812
|
+
# Include at least to the end of the caption sentence
|
813
|
+
caption_end = period_pos + 1
|
814
|
+
|
815
|
+
# Then extend to include more context or until double newline
|
816
|
+
double_newline_pos = all_text.find("\n\n", caption_end)
|
817
|
+
if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
|
810
818
|
caption_end = caption_start + max_chars
|
819
|
+
else:
|
820
|
+
caption_end = double_newline_pos
|
811
821
|
|
812
822
|
# Include the context and full caption with table content
|
813
823
|
full_caption = all_text[context_start:caption_end].strip()
|
@@ -1090,6 +1100,7 @@ class ReactionExtractor:
|
|
1090
1100
|
If extract_figure_only=True, extracts just the figure above the caption.
|
1091
1101
|
If False, extracts the entire page (useful for tables).
|
1092
1102
|
Returns a base64-encoded PNG or None."""
|
1103
|
+
LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
|
1093
1104
|
|
1094
1105
|
# Check cache first
|
1095
1106
|
cache_key = f"{ref}_{extract_figure_only}"
|
@@ -1107,10 +1118,18 @@ class ReactionExtractor:
|
|
1107
1118
|
return None
|
1108
1119
|
|
1109
1120
|
# For figure extraction, search both documents for actual figure captions
|
1110
|
-
|
1121
|
+
docs = list(filter(None, [self.ms_doc, self.si_doc]))
|
1122
|
+
LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
|
1123
|
+
|
1124
|
+
for doc_idx, doc in enumerate(docs):
|
1125
|
+
doc_name = "MS" if doc_idx == 0 else "SI"
|
1126
|
+
LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
|
1127
|
+
|
1111
1128
|
for page_number in range(doc.page_count):
|
1112
1129
|
page = doc.load_page(page_number)
|
1113
1130
|
page_text = page.get_text()
|
1131
|
+
LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
|
1132
|
+
page_number + 1, doc_name, len(page_text))
|
1114
1133
|
|
1115
1134
|
# Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
|
1116
1135
|
# For subfigures like "Figure 1C", extract the main figure "Figure 1"
|
@@ -1158,6 +1177,14 @@ class ReactionExtractor:
|
|
1158
1177
|
if figure_mentions:
|
1159
1178
|
LOGGER.debug("Page %d has figure mentions but no caption match: %s",
|
1160
1179
|
page_number, figure_mentions[:3])
|
1180
|
+
|
1181
|
+
# For supplementary figures, also check for "supplementary" mentions
|
1182
|
+
if 'supplementary' in ref.lower():
|
1183
|
+
supp_mentions = [line.strip() for line in page_text.split('\n')
|
1184
|
+
if 'supplementary' in line.lower() and 'figure' in line.lower()]
|
1185
|
+
if supp_mentions:
|
1186
|
+
LOGGER.warning("Found supplementary figure mentions on page %d but no caption match. First 3: %s",
|
1187
|
+
page_number + 1, supp_mentions[:3])
|
1161
1188
|
continue
|
1162
1189
|
|
1163
1190
|
if extract_figure_only:
|
@@ -1215,6 +1242,8 @@ class ReactionExtractor:
|
|
1215
1242
|
# Cache the result
|
1216
1243
|
self._figure_cache.put(cache_key, result)
|
1217
1244
|
return result
|
1245
|
+
|
1246
|
+
LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
|
1218
1247
|
return None
|
1219
1248
|
|
1220
1249
|
def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
|
@@ -1445,6 +1474,7 @@ class ReactionExtractor:
|
|
1445
1474
|
|
1446
1475
|
def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
|
1447
1476
|
"""Extract performance metrics for multiple enzymes from the identified location in batch."""
|
1477
|
+
LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
|
1448
1478
|
ref_lc = ref.lower()
|
1449
1479
|
image_b64: Optional[str] = None
|
1450
1480
|
|
@@ -1466,11 +1496,15 @@ class ReactionExtractor:
|
|
1466
1496
|
snippet = self._extract_table_context(ref)
|
1467
1497
|
elif self._FIG_RE.search(ref_lc):
|
1468
1498
|
# For figures, extract just the figure image (same logic as compound mapping)
|
1499
|
+
LOGGER.debug("Attempting to extract figure image for '%s'", ref)
|
1469
1500
|
image_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
1470
1501
|
if not image_b64:
|
1471
|
-
LOGGER.
|
1502
|
+
LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
|
1472
1503
|
snippet = self._extract_figure_caption(ref)
|
1504
|
+
LOGGER.debug("Caption extraction result: %s",
|
1505
|
+
f"'{snippet[:100]}...'" if snippet else "empty")
|
1473
1506
|
else:
|
1507
|
+
LOGGER.info("Successfully extracted figure image for '%s'", ref)
|
1474
1508
|
# If figure is found, ignore text information - use image only
|
1475
1509
|
snippet = ""
|
1476
1510
|
else:
|
debase/substrate_scope_extractor.py
CHANGED
@@ -2543,7 +2543,8 @@ def merge_with_lineage(
|
|
2543
2543
|
data = lineage_map[matched_name]
|
2544
2544
|
entry.parent_id = data['parent_id']
|
2545
2545
|
entry.mutations = data['mutations']
|
2546
|
-
|
2546
|
+
# Skip generation - to be filled by lineage_format
|
2547
|
+
# entry.generation = data['generation']
|
2547
2548
|
entry.aa_seq = data['aa_seq']
|
2548
2549
|
entry.dna_seq = data['dna_seq']
|
2549
2550
|
entry.confidence = data['confidence']
|
@@ -2571,7 +2572,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
|
|
2571
2572
|
'enzyme_id': entry.enzyme_id,
|
2572
2573
|
'parent_enzyme_id': entry.parent_id or '',
|
2573
2574
|
'mutations': entry.mutations or '',
|
2574
|
-
'generation':
|
2575
|
+
'generation': '', # Empty generation - to be filled by lineage_format
|
2575
2576
|
'campaign_id': entry.campaign_id or '',
|
2576
2577
|
'protein_sequence': entry.aa_seq or '',
|
2577
2578
|
'nucleotide_sequence': entry.dna_seq or '',
|
debase-0.4.5.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
+
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
+
debase/_version.py,sha256=aQmjMn3LxbvC1lgsl7QAKTZYk9rZlRbUZ72_LxKEuIM,49
|
4
|
+
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
+
debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
|
6
|
+
debase/enzyme_lineage_extractor.py,sha256=hPA3r9kEQ0vy4ia9t4lj5m63jJtkslAM-ySsW4WgIVs,170770
|
7
|
+
debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
|
8
|
+
debase/reaction_info_extractor.py,sha256=bnAbPtVr52H_GZg0NVdCksHZfAtYuh4WD3RCAhRgU7Y,160833
|
9
|
+
debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
|
10
|
+
debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
|
11
|
+
debase-0.4.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
+
debase-0.4.5.dist-info/METADATA,sha256=PaDILdF_IA8qJAF4WHVu0sz1V9ihL_6pJUdoMFa9nRg,4047
|
13
|
+
debase-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
+
debase-0.4.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
+
debase-0.4.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
+
debase-0.4.5.dist-info/RECORD,,
|
debase-0.4.4.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
-
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=Vtl1u7rFItRnkcTvBiUypIltuuzta9Uy3PxMO2NgNgc,49
|
4
|
-
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
-
debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
|
6
|
-
debase/enzyme_lineage_extractor.py,sha256=jWyDRfOY792zjY5SZCvhNfQxVcEOC1JjTGb9Wo2qZ4I,170543
|
7
|
-
debase/lineage_format.py,sha256=ch5kyoUqD_4Hj7K0hJrRbKrN_FysqFrFXgbyDIgp2oA,57515
|
8
|
-
debase/reaction_info_extractor.py,sha256=Gv1qgzInNWxdaEJdsWGlgyy5syL2qClVoKHFQpR_6q0,158498
|
9
|
-
debase/substrate_scope_extractor.py,sha256=7JyTE3CiIQVDDetwfENCoiq5bLnHElsY3Db1ThVLEBE,115884
|
10
|
-
debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
|
11
|
-
debase-0.4.4.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
-
debase-0.4.4.dist-info/METADATA,sha256=Gwx754a5Zr_0yp-HXQuRRLylgEp0hD15MhhMjSOVMHo,4047
|
13
|
-
debase-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
-
debase-0.4.4.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
-
debase-0.4.4.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
-
debase-0.4.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|