debase 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/caption_pattern.py +7 -2
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +423 -86
- debase/lineage_format.py +44 -1
- debase/reaction_info_extractor.py +73 -61
- debase/substrate_scope_extractor.py +84 -32
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/METADATA +1 -1
- debase-0.6.2.dist-info/RECORD +18 -0
- debase-0.6.1.dist-info/RECORD +0 -18
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/WHEEL +0 -0
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/entry_points.txt +0 -0
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -30,6 +30,7 @@ from __future__ import annotations
|
|
30
30
|
|
31
31
|
import argparse
|
32
32
|
import csv
|
33
|
+
import difflib
|
33
34
|
import json
|
34
35
|
import logging
|
35
36
|
import os
|
@@ -726,7 +727,49 @@ Only include matches you are confident about. If no match exists, omit that enzy
|
|
726
727
|
gemini_matched_count += 1
|
727
728
|
log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
|
728
729
|
else:
|
729
|
-
|
730
|
+
# Try fuzzy matching when exact match fails
|
731
|
+
best_match = None
|
732
|
+
best_score = 0
|
733
|
+
|
734
|
+
# Try all possible keys in seq_lookup
|
735
|
+
for key in seq_lookup.keys():
|
736
|
+
if campaign_id in key: # Only consider keys from same campaign
|
737
|
+
# Extract enzyme_id part from composite key
|
738
|
+
try:
|
739
|
+
_, key_enzyme_id = key.split('_', 1)
|
740
|
+
except ValueError:
|
741
|
+
continue
|
742
|
+
|
743
|
+
# Calculate similarity score
|
744
|
+
score = difflib.SequenceMatcher(None, matched_id.lower(), key_enzyme_id.lower()).ratio()
|
745
|
+
|
746
|
+
# Always track the highest score
|
747
|
+
if score > best_score:
|
748
|
+
best_score = score
|
749
|
+
best_match = key
|
750
|
+
|
751
|
+
# Accept the best match only if its similarity score exceeds 0.5; the score is logged in either case
|
752
|
+
if best_match and best_score > 0.5: # Lower threshold but log the score
|
753
|
+
idx = entry["idx"]
|
754
|
+
df.at[idx, "protein_sequence"] = seq_lookup[best_match]["aa_sequence"]
|
755
|
+
df.at[idx, "aa_sequence"] = seq_lookup[best_match]["aa_sequence"]
|
756
|
+
if seq_lookup[best_match]["nt_sequence"]:
|
757
|
+
df.at[idx, "nucleotide_sequence"] = seq_lookup[best_match]["nt_sequence"]
|
758
|
+
df.at[idx, "nt_sequence"] = seq_lookup[best_match]["nt_sequence"]
|
759
|
+
|
760
|
+
# Also copy generation and parent_enzyme_id
|
761
|
+
df.at[idx, "generation"] = seq_lookup[best_match]["generation"]
|
762
|
+
df.at[idx, "parent_enzyme_id"] = seq_lookup[best_match]["parent_enzyme_id"]
|
763
|
+
|
764
|
+
# Store the match for later mutation copying
|
765
|
+
_, matched_enzyme = best_match.split('_', 1)
|
766
|
+
df.at[idx, "_matched_enzyme_id"] = matched_enzyme
|
767
|
+
df.at[idx, "_matched_campaign_id"] = campaign_id
|
768
|
+
|
769
|
+
gemini_matched_count += 1
|
770
|
+
log.info(f"Fuzzy matched '{substrate_id}' -> '{matched_enzyme}' (score: {best_score:.2f}) in campaign {campaign_id}")
|
771
|
+
else:
|
772
|
+
log.warning(f"No fuzzy match found for Gemini suggested '{matched_id}' in campaign {campaign_id} (best score: {best_score:.2f})")
|
730
773
|
|
731
774
|
except Exception as e:
|
732
775
|
log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
|
@@ -442,7 +442,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
|
|
442
442
|
You are an expert reader of protein engineering manuscripts.
|
443
443
|
Given the following article captions and section titles, identify most promising locations
|
444
444
|
(tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
|
445
|
-
activity, etc.) for enzyme variants.
|
445
|
+
activity, etc.) for enzyme variants.
|
446
|
+
|
447
|
+
CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
|
448
|
+
- Look for locations showing data for ALL enzyme variants in the evolution lineage
|
449
|
+
- Prioritize sources that show the complete evolutionary progression (parent → child variants)
|
450
|
+
- Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
|
451
|
+
- Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
|
446
452
|
|
447
453
|
IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
|
448
454
|
performance data locations. Pay careful attention to:
|
@@ -450,8 +456,13 @@ performance data locations. Pay careful attention to:
|
|
450
456
|
- Enzyme name prefixes that indicate different campaigns
|
451
457
|
- Different substrate/product types mentioned in captions
|
452
458
|
|
459
|
+
IMPORTANT FIGURE REFERENCE RULES:
|
460
|
+
- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
|
461
|
+
- The extraction system will handle retrieving the entire figure including all sub-panels
|
462
|
+
- For tables, return the complete reference as it appears
|
463
|
+
|
453
464
|
Respond with a JSON array where each element contains:
|
454
|
-
- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
|
465
|
+
- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
|
455
466
|
- "type": one of "table", "figure"
|
456
467
|
- "confidence": your confidence score (0-100)
|
457
468
|
- "caption": the exact caption text for this location
|
@@ -459,7 +470,12 @@ Respond with a JSON array where each element contains:
|
|
459
470
|
- "lineage_hint": any indication of which enzyme group this data is for (or null)
|
460
471
|
- "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
|
461
472
|
|
462
|
-
|
473
|
+
PRIORITIZATION RULES:
|
474
|
+
- HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
|
475
|
+
- MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
|
476
|
+
- LOWEST PRIORITY: Sources showing data for individual variants only
|
477
|
+
|
478
|
+
Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
|
463
479
|
|
464
480
|
IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
|
465
481
|
|
@@ -503,6 +519,13 @@ IMPORTANT:
|
|
503
519
|
- If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
|
504
520
|
- If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
|
505
521
|
|
522
|
+
CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
|
523
|
+
- Yield (%) measures how much product was formed (0-100%)
|
524
|
+
- Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
|
525
|
+
- TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
|
526
|
+
- These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
|
527
|
+
- Be extremely careful when extracting from tables/figures with multiple columns or data series
|
528
|
+
|
506
529
|
Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
|
507
530
|
""")
|
508
531
|
|
@@ -530,6 +553,17 @@ STEP 4: Extract values for each matched variant
|
|
530
553
|
- CRITICAL: Read actual scale values from the axis labels and tick marks
|
531
554
|
- Verify: taller bars should have higher values, higher dots should have higher values
|
532
555
|
|
556
|
+
CRITICAL DATA ACCURACY REQUIREMENTS:
|
557
|
+
- DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
|
558
|
+
- Yield is typically shown as percentage (0-100%)
|
559
|
+
- Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
|
560
|
+
- TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
|
561
|
+
- Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
|
562
|
+
- Carefully match each bar/dot to its corresponding enzyme label on the X-axis
|
563
|
+
- If looking at grouped bars, ensure you're reading the correct bar for each metric
|
564
|
+
- Double-check that variant A's yield is not confused with variant B's yield
|
565
|
+
- If values are unclear or ambiguous, return null rather than guessing
|
566
|
+
|
533
567
|
Target enzymes to find and extract:
|
534
568
|
{enzyme_names}
|
535
569
|
|
@@ -887,15 +921,22 @@ class ReactionExtractor:
|
|
887
921
|
campaign_context = f"""
|
888
922
|
IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
|
889
923
|
|
890
|
-
|
924
|
+
CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
|
925
|
+
- Campaign ID: {self.campaign_info.get('campaign_id', '')}
|
891
926
|
- Name: {self.campaign_info.get('campaign_name', '')}
|
892
927
|
- Description: {self.campaign_info.get('description', '')}
|
893
928
|
- Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
|
894
929
|
- Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
|
930
|
+
- Notes: {self.campaign_info.get('notes', '')}
|
895
931
|
|
896
932
|
KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
|
897
933
|
These locations are known to contain relevant data - prioritize them highly.
|
898
934
|
|
935
|
+
CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
|
936
|
+
- Look for data showing the entire evolutionary progression of enzyme variants
|
937
|
+
- Prioritize locations that show performance data for ALL variants in the lineage
|
938
|
+
- The campaign description and notes above provide context about the evolution strategy used
|
939
|
+
|
899
940
|
{f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
|
900
941
|
|
901
942
|
CRITICAL: Only return locations that contain data for this specific campaign.
|
@@ -2558,41 +2599,17 @@ Do NOT include compound information from other campaigns.
|
|
2558
2599
|
if not mapping or not mapping.iupac_name:
|
2559
2600
|
missing_compounds.append(cid)
|
2560
2601
|
|
2561
|
-
# Tier 2 (skip directly to full search): Full manuscript + SI search
|
2602
|
+
# Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
|
2562
2603
|
if missing_compounds:
|
2563
|
-
LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
|
2604
|
+
LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
|
2564
2605
|
len(missing_compounds), sorted(missing_compounds))
|
2565
2606
|
|
2566
|
-
# Get all available figures for compound structure analysis
|
2567
|
-
figure_images = {}
|
2568
|
-
|
2569
|
-
# Extract main manuscript figures
|
2570
|
-
figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
|
2571
|
-
for ref in figure_refs:
|
2572
|
-
img_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
2573
|
-
if img_b64:
|
2574
|
-
figure_images[ref] = img_b64
|
2575
|
-
LOGGER.info("Retrieved %s for compound mapping", ref)
|
2576
|
-
|
2577
|
-
# Get SI figures
|
2578
|
-
si_figure_refs = []
|
2579
|
-
for page in self.si_pages[:10]: # Check first 10 SI pages
|
2580
|
-
matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
|
2581
|
-
si_figure_refs.extend(matches[:10]) # Limit to 10 figures
|
2582
|
-
|
2583
|
-
# Extract SI figures
|
2584
|
-
for ref in set(si_figure_refs):
|
2585
|
-
if ref not in figure_images:
|
2586
|
-
img_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
2587
|
-
if img_b64:
|
2588
|
-
figure_images[ref] = img_b64
|
2589
|
-
LOGGER.info("Extracted %s for compound mapping", ref)
|
2590
|
-
|
2591
2607
|
# Full text search including ALL pages (manuscript + SI)
|
2592
2608
|
full_text = "\n\n".join(self.all_pages) # Send everything
|
2593
2609
|
|
2594
|
-
|
2595
|
-
|
2610
|
+
# Use text-only extraction for Tier 2 (no images)
|
2611
|
+
final_mappings = self._extract_compound_mappings_from_text(
|
2612
|
+
full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
|
2596
2613
|
)
|
2597
2614
|
|
2598
2615
|
# Merge final mappings with better compound ID matching
|
@@ -2933,34 +2950,6 @@ Different campaigns may use different model reactions and substrates.
|
|
2933
2950
|
LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
|
2934
2951
|
list(compound_mappings.keys()))
|
2935
2952
|
|
2936
|
-
# First, populate IUPAC lists directly from compound mappings based on compound_type
|
2937
|
-
substrate_iupacs_from_mappings = []
|
2938
|
-
product_iupacs_from_mappings = []
|
2939
|
-
|
2940
|
-
for mapping in compound_mappings.values():
|
2941
|
-
if mapping.iupac_name and mapping.compound_type:
|
2942
|
-
if mapping.compound_type.lower() == "substrate":
|
2943
|
-
substrate_iupacs_from_mappings.append(mapping.iupac_name)
|
2944
|
-
LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
|
2945
|
-
elif mapping.compound_type.lower() == "product":
|
2946
|
-
product_iupacs_from_mappings.append(mapping.iupac_name)
|
2947
|
-
LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
|
2948
|
-
|
2949
|
-
# Initialize or update the IUPAC lists with mapped compounds
|
2950
|
-
if substrate_iupacs_from_mappings:
|
2951
|
-
existing_substrates = data.get("substrate_iupac_list", []) or []
|
2952
|
-
if isinstance(existing_substrates, list):
|
2953
|
-
data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
|
2954
|
-
else:
|
2955
|
-
data["substrate_iupac_list"] = substrate_iupacs_from_mappings
|
2956
|
-
|
2957
|
-
if product_iupacs_from_mappings:
|
2958
|
-
existing_products = data.get("product_iupac_list", []) or []
|
2959
|
-
if isinstance(existing_products, list):
|
2960
|
-
data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
|
2961
|
-
else:
|
2962
|
-
data["product_iupac_list"] = product_iupacs_from_mappings
|
2963
|
-
|
2964
2953
|
# Try to map substrate/product lists through compound IDs
|
2965
2954
|
substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
|
2966
2955
|
if isinstance(substrate_list, list):
|
@@ -3571,6 +3560,11 @@ def main() -> None:
|
|
3571
3560
|
LOGGER.info("Loading enzyme data from CSV…")
|
3572
3561
|
enzyme_df = pd.read_csv(args.lineage_csv)
|
3573
3562
|
|
3563
|
+
# Rename enzyme_id to enzyme if needed
|
3564
|
+
if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
|
3565
|
+
enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
|
3566
|
+
LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
|
3567
|
+
|
3574
3568
|
# Detect campaign information from the enzyme CSV
|
3575
3569
|
if 'campaign_id' in enzyme_df.columns:
|
3576
3570
|
all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
|
@@ -3601,6 +3595,11 @@ def main() -> None:
|
|
3601
3595
|
campaign_info=campaign_info)
|
3602
3596
|
df_metrics = extractor.run(enzyme_df)
|
3603
3597
|
|
3598
|
+
# For single campaign, also merge with lineage data
|
3599
|
+
if not df_metrics.empty:
|
3600
|
+
df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
|
3601
|
+
LOGGER.info("Merged metrics with lineage data for single campaign")
|
3602
|
+
|
3604
3603
|
elif len(all_campaigns) > 1:
|
3605
3604
|
LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
|
3606
3605
|
all_results = []
|
@@ -3651,6 +3650,10 @@ def main() -> None:
|
|
3651
3650
|
# Merge campaign metrics with lineage data
|
3652
3651
|
campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
|
3653
3652
|
|
3653
|
+
# Rename aa_seq to protein_sequence for consistency
|
3654
|
+
if 'aa_seq' in campaign_final.columns:
|
3655
|
+
campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
|
3656
|
+
|
3654
3657
|
# Save campaign-specific file immediately
|
3655
3658
|
output_dir = args.output.parent
|
3656
3659
|
base_name = args.output.stem
|
@@ -3667,6 +3670,10 @@ def main() -> None:
|
|
3667
3670
|
# Still save an empty campaign file with lineage data
|
3668
3671
|
campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
|
3669
3672
|
if not campaign_lineage.empty:
|
3673
|
+
# Rename aa_seq to protein_sequence for consistency
|
3674
|
+
if 'aa_seq' in campaign_lineage.columns:
|
3675
|
+
campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
|
3676
|
+
|
3670
3677
|
output_dir = args.output.parent
|
3671
3678
|
base_name = args.output.stem
|
3672
3679
|
campaign_file = output_dir / f"{base_name}_{campaign}.csv"
|
@@ -3697,6 +3704,11 @@ def main() -> None:
|
|
3697
3704
|
df_final = df_metrics
|
3698
3705
|
LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
|
3699
3706
|
|
3707
|
+
# Rename aa_seq to protein_sequence for consistency
|
3708
|
+
if df_final is not None and 'aa_seq' in df_final.columns:
|
3709
|
+
df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
|
3710
|
+
LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
|
3711
|
+
|
3700
3712
|
df_final.to_csv(args.output, index=False)
|
3701
3713
|
LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
|
3702
3714
|
|
@@ -296,12 +296,14 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
|
|
296
296
|
|
297
297
|
return "\n".join(chunks)
|
298
298
|
|
299
|
-
def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:
|
299
|
+
def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: Optional[str] = None, document_hint: Optional[str] = None) -> Optional[str]:
|
300
300
|
"""Extract figure as a page region when embedded images aren't available.
|
301
301
|
|
302
302
|
Args:
|
303
303
|
pdf_paths: List of PDF paths to search
|
304
304
|
figure_ref: Figure reference to search for (e.g., "Figure 3" or "Figure 3(a)")
|
305
|
+
caption_hint: Optional caption text to help identify the exact figure
|
306
|
+
document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
|
305
307
|
|
306
308
|
Returns:
|
307
309
|
Base64-encoded PNG string or None if not found
|
@@ -318,8 +320,20 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
|
|
318
320
|
log.info("Extracting entire figure '%s' from reference '%s'", base_figure_ref, figure_ref)
|
319
321
|
else:
|
320
322
|
base_figure_ref = figure_ref
|
321
|
-
|
322
|
-
|
323
|
+
|
324
|
+
# Determine search order based on document hint
|
325
|
+
search_paths = list(pdf_paths) # Create a copy
|
326
|
+
if document_hint and len(pdf_paths) > 1:
|
327
|
+
if document_hint.lower() == "manuscript":
|
328
|
+
# Prioritize manuscript (first PDF)
|
329
|
+
search_paths = [pdf_paths[0]] + pdf_paths[1:]
|
330
|
+
log.info("Prioritizing manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
|
331
|
+
elif document_hint.lower() == "supplementary":
|
332
|
+
# Prioritize SI (second PDF if available)
|
333
|
+
search_paths = [pdf_paths[1], pdf_paths[0]] if len(pdf_paths) > 1 else pdf_paths
|
334
|
+
log.info("Prioritizing supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
|
335
|
+
|
336
|
+
for pdf_path in search_paths:
|
323
337
|
doc = _open_doc(pdf_path)
|
324
338
|
try:
|
325
339
|
for page_num in range(doc.page_count):
|
@@ -333,26 +347,38 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
|
|
333
347
|
# Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
|
334
348
|
figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
|
335
349
|
|
336
|
-
#
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
|
343
|
-
]
|
344
|
-
|
345
|
-
for pattern in caption_patterns:
|
346
|
-
matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
|
347
|
-
if matches:
|
348
|
-
# Found actual figure caption, get its position
|
349
|
-
caption_text = matches.group(0)
|
350
|
-
caption_instances = page.search_for(caption_text, quads=False)
|
350
|
+
# First try to find using caption hint if provided
|
351
|
+
if caption_hint and len(caption_hint) > 10:
|
352
|
+
# Try to find the exact caption text
|
353
|
+
caption_snippet = caption_hint[:100] # Use first 100 chars
|
354
|
+
if caption_snippet in page_text:
|
355
|
+
caption_instances = page.search_for(caption_snippet, quads=False)
|
351
356
|
if caption_instances:
|
352
357
|
caption_rect = caption_instances[0]
|
353
358
|
found = True
|
354
|
-
log.info("Found
|
355
|
-
|
359
|
+
log.info("Found figure using caption hint on page %d", page_num + 1)
|
360
|
+
|
361
|
+
# If not found with hint, look for actual figure captions using regex patterns
|
362
|
+
if not found:
|
363
|
+
caption_patterns = [
|
364
|
+
rf"^Figure\s+{re.escape(figure_num)}\.", # "Figure 3." at start of line
|
365
|
+
rf"^Figure\s+{re.escape(figure_num)}:", # "Figure 3:" at start of line
|
366
|
+
rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
|
367
|
+
rf"Figure\s+{re.escape(figure_num)}\s*\.", # "Figure 3." anywhere
|
368
|
+
rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
|
369
|
+
]
|
370
|
+
|
371
|
+
for pattern in caption_patterns:
|
372
|
+
matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
|
373
|
+
if matches:
|
374
|
+
# Found actual figure caption, get its position
|
375
|
+
caption_text = matches.group(0)
|
376
|
+
caption_instances = page.search_for(caption_text, quads=False)
|
377
|
+
if caption_instances:
|
378
|
+
caption_rect = caption_instances[0]
|
379
|
+
found = True
|
380
|
+
log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
|
381
|
+
break
|
356
382
|
|
357
383
|
if not found:
|
358
384
|
continue
|
@@ -1135,17 +1161,24 @@ Your task is to:
|
|
1135
1161
|
4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
|
1136
1162
|
5. Determine which enzyme variants from this campaign were tested in substrate scope studies
|
1137
1163
|
|
1164
|
+
IMPORTANT FIGURE REFERENCE RULES:
|
1165
|
+
- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
|
1166
|
+
- Include the figure caption if available to help with identification
|
1167
|
+
- The extraction system will handle retrieving the entire figure including all sub-panels
|
1168
|
+
|
1138
1169
|
Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
|
1139
1170
|
[
|
1140
1171
|
{{
|
1141
|
-
"location": "
|
1172
|
+
"location": "Main figure/table reference (e.g., 'Figure 2', 'Table S1', NOT 'Figure 2a')",
|
1142
1173
|
"type": "table|figure|text",
|
1143
1174
|
"confidence": 0.0-1.0,
|
1144
1175
|
"enzyme_variants": ["list of enzyme IDs found"],
|
1145
1176
|
"substrates_tested": ["list of substrates if identifiable"],
|
1146
1177
|
"campaign_match": true/false,
|
1147
1178
|
"is_substrate_scope": true/false,
|
1148
|
-
"model_reaction_excluded": "reason why this is not a model reaction"
|
1179
|
+
"model_reaction_excluded": "reason why this is not a model reaction",
|
1180
|
+
"caption": "Include the figure/table caption if available",
|
1181
|
+
"document": "manuscript|supplementary - specify whether this location is in the main manuscript or supplementary information"
|
1149
1182
|
}}
|
1150
1183
|
]
|
1151
1184
|
|
@@ -1865,22 +1898,28 @@ def extract_substrate_scope_entries_for_campaign(
|
|
1865
1898
|
all_refs = []
|
1866
1899
|
|
1867
1900
|
if locations:
|
1868
|
-
#
|
1869
|
-
|
1870
|
-
|
1871
|
-
loc_str = loc.get('location', '')
|
1872
|
-
location_strs.append(loc_str)
|
1873
|
-
all_refs.append(loc_str)
|
1901
|
+
# Sort locations by confidence and use only the PRIMARY (most confident) location
|
1902
|
+
sorted_locations = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
|
1903
|
+
primary_location = sorted_locations[0] if sorted_locations else None
|
1874
1904
|
|
1875
|
-
|
1905
|
+
if primary_location:
|
1906
|
+
primary_ref = primary_location.get('location', '')
|
1907
|
+
all_refs = [primary_ref] # Only extract from primary location
|
1908
|
+
|
1909
|
+
extraction_hints = f"\nPRIMARY substrate scope location for campaign {campaign_id}: {primary_ref}"
|
1910
|
+
extraction_hints += f"\nLocation confidence: {primary_location.get('confidence', 0)}%"
|
1911
|
+
extraction_hints += f"\nLocation type: {primary_location.get('type', 'unknown')}"
|
1876
1912
|
|
1877
1913
|
# Focus on campaign-specific enzyme variants
|
1878
1914
|
extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"
|
1879
1915
|
|
1880
|
-
# Extract text from
|
1916
|
+
# Extract text from ONLY the primary location
|
1881
1917
|
extraction_texts = []
|
1882
1918
|
figure_images = {}
|
1883
1919
|
|
1920
|
+
# Create a mapping of location strings to their full location data
|
1921
|
+
location_map = {loc.get('location', ''): loc for loc in locations}
|
1922
|
+
|
1884
1923
|
for ref in all_refs:
|
1885
1924
|
if ref and pdf_paths:
|
1886
1925
|
ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
|
@@ -1889,7 +1928,10 @@ def extract_substrate_scope_entries_for_campaign(
|
|
1889
1928
|
|
1890
1929
|
# Extract figure images for this reference (crop page around figure)
|
1891
1930
|
try:
|
1892
|
-
|
1931
|
+
# Get caption and document hints if available
|
1932
|
+
caption_hint = location_map.get(ref, {}).get('caption', '')
|
1933
|
+
document_hint = location_map.get(ref, {}).get('document', '')
|
1934
|
+
fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
|
1893
1935
|
if fig_base64:
|
1894
1936
|
figure_images[ref] = fig_base64
|
1895
1937
|
log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
|
@@ -1942,6 +1984,14 @@ IMPORTANT INSTRUCTIONS:
|
|
1942
1984
|
4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
|
1943
1985
|
5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
|
1944
1986
|
|
1987
|
+
CRITICAL DATA ACCURACY REQUIREMENTS:
|
1988
|
+
- BE EXTREMELY CAREFUL about which substrate ID maps to which yield, TTN, and selectivity values
|
1989
|
+
- Each substrate entry should have its OWN yield, ee, and TTN values - do not mix up values between substrates
|
1990
|
+
- If looking at a table or figure, carefully match each substrate with its corresponding row/bar/data point
|
1991
|
+
- Double-check that substrate 1a's data is not confused with substrate 1b's data, etc.
|
1992
|
+
- If values are unclear or ambiguous for a specific substrate, return null rather than guessing
|
1993
|
+
- Pay special attention when extracting from figures - ensure you're reading the correct bar/point for each substrate
|
1994
|
+
|
1945
1995
|
{extraction_hints}
|
1946
1996
|
|
1947
1997
|
Return your analysis as JSON in this format:
|
@@ -2287,13 +2337,15 @@ def get_substrate_scope(
|
|
2287
2337
|
if should_extract:
|
2288
2338
|
figure_ref = location_str
|
2289
2339
|
confidence = loc.get('confidence', 0)
|
2340
|
+
caption_hint = loc.get('caption', '')
|
2290
2341
|
log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, location_type)
|
2291
2342
|
|
2292
2343
|
# Use appropriate extraction function based on type
|
2293
2344
|
if 'scheme' in location_str.lower() or location_type == 'scheme':
|
2294
2345
|
figure_image = extract_scheme_image(pdf_paths, figure_ref)
|
2295
2346
|
else:
|
2296
|
-
|
2347
|
+
document_hint = loc.get('document', '')
|
2348
|
+
figure_image = extract_figure_image(pdf_paths, figure_ref, caption_hint=caption_hint, document_hint=document_hint)
|
2297
2349
|
|
2298
2350
|
if figure_image:
|
2299
2351
|
log.info("Successfully extracted %s image for %s (%d bytes)",
|
@@ -0,0 +1,18 @@
|
|
1
|
+
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
+
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
+
debase/_version.py,sha256=t771GcmZTaJJGrIex6Ea6Q5pcMqVPIihCdRFRA1dMAM,49
|
4
|
+
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
+
debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
|
6
|
+
debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
|
7
|
+
debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
|
8
|
+
debase/enzyme_lineage_extractor.py,sha256=OXO2jUqAqF0pXrw17oIQERnek1uZ5gsFIuKRz4NMS1o,188556
|
9
|
+
debase/lineage_format.py,sha256=YWAP9OhFN3MQWbqk5gguX0C2cCwGvKJAtMq9pG5TJp8,59515
|
10
|
+
debase/reaction_info_extractor.py,sha256=kQBxPpzurjHXsHFWE_WM84ArSnc3E8f6xPMJpyTIGnU,188246
|
11
|
+
debase/substrate_scope_extractor.py,sha256=hRlt8iWOURmgW4SJHB1Svoh3TTa4fa9YIE8qVUZPnY0,122621
|
12
|
+
debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
|
13
|
+
debase-0.6.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
14
|
+
debase-0.6.2.dist-info/METADATA,sha256=gnPvTWvazrsdGrIKX8tA4Wwt8yKYph87POVKF25rkkg,4047
|
15
|
+
debase-0.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
16
|
+
debase-0.6.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
17
|
+
debase-0.6.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
18
|
+
debase-0.6.2.dist-info/RECORD,,
|
debase-0.6.1.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
-
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=Cbfy3WdPDTjtgnzdUc6e5F779YhAJJGX5LN-2SJMvCI,49
|
4
|
-
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
-
debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
|
6
|
-
debase/caption_pattern.py,sha256=nMLj2tK4MhD4jQ9d1IUDJ6xnY0MOx-UioIT-k_b3OWA,1770
|
7
|
-
debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
|
8
|
-
debase/enzyme_lineage_extractor.py,sha256=RKsjvcs6O2wnw2dpts3AynDRVKqMAeBVOMql2mayCGY,170120
|
9
|
-
debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
|
10
|
-
debase/reaction_info_extractor.py,sha256=qUrVi9chQcQG1zWwQlTbYF8dczvQqctdjwhvkAkBnZw,187032
|
11
|
-
debase/substrate_scope_extractor.py,sha256=dikdEELi4RGlP2lGHcR93WdUbtIchOdHVB5G45BMCNk,118709
|
12
|
-
debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
|
13
|
-
debase-0.6.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
14
|
-
debase-0.6.1.dist-info/METADATA,sha256=fXvGhqDP5Bl33gTEvUvvjqNy-cXYs9jYFl1NyM5ALsc,4047
|
15
|
-
debase-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
16
|
-
debase-0.6.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
17
|
-
debase-0.6.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
18
|
-
debase-0.6.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|