debase 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/lineage_format.py CHANGED
@@ -30,6 +30,7 @@ from __future__ import annotations
30
30
 
31
31
  import argparse
32
32
  import csv
33
+ import difflib
33
34
  import json
34
35
  import logging
35
36
  import os
@@ -726,7 +727,49 @@ Only include matches you are confident about. If no match exists, omit that enzy
726
727
  gemini_matched_count += 1
727
728
  log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
728
729
  else:
729
- log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
730
+ # Try fuzzy matching when exact match fails
731
+ best_match = None
732
+ best_score = 0
733
+
734
+ # Try all possible keys in seq_lookup
735
+ for key in seq_lookup.keys():
736
+ if campaign_id in key: # Only consider keys from same campaign
737
+ # Extract enzyme_id part from composite key
738
+ try:
739
+ _, key_enzyme_id = key.split('_', 1)
740
+ except ValueError:
741
+ continue
742
+
743
+ # Calculate similarity score
744
+ score = difflib.SequenceMatcher(None, matched_id.lower(), key_enzyme_id.lower()).ratio()
745
+
746
+ # Always track the highest score
747
+ if score > best_score:
748
+ best_score = score
749
+ best_match = key
750
+
751
+ # Accept the best-scoring key only when its similarity exceeds 0.5; the score is logged whether or not it is accepted
752
+ if best_match and best_score > 0.5: # Lower threshold but log the score
753
+ idx = entry["idx"]
754
+ df.at[idx, "protein_sequence"] = seq_lookup[best_match]["aa_sequence"]
755
+ df.at[idx, "aa_sequence"] = seq_lookup[best_match]["aa_sequence"]
756
+ if seq_lookup[best_match]["nt_sequence"]:
757
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[best_match]["nt_sequence"]
758
+ df.at[idx, "nt_sequence"] = seq_lookup[best_match]["nt_sequence"]
759
+
760
+ # Also copy generation and parent_enzyme_id
761
+ df.at[idx, "generation"] = seq_lookup[best_match]["generation"]
762
+ df.at[idx, "parent_enzyme_id"] = seq_lookup[best_match]["parent_enzyme_id"]
763
+
764
+ # Store the match for later mutation copying
765
+ _, matched_enzyme = best_match.split('_', 1)
766
+ df.at[idx, "_matched_enzyme_id"] = matched_enzyme
767
+ df.at[idx, "_matched_campaign_id"] = campaign_id
768
+
769
+ gemini_matched_count += 1
770
+ log.info(f"Fuzzy matched '{substrate_id}' -> '{matched_enzyme}' (score: {best_score:.2f}) in campaign {campaign_id}")
771
+ else:
772
+ log.warning(f"No fuzzy match found for Gemini suggested '{matched_id}' in campaign {campaign_id} (best score: {best_score:.2f})")
730
773
 
731
774
  except Exception as e:
732
775
  log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
@@ -442,7 +442,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
442
442
  You are an expert reader of protein engineering manuscripts.
443
443
  Given the following article captions and section titles, identify most promising locations
444
444
  (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
445
- activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.
445
+ activity, etc.) for enzyme variants.
446
+
447
+ CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
448
+ - Look for locations showing data for ALL enzyme variants in the evolution lineage
449
+ - Prioritize sources that show the complete evolutionary progression (parent → child variants)
450
+ - Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
451
+ - Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
446
452
 
447
453
  IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
448
454
  performance data locations. Pay careful attention to:
@@ -450,8 +456,13 @@ performance data locations. Pay careful attention to:
450
456
  - Enzyme name prefixes that indicate different campaigns
451
457
  - Different substrate/product types mentioned in captions
452
458
 
459
+ IMPORTANT FIGURE REFERENCE RULES:
460
+ - For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
461
+ - The extraction system will handle retrieving the entire figure including all sub-panels
462
+ - For tables, return the complete reference as it appears
463
+
453
464
  Respond with a JSON array where each element contains:
454
- - "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
465
+ - "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
455
466
  - "type": one of "table", "figure"
456
467
  - "confidence": your confidence score (0-100)
457
468
  - "caption": the exact caption text for this location
@@ -459,7 +470,12 @@ Respond with a JSON array where each element contains:
459
470
  - "lineage_hint": any indication of which enzyme group this data is for (or null)
460
471
  - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
461
472
 
462
- Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
473
+ PRIORITIZATION RULES:
474
+ - HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
475
+ - MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
476
+ - LOWEST PRIORITY: Sources showing data for individual variants only
477
+
478
+ Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
463
479
 
464
480
  IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
465
481
 
@@ -503,6 +519,13 @@ IMPORTANT:
503
519
  - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
504
520
  - If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
505
521
 
522
+ CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
523
+ - Yield (%) measures how much product was formed (0-100%)
524
+ - Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
525
+ - TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
526
+ - These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
527
+ - Be extremely careful when extracting from tables/figures with multiple columns or data series
528
+
506
529
  Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
507
530
  """)
508
531
 
@@ -530,6 +553,17 @@ STEP 4: Extract values for each matched variant
530
553
  - CRITICAL: Read actual scale values from the axis labels and tick marks
531
554
  - Verify: taller bars should have higher values, higher dots should have higher values
532
555
 
556
+ CRITICAL DATA ACCURACY REQUIREMENTS:
557
+ - DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
558
+ - Yield is typically shown as percentage (0-100%)
559
+ - Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
560
+ - TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
561
+ - Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
562
+ - Carefully match each bar/dot to its corresponding enzyme label on the X-axis
563
+ - If looking at grouped bars, ensure you're reading the correct bar for each metric
564
+ - Double-check that variant A's yield is not confused with variant B's yield
565
+ - If values are unclear or ambiguous, return null rather than guessing
566
+
533
567
  Target enzymes to find and extract:
534
568
  {enzyme_names}
535
569
 
@@ -887,15 +921,22 @@ class ReactionExtractor:
887
921
  campaign_context = f"""
888
922
  IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
889
923
 
890
- Campaign Details:
924
+ CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
925
+ - Campaign ID: {self.campaign_info.get('campaign_id', '')}
891
926
  - Name: {self.campaign_info.get('campaign_name', '')}
892
927
  - Description: {self.campaign_info.get('description', '')}
893
928
  - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
894
929
  - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
930
+ - Notes: {self.campaign_info.get('notes', '')}
895
931
 
896
932
  KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
897
933
  These locations are known to contain relevant data - prioritize them highly.
898
934
 
935
+ CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
936
+ - Look for data showing the entire evolutionary progression of enzyme variants
937
+ - Prioritize locations that show performance data for ALL variants in the lineage
938
+ - The campaign description and notes above provide context about the evolution strategy used
939
+
899
940
  {f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
900
941
 
901
942
  CRITICAL: Only return locations that contain data for this specific campaign.
@@ -2558,41 +2599,17 @@ Do NOT include compound information from other campaigns.
2558
2599
  if not mapping or not mapping.iupac_name:
2559
2600
  missing_compounds.append(cid)
2560
2601
 
2561
- # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
2602
+ # Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
2562
2603
  if missing_compounds:
2563
- LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
2604
+ LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
2564
2605
  len(missing_compounds), sorted(missing_compounds))
2565
2606
 
2566
- # Get all available figures for compound structure analysis
2567
- figure_images = {}
2568
-
2569
- # Extract main manuscript figures
2570
- figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
2571
- for ref in figure_refs:
2572
- img_b64 = self._extract_page_png(ref, extract_figure_only=True)
2573
- if img_b64:
2574
- figure_images[ref] = img_b64
2575
- LOGGER.info("Retrieved %s for compound mapping", ref)
2576
-
2577
- # Get SI figures
2578
- si_figure_refs = []
2579
- for page in self.si_pages[:10]: # Check first 10 SI pages
2580
- matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
2581
- si_figure_refs.extend(matches[:10]) # Limit to 10 figures
2582
-
2583
- # Extract SI figures
2584
- for ref in set(si_figure_refs):
2585
- if ref not in figure_images:
2586
- img_b64 = self._extract_page_png(ref, extract_figure_only=True)
2587
- if img_b64:
2588
- figure_images[ref] = img_b64
2589
- LOGGER.info("Extracted %s for compound mapping", ref)
2590
-
2591
2607
  # Full text search including ALL pages (manuscript + SI)
2592
2608
  full_text = "\n\n".join(self.all_pages) # Send everything
2593
2609
 
2594
- final_mappings = self._extract_compound_mappings_with_figures(
2595
- full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
2610
+ # Use text-only extraction for Tier 2 (no images); only the first 100,000 characters of full_text are passed
2611
+ final_mappings = self._extract_compound_mappings_from_text(
2612
+ full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
2596
2613
  )
2597
2614
 
2598
2615
  # Merge final mappings with better compound ID matching
@@ -2933,34 +2950,6 @@ Different campaigns may use different model reactions and substrates.
2933
2950
  LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
2934
2951
  list(compound_mappings.keys()))
2935
2952
 
2936
- # First, populate IUPAC lists directly from compound mappings based on compound_type
2937
- substrate_iupacs_from_mappings = []
2938
- product_iupacs_from_mappings = []
2939
-
2940
- for mapping in compound_mappings.values():
2941
- if mapping.iupac_name and mapping.compound_type:
2942
- if mapping.compound_type.lower() == "substrate":
2943
- substrate_iupacs_from_mappings.append(mapping.iupac_name)
2944
- LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
2945
- elif mapping.compound_type.lower() == "product":
2946
- product_iupacs_from_mappings.append(mapping.iupac_name)
2947
- LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
2948
-
2949
- # Initialize or update the IUPAC lists with mapped compounds
2950
- if substrate_iupacs_from_mappings:
2951
- existing_substrates = data.get("substrate_iupac_list", []) or []
2952
- if isinstance(existing_substrates, list):
2953
- data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
2954
- else:
2955
- data["substrate_iupac_list"] = substrate_iupacs_from_mappings
2956
-
2957
- if product_iupacs_from_mappings:
2958
- existing_products = data.get("product_iupac_list", []) or []
2959
- if isinstance(existing_products, list):
2960
- data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
2961
- else:
2962
- data["product_iupac_list"] = product_iupacs_from_mappings
2963
-
2964
2953
  # Try to map substrate/product lists through compound IDs
2965
2954
  substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
2966
2955
  if isinstance(substrate_list, list):
@@ -3571,6 +3560,11 @@ def main() -> None:
3571
3560
  LOGGER.info("Loading enzyme data from CSV…")
3572
3561
  enzyme_df = pd.read_csv(args.lineage_csv)
3573
3562
 
3563
+ # Rename enzyme_id to enzyme if needed
3564
+ if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
3565
+ enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
3566
+ LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
3567
+
3574
3568
  # Detect campaign information from the enzyme CSV
3575
3569
  if 'campaign_id' in enzyme_df.columns:
3576
3570
  all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
@@ -3601,6 +3595,11 @@ def main() -> None:
3601
3595
  campaign_info=campaign_info)
3602
3596
  df_metrics = extractor.run(enzyme_df)
3603
3597
 
3598
+ # For single campaign, also merge with lineage data
3599
+ if not df_metrics.empty:
3600
+ df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
3601
+ LOGGER.info("Merged metrics with lineage data for single campaign")
3602
+
3604
3603
  elif len(all_campaigns) > 1:
3605
3604
  LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
3606
3605
  all_results = []
@@ -3651,6 +3650,10 @@ def main() -> None:
3651
3650
  # Merge campaign metrics with lineage data
3652
3651
  campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
3653
3652
 
3653
+ # Rename aa_seq to protein_sequence for consistency
3654
+ if 'aa_seq' in campaign_final.columns:
3655
+ campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
3656
+
3654
3657
  # Save campaign-specific file immediately
3655
3658
  output_dir = args.output.parent
3656
3659
  base_name = args.output.stem
@@ -3667,6 +3670,10 @@ def main() -> None:
3667
3670
  # Still save an empty campaign file with lineage data
3668
3671
  campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
3669
3672
  if not campaign_lineage.empty:
3673
+ # Rename aa_seq to protein_sequence for consistency
3674
+ if 'aa_seq' in campaign_lineage.columns:
3675
+ campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
3676
+
3670
3677
  output_dir = args.output.parent
3671
3678
  base_name = args.output.stem
3672
3679
  campaign_file = output_dir / f"{base_name}_{campaign}.csv"
@@ -3697,6 +3704,11 @@ def main() -> None:
3697
3704
  df_final = df_metrics
3698
3705
  LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
3699
3706
 
3707
+ # Rename aa_seq to protein_sequence for consistency
3708
+ if df_final is not None and 'aa_seq' in df_final.columns:
3709
+ df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
3710
+ LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
3711
+
3700
3712
  df_final.to_csv(args.output, index=False)
3701
3713
  LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
3702
3714
 
@@ -296,12 +296,14 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
296
296
 
297
297
  return "\n".join(chunks)
298
298
 
299
- def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:
299
+ def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: Optional[str] = None, document_hint: Optional[str] = None) -> Optional[str]:
300
300
  """Extract figure as a page region when embedded images aren't available.
301
301
 
302
302
  Args:
303
303
  pdf_paths: List of PDF paths to search
304
304
  figure_ref: Figure reference to search for (e.g., "Figure 3" or "Figure 3(a)")
305
+ caption_hint: Optional caption text to help identify the exact figure
306
+ document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
305
307
 
306
308
  Returns:
307
309
  Base64-encoded PNG string or None if not found
@@ -318,8 +320,20 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
318
320
  log.info("Extracting entire figure '%s' from reference '%s'", base_figure_ref, figure_ref)
319
321
  else:
320
322
  base_figure_ref = figure_ref
321
-
322
- for pdf_path in pdf_paths:
323
+
324
+ # Determine search order based on document hint
325
+ search_paths = list(pdf_paths) # Create a copy
326
+ if document_hint and len(pdf_paths) > 1:
327
+ if document_hint.lower() == "manuscript":
328
+ # Manuscript is taken to be the first PDF, so the resulting search order is the same as the input order
329
+ search_paths = [pdf_paths[0]] + pdf_paths[1:]
330
+ log.info("Prioritizing manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
331
+ elif document_hint.lower() == "supplementary":
332
+ # Prioritize SI (second PDF), then the manuscript; any PDFs beyond the first two are left out of the search
333
+ search_paths = [pdf_paths[1], pdf_paths[0]] if len(pdf_paths) > 1 else pdf_paths
334
+ log.info("Prioritizing supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
335
+
336
+ for pdf_path in search_paths:
323
337
  doc = _open_doc(pdf_path)
324
338
  try:
325
339
  for page_num in range(doc.page_count):
@@ -333,26 +347,38 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
333
347
  # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
334
348
  figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
335
349
 
336
- # Look for actual figure captions using regex patterns
337
- caption_patterns = [
338
- rf"^Figure\s+{re.escape(figure_num)}\.", # "Figure 3." at start of line
339
- rf"^Figure\s+{re.escape(figure_num)}:", # "Figure 3:" at start of line
340
- rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
341
- rf"Figure\s+{re.escape(figure_num)}\s*\.", # "Figure 3." anywhere
342
- rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
343
- ]
344
-
345
- for pattern in caption_patterns:
346
- matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
347
- if matches:
348
- # Found actual figure caption, get its position
349
- caption_text = matches.group(0)
350
- caption_instances = page.search_for(caption_text, quads=False)
350
+ # First try to find using caption hint if provided
351
+ if caption_hint and len(caption_hint) > 10:
352
+ # Try to find the exact caption text
353
+ caption_snippet = caption_hint[:100] # Use first 100 chars
354
+ if caption_snippet in page_text:
355
+ caption_instances = page.search_for(caption_snippet, quads=False)
351
356
  if caption_instances:
352
357
  caption_rect = caption_instances[0]
353
358
  found = True
354
- log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
355
- break
359
+ log.info("Found figure using caption hint on page %d", page_num + 1)
360
+
361
+ # If not found with hint, look for actual figure captions using regex patterns
362
+ if not found:
363
+ caption_patterns = [
364
+ rf"^Figure\s+{re.escape(figure_num)}\.", # "Figure 3." at start of line
365
+ rf"^Figure\s+{re.escape(figure_num)}:", # "Figure 3:" at start of line
366
+ rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
367
+ rf"Figure\s+{re.escape(figure_num)}\s*\.", # "Figure 3." anywhere
368
+ rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
369
+ ]
370
+
371
+ for pattern in caption_patterns:
372
+ matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
373
+ if matches:
374
+ # Found actual figure caption, get its position
375
+ caption_text = matches.group(0)
376
+ caption_instances = page.search_for(caption_text, quads=False)
377
+ if caption_instances:
378
+ caption_rect = caption_instances[0]
379
+ found = True
380
+ log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
381
+ break
356
382
 
357
383
  if not found:
358
384
  continue
@@ -1135,17 +1161,24 @@ Your task is to:
1135
1161
  4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
1136
1162
  5. Determine which enzyme variants from this campaign were tested in substrate scope studies
1137
1163
 
1164
+ IMPORTANT FIGURE REFERENCE RULES:
1165
+ - For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
1166
+ - Include the figure caption if available to help with identification
1167
+ - The extraction system will handle retrieving the entire figure including all sub-panels
1168
+
1138
1169
  Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
1139
1170
  [
1140
1171
  {{
1141
- "location": "Description of where the data is found",
1172
+ "location": "Main figure/table reference (e.g., 'Figure 2', 'Table S1', NOT 'Figure 2a')",
1142
1173
  "type": "table|figure|text",
1143
1174
  "confidence": 0.0-1.0,
1144
1175
  "enzyme_variants": ["list of enzyme IDs found"],
1145
1176
  "substrates_tested": ["list of substrates if identifiable"],
1146
1177
  "campaign_match": true/false,
1147
1178
  "is_substrate_scope": true/false,
1148
- "model_reaction_excluded": "reason why this is not a model reaction"
1179
+ "model_reaction_excluded": "reason why this is not a model reaction",
1180
+ "caption": "Include the figure/table caption if available",
1181
+ "document": "manuscript|supplementary - specify whether this location is in the main manuscript or supplementary information"
1149
1182
  }}
1150
1183
  ]
1151
1184
 
@@ -1865,22 +1898,28 @@ def extract_substrate_scope_entries_for_campaign(
1865
1898
  all_refs = []
1866
1899
 
1867
1900
  if locations:
1868
- # Include ALL locations, not just primary
1869
- location_strs = []
1870
- for loc in locations[:3]: # Up to 3 locations
1871
- loc_str = loc.get('location', '')
1872
- location_strs.append(loc_str)
1873
- all_refs.append(loc_str)
1901
+ # Sort locations by confidence and use only the PRIMARY (most confident) location
1902
+ sorted_locations = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
1903
+ primary_location = sorted_locations[0] if sorted_locations else None
1874
1904
 
1875
- extraction_hints = f"\nSubstrate scope locations for campaign {campaign_id}: {', '.join(location_strs)}"
1905
+ if primary_location:
1906
+ primary_ref = primary_location.get('location', '')
1907
+ all_refs = [primary_ref] # Only extract from primary location
1908
+
1909
+ extraction_hints = f"\nPRIMARY substrate scope location for campaign {campaign_id}: {primary_ref}"
1910
+ extraction_hints += f"\nLocation confidence: {primary_location.get('confidence', 0)}%"
1911
+ extraction_hints += f"\nLocation type: {primary_location.get('type', 'unknown')}"
1876
1912
 
1877
1913
  # Focus on campaign-specific enzyme variants
1878
1914
  extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"
1879
1915
 
1880
- # Extract text from ALL identified locations (like the original function did)
1916
+ # Extract text from ONLY the primary location
1881
1917
  extraction_texts = []
1882
1918
  figure_images = {}
1883
1919
 
1920
+ # Create a mapping of location strings to their full location data
1921
+ location_map = {loc.get('location', ''): loc for loc in locations}
1922
+
1884
1923
  for ref in all_refs:
1885
1924
  if ref and pdf_paths:
1886
1925
  ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
@@ -1889,7 +1928,10 @@ def extract_substrate_scope_entries_for_campaign(
1889
1928
 
1890
1929
  # Extract figure images for this reference (crop page around figure)
1891
1930
  try:
1892
- fig_base64 = extract_figure_image(pdf_paths, ref)
1931
+ # Get caption and document hints if available
1932
+ caption_hint = location_map.get(ref, {}).get('caption', '')
1933
+ document_hint = location_map.get(ref, {}).get('document', '')
1934
+ fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
1893
1935
  if fig_base64:
1894
1936
  figure_images[ref] = fig_base64
1895
1937
  log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
@@ -1942,6 +1984,14 @@ IMPORTANT INSTRUCTIONS:
1942
1984
  4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
1943
1985
  5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
1944
1986
 
1987
+ CRITICAL DATA ACCURACY REQUIREMENTS:
1988
+ - BE EXTREMELY CAREFUL about which substrate ID maps to which yield, TTN, and selectivity values
1989
+ - Each substrate entry should have its OWN yield, ee, and TTN values - do not mix up values between substrates
1990
+ - If looking at a table or figure, carefully match each substrate with its corresponding row/bar/data point
1991
+ - Double-check that substrate 1a's data is not confused with substrate 1b's data, etc.
1992
+ - If values are unclear or ambiguous for a specific substrate, return null rather than guessing
1993
+ - Pay special attention when extracting from figures - ensure you're reading the correct bar/point for each substrate
1994
+
1945
1995
  {extraction_hints}
1946
1996
 
1947
1997
  Return your analysis as JSON in this format:
@@ -2287,13 +2337,15 @@ def get_substrate_scope(
2287
2337
  if should_extract:
2288
2338
  figure_ref = location_str
2289
2339
  confidence = loc.get('confidence', 0)
2340
+ caption_hint = loc.get('caption', '')
2290
2341
  log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, location_type)
2291
2342
 
2292
2343
  # Use appropriate extraction function based on type
2293
2344
  if 'scheme' in location_str.lower() or location_type == 'scheme':
2294
2345
  figure_image = extract_scheme_image(pdf_paths, figure_ref)
2295
2346
  else:
2296
- figure_image = extract_figure_image(pdf_paths, figure_ref)
2347
+ document_hint = loc.get('document', '')
2348
+ figure_image = extract_figure_image(pdf_paths, figure_ref, caption_hint=caption_hint, document_hint=document_hint)
2297
2349
 
2298
2350
  if figure_image:
2299
2351
  log.info("Successfully extracted %s image for %s (%d bytes)",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.6.1
3
+ Version: 0.6.2
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,18 @@
1
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
+ debase/_version.py,sha256=t771GcmZTaJJGrIex6Ea6Q5pcMqVPIihCdRFRA1dMAM,49
4
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
+ debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
6
+ debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
7
+ debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
8
+ debase/enzyme_lineage_extractor.py,sha256=OXO2jUqAqF0pXrw17oIQERnek1uZ5gsFIuKRz4NMS1o,188556
9
+ debase/lineage_format.py,sha256=YWAP9OhFN3MQWbqk5gguX0C2cCwGvKJAtMq9pG5TJp8,59515
10
+ debase/reaction_info_extractor.py,sha256=kQBxPpzurjHXsHFWE_WM84ArSnc3E8f6xPMJpyTIGnU,188246
11
+ debase/substrate_scope_extractor.py,sha256=hRlt8iWOURmgW4SJHB1Svoh3TTa4fa9YIE8qVUZPnY0,122621
12
+ debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
13
+ debase-0.6.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
14
+ debase-0.6.2.dist-info/METADATA,sha256=gnPvTWvazrsdGrIKX8tA4Wwt8yKYph87POVKF25rkkg,4047
15
+ debase-0.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
+ debase-0.6.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
17
+ debase-0.6.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
18
+ debase-0.6.2.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
- debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
- debase/_version.py,sha256=Cbfy3WdPDTjtgnzdUc6e5F779YhAJJGX5LN-2SJMvCI,49
4
- debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
- debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
6
- debase/caption_pattern.py,sha256=nMLj2tK4MhD4jQ9d1IUDJ6xnY0MOx-UioIT-k_b3OWA,1770
7
- debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
8
- debase/enzyme_lineage_extractor.py,sha256=RKsjvcs6O2wnw2dpts3AynDRVKqMAeBVOMql2mayCGY,170120
9
- debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
10
- debase/reaction_info_extractor.py,sha256=qUrVi9chQcQG1zWwQlTbYF8dczvQqctdjwhvkAkBnZw,187032
11
- debase/substrate_scope_extractor.py,sha256=dikdEELi4RGlP2lGHcR93WdUbtIchOdHVB5G45BMCNk,118709
12
- debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
13
- debase-0.6.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
14
- debase-0.6.1.dist-info/METADATA,sha256=fXvGhqDP5Bl33gTEvUvvjqNy-cXYs9jYFl1NyM5ALsc,4047
15
- debase-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
- debase-0.6.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
17
- debase-0.6.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
18
- debase-0.6.1.dist-info/RECORD,,
File without changes