debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.6.0"
3
+ __version__ = "0.6.2"
@@ -0,0 +1,146 @@
1
+ """Utilities for handling campaign information across extractors.
2
+
3
+ This module provides functions to load and use campaign information
4
+ to improve extraction accuracy by providing context about model substrates,
5
+ products, and data locations.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import List, Dict, Optional, Any
12
+
13
logger = logging.getLogger(__name__)


def load_campaigns_from_file(campaign_file: Path) -> List[Dict[str, Any]]:
    """Load campaign information from a JSON file.

    The file must contain a JSON array of campaign objects. Loading is
    best-effort: a missing file, an unreadable file, malformed JSON, or a
    top-level value that is not a list is logged and yields an empty list.

    Args:
        campaign_file: Path to campaigns.json file

    Returns:
        List of campaign dictionaries, or an empty list if nothing could
        be loaded
    """
    if not campaign_file.exists():
        logger.warning(f"Campaign file not found: {campaign_file}")
        return []

    try:
        # Decode explicitly instead of relying on the locale default;
        # "utf-8-sig" reads plain UTF-8 and also tolerates a leading BOM,
        # which json.load would otherwise reject.
        with open(campaign_file, 'r', encoding='utf-8-sig') as f:
            campaigns = json.load(f)
    except (OSError, ValueError, RecursionError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # RecursionError covers pathologically nested documents.
        logger.error(f"Failed to load campaigns from {campaign_file}: {e}")
        return []

    # Callers iterate the result and call .get() on each item, so anything
    # other than a list (e.g. a single object) is treated as a load failure.
    if not isinstance(campaigns, list):
        logger.error(
            f"Failed to load campaigns from {campaign_file}: "
            f"expected a JSON list, got {type(campaigns).__name__}"
        )
        return []

    logger.info(f"Loaded {len(campaigns)} campaigns from {campaign_file}")
    return campaigns
37
+
38
+
39
def find_campaign_by_id(campaigns: List[Dict[str, Any]], campaign_id: str) -> Optional[Dict[str, Any]]:
    """Look up a campaign by its ``campaign_id`` value.

    Args:
        campaigns: List of campaign dictionaries
        campaign_id: Campaign ID to search for

    Returns:
        The first dictionary whose 'campaign_id' entry equals
        ``campaign_id``, or None when no entry matches
    """
    matches = (
        candidate
        for candidate in campaigns
        if candidate.get('campaign_id') == campaign_id
    )
    # The generator is lazy, so scanning stops at the first match.
    return next(matches, None)
53
+
54
+
55
def get_campaign_context(campaign: Dict[str, Any]) -> str:
    """Render a campaign dictionary as a multi-line context string for prompts.

    The campaign name and description lines are always emitted; every other
    line is emitted only when the corresponding entry is present and truthy.

    Args:
        campaign: Campaign dictionary

    Returns:
        Newline-joined context lines for inclusion in prompts
    """
    name = campaign.get('campaign_name', 'Unknown')
    description = campaign.get('description', '')
    rendered = [f"Campaign: {name}", f"Description: {description}"]

    # Model reaction: substrate and product, each with its identifier
    substrate = campaign.get('model_substrate')
    if substrate:
        substrate_id = campaign.get('substrate_id', 'unknown')
        rendered.append(f"Model Substrate: {substrate} (ID: {substrate_id})")

    product = campaign.get('model_product')
    if product:
        product_id = campaign.get('product_id', 'unknown')
        rendered.append(f"Model Product: {product} (ID: {product_id})")

    # Where the relevant data lives in the paper
    data_locations = campaign.get('data_locations')
    if data_locations:
        joined_locations = ', '.join(data_locations)
        rendered.append(f"Key Data Locations: {joined_locations}")

    # Optional free-text fields
    lineage_hint = campaign.get('lineage_hint')
    if lineage_hint:
        rendered.append(f"Evolution Pathway: {lineage_hint}")

    notes = campaign.get('notes')
    if notes:
        rendered.append(f"Notes: {notes}")

    return '\n'.join(rendered)
90
+
91
+
92
def get_location_hints_for_campaign(campaign: Dict[str, Any]) -> List[str]:
    """Return the campaign's 'data_locations' entry.

    Args:
        campaign: Campaign dictionary

    Returns:
        The stored value as-is (e.g., ["Figure 2a", "Table S4"]), or an
        empty list when the key is absent
    """
    location_hints = campaign.get('data_locations', [])
    return location_hints
102
+
103
+
104
def enhance_prompt_with_campaign(prompt: str, campaign: Optional[Dict[str, Any]],
                                 section_name: str = "CAMPAIGN CONTEXT") -> str:
    """Splice a campaign context section into a prompt.

    The section holds the text from get_campaign_context(), an optional
    "IMPORTANT" line listing the campaign's location hints, and dashed rules
    above and below. It is inserted before the first line after the sixth
    that is blank or starts with 'Given' or 'You'; if no such line exists it
    is prepended to the prompt.

    Args:
        prompt: Original prompt
        campaign: Campaign dictionary (optional)
        section_name: Section header for the campaign context

    Returns:
        The prompt unchanged when ``campaign`` is falsy, otherwise the
        prompt with the campaign section added
    """
    if not campaign:
        return prompt

    divider = '-' * 50
    pieces = [f"\n\n{section_name}:\n{divider}\n{get_campaign_context(campaign)}"]

    hints = get_location_hints_for_campaign(campaign)
    if hints:
        pieces.append(f"\n\nIMPORTANT: Focus particularly on these locations: {', '.join(hints)}")

    pieces.append(f"\n{divider}\n")
    section = ''.join(pieces)

    # Skip the first six lines so the section lands after the opening
    # instructions, then take the first blank or 'Given'/'You' line.
    prompt_lines = prompt.split('\n')
    position = next(
        (
            index
            for index, text in enumerate(prompt_lines)
            if index > 5 and (not text.strip() or text.startswith(('Given', 'You')))
        ),
        None,
    )

    if position is None:
        # No suitable line found: put the section in front of the prompt.
        return section + prompt

    prompt_lines.insert(position, section)
    return '\n'.join(prompt_lines)
@@ -0,0 +1,44 @@
1
+ """Universal caption pattern for all DEBase extractors.
2
+
3
+ This module provides a consistent caption pattern that handles various
4
+ formats found in scientific papers, including:
5
+ - Standard formats: Figure 1, Fig. 1, Table 1
6
+ - Supplementary formats: Supplementary Figure 1, Supp. Table 1
7
+ - Extended data: Extended Data Figure 1, ED Fig. 1
8
+ - Other types: Scheme 1, Chart 1
9
+ - Page headers: S14 Table 5
10
+ - Various punctuation: Figure 1. Figure 1: Figure 1 |
11
+ - Inline captions: ...text Table 1. Caption text...
12
+ """
13
+
14
+ import re
15
+
16
# Universal caption pattern that handles all common formats
# Now includes both start-of-line and inline caption patterns
#
# Flags: re.I (case-insensitive), re.X (verbose: whitespace and "#" comments
# inside the pattern are ignored), re.M ("^" matches at the start of each line).
#
# Structure of a match:
#   1. Position: either the start of a line followed by up to 20 lazily
#      matched non-newline characters, or any point preceded by an
#      alphanumeric or whitespace character.
#   2. Group 1 (the only capturing group): the caption label, i.e. the
#      optional "Extended Data" / "ED" / "Supplementary" / "Supp" / "Suppl"
#      prefixes plus one of Table, Fig, Figure, Scheme, Chart.
#   3. Uncaptured tail: either a number (digits with optional leading "S" and
#      one trailing letter, or a run of I/V/X) plus optional ".", ":" or "|"
#      punctuation, or a bare period directly after the label.
#
# NOTE(review): because of re.I, the roman-numeral class also accepts
# lowercase i/v/x, and because the look-behind accepts a preceding letter,
# labels embedded in longer words (e.g. "stable 1") match as well. This may
# be intentional for text where words run together; confirm with callers.
UNIVERSAL_CAPTION_PATTERN = re.compile(
    r"""
    (?: # Non-capturing group for position
    ^[^\n]{0,20}? # Start of line with up to 20 chars before
    | # OR
    (?<=[a-zA-Z0-9\s]) # Look-behind for alphanumeric or space (for inline)
    )
    ( # Start capture group
    (?:Extended\s+Data\s+)? # Optional "Extended Data" prefix
    (?:ED\s+)? # Optional "ED" prefix
    (?:Supplementary|Supp\.?|Suppl\.?)?\s* # Optional supplementary prefixes
    (?:Table|Fig(?:ure)?|Scheme|Chart) # Main caption types
    ) # End capture group
    (?: # Non-capturing group for what follows
    \s* # Optional whitespace
    (?:S?\d+[A-Za-z]?|[IVX]+) # Number (with optional S prefix or roman)
    (?:[.:|]|\s+\|)? # Optional punctuation (. : or |)
    | # OR
    \. # Just a period (for "Fig." without number)
    )
    """,
    re.I | re.X | re.M
)
41
+
42
def get_universal_caption_pattern():
    """Return the module's pre-compiled caption regex.

    Returns:
        The UNIVERSAL_CAPTION_PATTERN object itself (not a copy), for use
        in extractors
    """
    caption_pattern = UNIVERSAL_CAPTION_PATTERN
    return caption_pattern
@@ -1016,13 +1016,38 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
1016
1016
 
1017
1017
  if source_enzyme_id:
1018
1018
  # Find the source enzyme's sequence in the dataframe
1019
+ # Prefer sequences from OTHER campaigns (not the current empty campaign)
1019
1020
  source_rows = df[df['enzyme_id'] == source_enzyme_id]
1020
1021
  if source_rows.empty:
1021
1022
  log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
1022
1023
  else:
1023
- source_sequence = str(source_rows.iloc[0]['protein_sequence']).strip()
1024
- if not source_sequence or source_sequence.lower() in ["nan", "none", ""]:
1025
- log.warning(f"Source enzyme {source_enzyme_id} has no sequence")
1024
+ # Look for a row with a sequence, preferring other campaigns
1025
+ source_sequence = None
1026
+ source_row_idx = None
1027
+
1028
+ # First, try to find a row with sequence from a different campaign
1029
+ for idx, row in source_rows.iterrows():
1030
+ seq = str(row['protein_sequence']).strip()
1031
+ if seq and seq.lower() not in ["nan", "none", ""]:
1032
+ # Check if this is from a different campaign
1033
+ if row['campaign_id'] != campaign_id:
1034
+ source_sequence = seq
1035
+ source_row_idx = idx
1036
+ log.info(f"Found source sequence for {source_enzyme_id} from campaign {row['campaign_id']}")
1037
+ break
1038
+
1039
+ # If not found in other campaigns, try any row with sequence
1040
+ if not source_sequence:
1041
+ for idx, row in source_rows.iterrows():
1042
+ seq = str(row['protein_sequence']).strip()
1043
+ if seq and seq.lower() not in ["nan", "none", ""]:
1044
+ source_sequence = seq
1045
+ source_row_idx = idx
1046
+ log.info(f"Found source sequence for {source_enzyme_id} from same campaign {row['campaign_id']}")
1047
+ break
1048
+
1049
+ if not source_sequence:
1050
+ log.warning(f"Source enzyme {source_enzyme_id} has no sequence in any campaign")
1026
1051
  else:
1027
1052
  # Find the target enzyme in our empty list
1028
1053
  seed_found = False
@@ -1031,7 +1056,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
1031
1056
  if relationship_type == "EXACT_MATCH":
1032
1057
  # Exact match - copy sequence directly
1033
1058
  df.at[entry['idx'], 'protein_sequence'] = source_sequence
1034
- df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_exact"
1059
+ current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
1060
+ df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_exact"
1035
1061
  log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
1036
1062
  seed_found = True
1037
1063
 
@@ -1045,7 +1071,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
1045
1071
 
1046
1072
  if success:
1047
1073
  df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
1048
- df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent"
1074
+ current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
1075
+ df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent"
1049
1076
  log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
1050
1077
  seed_found = True
1051
1078
  else:
@@ -1053,7 +1080,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
1053
1080
  else:
1054
1081
  # No mutations - use parent sequence directly
1055
1082
  df.at[entry['idx'], 'protein_sequence'] = source_sequence
1056
- df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent_no_mutations"
1083
+ current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
1084
+ df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent_no_mutations"
1057
1085
  log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
1058
1086
  seed_found = True
1059
1087
  break