debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,7 @@ import time
 import logging
 from pathlib import Path
 from dataclasses import dataclass, field
-from typing import List, Optional, Union, Tuple
+from typing import List, Optional, Union, Tuple, Dict, Any

 MODEL_NAME: str = "gemini-2.5-flash"
 MAX_CHARS: int = 150_000  # Max characters sent to LLM
@@ -142,21 +142,36 @@ def extract_text(pdf_path: str | Path | bytes) -> str:


 def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
-    """Extract figure/table captions using the improved regex.
+    """Extract ALL figure/table captions with extensive surrounding context.

     The function scans every text line on every page and keeps lines whose first
     token matches `_CAPTION_PREFIX_RE`. This covers labels such as:
-      * Fig. 1, Figure 2A, Extended Data Fig 3
+      * Fig. 1, Figure 2A, Figure 2B, Figure 2C (ALL sub-captions)
       * Table S1, Table 4, Scheme 2, Chart 1B
-      * Supplementary Fig. S5, Supp Table 2
+      * Supplementary Fig. S5A, S5B, S5C (ALL variations)
+
+    For SI documents, includes extensive context since understanding what each
+    section contains is crucial for accurate location identification.
     """

     doc = _open_doc(pdf_path)
     captions: list[str] = []
     try:
-        for page in doc:
+        for page_num, page in enumerate(doc):
             page_dict = page.get_text("dict")
+
+            # Get all text blocks on this page for broader context
+            page_text_blocks = []
             for block in page_dict.get("blocks", []):
+                block_text = ""
+                for line in block.get("lines", []):
+                    text_line = "".join(span["text"] for span in line.get("spans", []))
+                    if text_line.strip():
+                        block_text += text_line.strip() + " "
+                if block_text.strip():
+                    page_text_blocks.append(block_text.strip())
+
+            for block_idx, block in enumerate(page_dict.get("blocks", [])):
                 # Get all lines in this block
                 block_lines = []
                 for line in block.get("lines", []):
@@ -166,21 +181,94 @@ def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -
                 # Check if any line starts with a caption prefix
                 for i, line in enumerate(block_lines):
                     if _CAPTION_PREFIX_RE.match(line):
-                        # Found a caption start - collect this line and subsequent lines
-                        # until we hit an empty line or the end of the block
+                        context_parts = []
+
+                        # Add page context for SI documents (more critical there)
+                        context_parts.append(f"Page {page_num + 1}")
+
+                        # Add extensive context before the caption (5-7 lines for SI context)
+                        context_before = []
+
+                        # First try to get context from current block
+                        for k in range(max(0, i-7), i):
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_before.append(block_lines[k])
+
+                        # If not enough context, look at previous text blocks on the page
+                        if len(context_before) < 3 and block_idx > 0:
+                            prev_block_text = page_text_blocks[block_idx - 1] if block_idx < len(page_text_blocks) else ""
+                            if prev_block_text:
+                                # Get last few sentences from previous block
+                                sentences = prev_block_text.split('. ')
+                                context_before = sentences[-2:] + context_before if len(sentences) > 1 else [prev_block_text] + context_before
+
+                        if context_before:
+                            # Include more extensive context for better understanding
+                            context_text = " ".join(context_before[-5:])  # Last 5 lines/sentences of context
+                            context_parts.append("Context: " + context_text)
+
+                        # Extract the COMPLETE caption including all sub-parts
                         caption_parts = [line]
-                        for j in range(i + 1, len(block_lines)):
+                        j = i + 1
+
+                        # Continue collecting caption text until we hit a clear break
+                        while j < len(block_lines):
                             next_line = block_lines[j]
-                            if not next_line:  # Empty line signals end of caption
-                                break
-                            # Check if next line is a new caption
+
+                            # Stop if we hit an empty line followed by non-caption text
+                            if not next_line:
+                                # Check if the line after empty is a new caption
+                                if j + 1 < len(block_lines) and _CAPTION_PREFIX_RE.match(block_lines[j + 1]):
+                                    break
+                                # If next non-empty line is not a caption, continue collecting
+                                elif j + 1 < len(block_lines):
+                                    j += 1
+                                    continue
+                                else:
+                                    break
+
+                            # Stop if we hit a new caption
                             if _CAPTION_PREFIX_RE.match(next_line):
                                 break
+
+                            # Include this line as part of the caption
                             caption_parts.append(next_line)
+                            j += 1

-                        # Join the parts with spaces
+                        # Join the caption parts
                         full_caption = " ".join(caption_parts)
-                        captions.append(full_caption)
+                        context_parts.append("Caption: " + full_caption)
+
+                        # Add extensive context after the caption (especially important for SI)
+                        context_after = []
+
+                        # Look for descriptive text following the caption
+                        for k in range(j, min(len(block_lines), j + 10)):  # Look ahead up to 10 lines
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_after.append(block_lines[k])
+
+                        # If not enough context, look at next text blocks
+                        if len(context_after) < 3 and block_idx + 1 < len(page_text_blocks):
+                            next_block_text = page_text_blocks[block_idx + 1]
+                            if next_block_text:
+                                # Get first few sentences from next block
+                                sentences = next_block_text.split('. ')
+                                context_after.extend(sentences[:3] if len(sentences) > 1 else [next_block_text])
+
+                        if context_after:
+                            # Include extensive following context
+                            following_text = " ".join(context_after[:7])  # First 7 lines of following context
+                            context_parts.append("Following: " + following_text)
+
+                        # For SI documents, add section context if this appears to be a section header
+                        if any(keyword in full_caption.lower() for keyword in ['supplementary', 'supporting', 'si ', 's1', 's2', 's3']):
+                            context_parts.append("SI_SECTION: This appears to be supplementary material content")
+
+                        # Combine all parts with proper separation
+                        full_caption_with_context = " | ".join(context_parts)
+                        captions.append(full_caption_with_context)
     finally:
         doc.close()

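With this change, each element of `captions` becomes a pipe-delimited record ("Page N | Context: ... | Caption: ... | Following: ...") rather than the bare caption. A minimal sketch of consuming that shape downstream; the record text here is invented for illustration:

# Minimal sketch, assuming the pipe-delimited record shape produced above.
record = "Page 3 | Context: assay conditions | Caption: Figure 2A. Activity of variants. | Following: Bars show mean of three replicates."
fields = record.split(" | ")
caption = next(f[len("Caption: "):] for f in fields if f.startswith("Caption: "))
print(caption)  # -> "Figure 2A. Activity of variants."
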
@@ -639,17 +727,18 @@ Return a JSON object with:

 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
+{campaign_context}
 Given the following article text, list up to {max_results} *locations* (page
 numbers, figure/table IDs, or section headings) that you would review first to
 find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced).
+came from which parent and what mutations were introduced){campaign_specific}.

 Respond with a JSON array of objects, each containing:
 - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
 - "type": one of "table", "figure", "text", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
-
+{campaign_field}
 IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")

 Order by confidence score (highest first). Tables showing complete variant lineages or
@@ -660,9 +749,9 @@ Don't include oligonucleotide results or result from only one round.

 Example output:
 [
-  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"}}
+  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
+  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
 ]
 """.strip()

@@ -827,7 +916,39 @@ def identify_evolution_locations(

     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
-    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
+
+    # Add campaign context if provided
+    campaign_context = ""
+    campaign_specific = ""
+    campaign_field = ""
+    campaign_example = ""
+
+    if campaigns and len(campaigns) == 1:
+        # Single campaign - make it specific
+        camp = campaigns[0]
+        campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
+        if hasattr(camp, 'notes') and camp.notes:
+            campaign_context += f"- Key identifiers: {camp.notes}\n"
+        campaign_specific = f" for the '{camp.campaign_name}' campaign"
+        campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
+        campaign_example = f', "campaign_id": "{camp.campaign_id}"'
+    elif campaigns and len(campaigns) > 1:
+        # Multiple campaigns - list them all
+        campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
+        for camp in campaigns:
+            campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
+        campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
+        campaign_specific = " for any of the identified campaigns"
+        campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
+        campaign_example = ', "campaign_id": "campaign_id_here"'
+
+    prompt = _LINEAGE_LOC_PROMPT.format(
+        campaign_context=campaign_context,
+        max_results=max_results,
+        campaign_specific=campaign_specific,
+        campaign_field=campaign_field,
+        campaign_example=campaign_example
+    ) + "\n\nTEXT:\n" + combined_text
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
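As a sketch of how the new placeholders compose in the single-campaign branch (the campaign name and id below are invented, not values from the package):

# Hypothetical rendering of the campaign placeholders for one campaign:
campaign_specific = " for the 'epoxidation' campaign"
campaign_example = ', "campaign_id": "campaign_1"'
# The first example line in the formatted prompt then reads roughly:
# {"location": "Table S1", "type": "table", "confidence": 95,
#  "reason": "Variant lineage table", "campaign_id": "campaign_1"}
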
@@ -1290,6 +1411,138 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

 # ---- 6.4 Public API -------------------------------------------------------

+def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text."""
+    import re
+
+    if location_type == 'table':
+        # Find ALL mentions of this table and combine them
+        location_clean = location.strip()
+
+        # Different ways the table might be referenced
+        search_patterns = [
+            location_clean,  # Exact match
+            location_clean.replace("Supplementary ", "Supp. "),  # Common abbreviation
+            location_clean.replace("Supplementary ", "S"),  # E.g., "Table S3"
+            location_clean.replace("Supplementary Table ", "Table S"),  # Another common format
+        ]
+
+        # Collect all occurrences
+        all_occurrences = []
+        seen_positions = set()
+
+        for search_term in search_patterns:
+            pattern = re.compile(re.escape(search_term), re.IGNORECASE)
+            for match in pattern.finditer(full_text):
+                # Avoid duplicates from overlapping patterns
+                if match.start() in seen_positions:
+                    continue
+                seen_positions.add(match.start())
+
+                # Extract generous context around each mention
+                start = max(0, match.start() - 1000)
+                end = min(len(full_text), match.end() + 10000)
+                context = full_text[start:end]
+
+                all_occurrences.append({
+                    'position': match.start(),
+                    'context': context,
+                    'match': match.group()
+                })
+
+        if not all_occurrences:
+            log.warning(f"No occurrences of table '{location_clean}' found in text")
+            return None
+
+        log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")
+
+        # Combine all occurrences into one text for Gemini to analyze
+        combined_text = f"=== All occurrences of {location_clean} ===\n\n"
+
+        for i, occurrence in enumerate(all_occurrences, 1):
+            combined_text += f"--- Occurrence {i} at position {occurrence['position']} ---\n"
+            combined_text += occurrence['context']
+            combined_text += "\n\n"
+
+        # Limit total length to avoid overwhelming the model
+        if len(combined_text) > 50000:
+            combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+
+        return combined_text
+
+    elif location_type == 'figure':
+        # For figures, we mainly want the caption and any text description
+        location_clean = location.strip()
+        patterns = [
+            rf'({re.escape(location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})',
+            rf'(Figure\s+S?\d+[^\n]*{re.escape(location_clean.split()[-1] if location_clean.split() else location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                # For figures, include surrounding context as the data might be described nearby
+                start = max(0, match.start() - 1000)
+                end = min(match.end() + 2000, len(full_text))
+                return full_text[start:end]
+
+    elif location_type == 'section':
+        # Look for section heading
+        location_clean = location.strip()
+        patterns = [
+            # Section with number
+            rf'((?:^|\n)\d+\.?\s*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]).*\n){{0,500}})',
+            # Section without number
+            rf'((?:^|\n){re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]|\n[A-Z]{{2,}}).*\n){{0,500}})',
+            # More flexible section matching
+            rf'((?:^|\n)[^\n]*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+|\n[A-Z]{{2,}}).*\n){{0,500}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                return match.group(1)
+
+    elif location_type == 'text':
+        # Try to find the location as a page marker or general text
+        if location.isdigit():
+            # Page number - look for page markers
+            page_num = int(location)
+            # Look for page breaks or page numbers
+            patterns = [
+                rf'(?:^|\n)\s*-?\s*{page_num}\s*-?\s*\n((?:.*\n){{0,300}})',
+                rf'(?:page|p\.?)\s*{page_num}[^\n]*\n((?:.*\n){{0,300}})',
+                rf'\n{page_num}\n((?:.*\n){{0,300}})'
+            ]
+            for pattern in patterns:
+                match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE)
+                if match:
+                    start = match.start()
+                    end = min(start + 15000, len(full_text))
+                    return full_text[start:end]
+
+    # Fallback: try fuzzy search for the location string
+    location_words = location.split()
+    if len(location_words) >= 2:
+        # Try to find at least the first two words together
+        search_pattern = rf'{re.escape(location_words[0])}\s+{re.escape(location_words[1])}'
+        match = re.search(search_pattern, full_text, re.IGNORECASE)
+        if match:
+            start = max(0, match.start() - 500)
+            end = min(match.start() + 8000, len(full_text))
+            return full_text[start:end]
+
+    # Last resort: find any occurrence of the location string
+    idx = full_text.lower().find(location.lower())
+    if idx != -1:
+        start = max(0, idx - 500)
+        end = min(idx + 8000, len(full_text))
+        return full_text[start:end]
+
+    log.warning(f"Could not find location '{location}' of type '{location_type}' in text")
+    return None
+
+
 def get_lineage(
     caption_text: str,
     full_text: str,
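A short usage sketch of the new helper; the location name is illustrative, and `full_text` is whatever the caller already extracted from the PDFs:

# Hypothetical call site for the new helper:
snippet = _extract_location_text(full_text, "Table S1", "table")
if snippet:
    log.info("Collected %d chars covering every 'Table S1' mention", len(snippet))
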
@@ -1328,63 +1581,220 @@ def get_lineage(
         campaigns = [default_campaign]
         log.info(f"Created default campaign: {default_campaign.campaign_name}")

-    # Use captions for identification - they're concise and focused
-    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
-
     all_variants = []

     if campaigns:
-        # If we have campaigns but no specific locations, use general extraction
-        if not locations:
-            log.info("No specific lineage locations found, extracting from full text with campaign context")
-            # Extract lineage for each campaign using full text
+        log.info("Using campaign-aware location identification")
+
+        # Process each campaign separately
+        for campaign in campaigns:
+            log.info(f"\nProcessing campaign: {campaign.campaign_id} - {campaign.campaign_name}")
+
+            # Use identify_evolution_locations with campaign context
+            locations = identify_evolution_locations(
+                caption_text,
+                model,
+                max_results=5,
+                debug_dir=debug_dir,
+                campaigns=[campaign],  # Pass single campaign for focused search
+                pdf_paths=pdf_paths
+            )
+
+            if not locations:
+                log.warning(f"No locations found for campaign {campaign.campaign_id}, trying full text extraction")
+                # Fall back to full text extraction
+                campaign_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+                all_variants.extend(campaign_variants)
+                continue
+
+            log.info(f"Found {len(locations)} potential locations for campaign {campaign.campaign_id}")
+            for loc in locations:
+                log.info(f"  - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")
+
+            # Try to extract from the best location
+            extracted_variants = []
+            for location in locations:
+                if extracted_variants:
+                    break  # Already got variants
+
+                location_str = location.get('location', '')
+                location_type = location.get('type', '')
+                confidence = location.get('confidence', 0)
+
+                # Try figure extraction for high-confidence figures
+                if location_type == 'figure' and confidence >= 70 and pdf_paths:
+                    log.info(f"Attempting to extract figure: {location_str}")
+
+                    figure_bytes = None
+                    for pdf_path in pdf_paths:
+                        figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+                        if figure_bytes:
+                            log.info(f"Successfully extracted figure from {pdf_path.name}")
+                            break
+
+                    if figure_bytes:
+                        # Save figure if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            debug_path.mkdir(parents=True, exist_ok=True)
+                            figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+                            _dump(figure_bytes, figure_file)
+                            log.info(f"Saved figure to: {figure_file}")
+
+                        # Extract lineage from figure
+                        variants = extract_lineage_from_figure(
+                            figure_bytes, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from figure")
+                            extracted_variants = variants
+                            continue
+
+                # Try table/text extraction
+                if location_type in ['table', 'text', 'section'] and not extracted_variants:
+                    log.info(f"Attempting text extraction for {location_type}: {location_str}")
+
+                    # Extract the specific section/table from full text
+                    section_text = _extract_location_text(full_text, location_str, location_type)
+                    if section_text:
+                        log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
+                        # Save extracted section if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            section_file = debug_path / f"extracted_{location_type}_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
+                            _dump(f"=== EXTRACTED {location_type.upper()} ===\nLocation: {location_str}\nLength: {len(section_text)} chars\n{'='*80}\n\n{section_text}", section_file)
+
+                        variants = extract_complete_lineage(
+                            section_text, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign,
+                            pdf_paths=pdf_paths
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from {location_type}")
+                            extracted_variants = variants
+                    else:
+                        log.warning(f"Could not extract text from {location_type}: {location_str}")
+
+            # If no variants extracted from specific locations, try full text
+            if not extracted_variants:
+                log.warning(f"Could not extract from specific locations, trying full text for campaign {campaign.campaign_id}")
+                extracted_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+
+            all_variants.extend(extracted_variants)
+
+        return all_variants, campaigns
+
+    # Original fallback code for when no campaigns are identified
+    log.info("Processing campaigns with direct caption and TOC analysis (skipping global location finding)")
+
+    # Prepare all captions and TOC with context for campaign-specific selection
+    caption_entries = []
+
+    # Add table of contents entries if available
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            try:
+                import fitz  # PyMuPDF
+                doc = fitz.open(pdf_path)
+                toc_text = ""
+                for page_num in range(min(5, doc.page_count)):  # First 5 pages
+                    page = doc[page_num]  # Correct PyMuPDF syntax
+                    page_text = page.get_text()
+                    if any(keyword in page_text.lower() for keyword in ['contents', 'table of contents', 'overview']):
+                        toc_text += f"\n--- Page {page_num + 1} TOC ---\n{page_text}\n"
+                doc.close()
+                if toc_text:
+                    toc_sections.append(toc_text)
+            except Exception as e:
+                log.warning(f"Failed to extract TOC from {pdf_path}: {e}")
+
+        if toc_sections:
+            caption_entries.append({
+                'type': 'table_of_contents',
+                'location': 'Table of Contents',
+                'context': '\n'.join(toc_sections)[:1000] + "..."
+            })
+
+    # Parse figure and table captions from caption_text
+    # Split by common caption patterns
+    caption_patterns = [
+        r'(?:^|\n)(?:Figure|Fig\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Table|Tab\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Scheme|Sch\.?)\s*\d+[:\.]'
+    ]
+
+    import re
+    for pattern in caption_patterns:
+        matches = list(re.finditer(pattern, caption_text, re.MULTILINE | re.IGNORECASE))
+        for i, match in enumerate(matches):
+            start_pos = match.start()
+            # Find the end of this caption (start of next caption or end of text)
+            if i + 1 < len(matches):
+                end_pos = matches[i + 1].start()
+            else:
+                end_pos = min(start_pos + 2000, len(caption_text))  # Max 2000 chars per caption
+
+            caption_content = caption_text[start_pos:end_pos].strip()
+            if len(caption_content) > 20:  # Skip very short captions
+                # Extract context from full text around this caption
+                context_start = max(0, full_text.find(caption_content[:100]) - 500)
+                context_end = min(len(full_text), context_start + 2000)
+                context = full_text[context_start:context_end]
+
+                caption_entries.append({
+                    'type': 'figure' if 'fig' in pattern.lower() else 'table' if 'tab' in pattern.lower() else 'scheme',
+                    'location': caption_content.split('\n')[0][:100] + "..." if len(caption_content.split('\n')[0]) > 100 else caption_content.split('\n')[0],
+                    'context': context
+                })
+
+    log.info(f"Prepared {len(caption_entries)} caption/TOC entries for campaign-specific analysis")
+
+    # If no caption entries found, fall back to full text extraction
+    if not caption_entries:
+        log.info("No caption entries found, extracting from full text with campaign context")
         for campaign in campaigns:
             log.info(f"Processing campaign: {campaign.campaign_id}")
-            campaign_variants = extract_campaign_lineage(
-                full_text, model, campaign_id=campaign.campaign_id,
-                debug_dir=debug_dir, pdf_paths=pdf_paths,
-                campaign_info=campaign
+            campaign_variants = extract_complete_lineage(
+                full_text, model,
+                debug_dir=debug_dir,
+                campaign_id=campaign.campaign_id,
+                campaign_info=campaign,
+                pdf_paths=pdf_paths
             )
             all_variants.extend(campaign_variants)
         return all_variants, campaigns
-    # Original logic for when we have both locations and campaigns
-    # Log location information
-    location_summary = []
-    for loc in locations[:5]:
-        if isinstance(loc, dict):
-            location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
-        else:
-            location_summary.append(str(loc))
-    log.info("Gemini identified %d potential lineage locations: %s",
-             len(locations), ", ".join(location_summary))
-
-    # Extract context around each location for better decision making
-    locations_with_context = []
-    for loc in locations:
-        location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-        # Extract 1000 chars of context around the location
-        context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
-        locations_with_context.append({
-            'location': loc,
-            'context': context_text  # Full extracted context
-        })
-
-    # For each campaign, ask Gemini to select the best location
+
+    # For each campaign, ask Gemini to select the best location from captions/TOC
     for campaign in campaigns:
         log.info(f"Processing campaign: {campaign.campaign_id}")

-        # Build locations context string
+        # Build locations context string from caption entries
         locations_str = ""
-        for i, loc_ctx in enumerate(locations_with_context):
-            loc = loc_ctx['location']
-            context = loc_ctx['context']
-            location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-            location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
-            confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
-            reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+        for i, entry in enumerate(caption_entries):
+            location_str = entry['location']
+            location_type = entry['type']
+            context = entry['context']

-            locations_str += f"\n{i+1}. {location_str} (Type: {location_type}, Confidence: {confidence})\n"
-            locations_str += f"   Reason: {reason}\n"
+            locations_str += f"\n{i+1}. {location_str} (Type: {location_type})\n"
             locations_str += f"   Context (first 500 chars):\n   {context[:500]}...\n"

         # Ask Gemini to select best location for this campaign
@@ -1426,26 +1836,39 @@ def get_lineage(

             log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")

-            # Find the actual location object
-            for loc in locations:
-                loc_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-                if loc_str == selected_location:
-                    primary_location = loc
+            # Find the actual caption entry
+            selected_entry = None
+            for entry in caption_entries:
+                if entry['location'] == selected_location:
+                    selected_entry = entry
                     break

-            if not primary_location:
-                log.warning(f"Could not find selected location '{selected_location}' in locations list")
-                # Fall back to highest confidence location
-                primary_location = sorted(locations,
-                                          key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                          reverse=True)[0] if locations else None
+            if not selected_entry:
+                log.warning(f"Could not find selected location '{selected_location}' in caption entries")
+                # Fall back to first entry
+                selected_entry = caption_entries[0] if caption_entries else None
+
+            # Convert caption entry to location format for compatibility
+            if selected_entry:
+                primary_location = {
+                    'location': selected_entry['location'],
+                    'type': selected_entry['type'],
+                    'confidence': 0.8,  # Default confidence for caption-based selection
+                    'reason': f"Selected from {selected_entry['type']} captions"
+                }

         except Exception as e:
             log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
-            # Fall back to highest confidence location
-            primary_location = sorted(locations,
-                                      key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                      reverse=True)[0] if locations else None
+            # Fall back to first caption entry
+            if caption_entries:
+                primary_location = {
+                    'location': caption_entries[0]['location'],
+                    'type': caption_entries[0]['type'],
+                    'confidence': 0.5,  # Lower confidence for fallback
+                    'reason': f"Fallback to first {caption_entries[0]['type']} caption"
+                }
+            else:
+                primary_location = None

         if not primary_location:
             log.warning(f"No location found for campaign {campaign.campaign_id}")
@@ -1587,6 +2010,97 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
     return []

 # --- 7.2 Page-based extraction helper ---------------------------------------
+def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
+    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        prompt: The prompt to send to Gemini
+        model: The Gemini model instance
+        context: Additional context for logging (e.g., "validation" or "extraction")
+
+    Returns:
+        The validated sequence or None if no consensus
+    """
+    sequences = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            response = model.generate_content(prompt)
+            result = _extract_text(response).strip()
+
+            # Parse the result to extract just the sequence
+            if result == "VALID":
+                sequences.append("VALID")
+            elif result == "UNCERTAIN":
+                sequences.append("UNCERTAIN")
+            elif result.startswith("M") and len(result) > 50:
+                # Clean the sequence
+                clean_seq = result.upper().replace(" ", "").replace("\n", "")
+                if all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in clean_seq):
+                    sequences.append(clean_seq)
+                else:
+                    sequences.append("INVALID")
+            else:
+                sequences.append("INVALID")
+
+            log.info(f"Gemini {context} attempt {attempt + 1}: {len(result) if result.startswith('M') else result}")
+
+        except Exception as e:
+            log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
+            sequences.append("ERROR")
+
+        # Check for early consensus after 2 attempts
+        if len(sequences) == 2:
+            # Clean sequences before comparison
+            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
+            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
+
+            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus reached after 2 attempts")
+                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
+            else:
+                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
+
+    # After all attempts, find consensus
+    valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
+
+    if not valid_sequences:
+        log.error(f"All {max_attempts} {context} attempts failed")
+        return None
+
+    # Find any matching pair
+    for i in range(len(sequences)):
+        for j in range(i + 1, len(sequences)):
+            # Clean sequences before comparison
+            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
+            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
+
+            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
+                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
+
+    # If no exact match, use adaptive validation
+    # Count occurrences of each valid sequence
+    sequence_counts = {}
+    for seq in valid_sequences:
+        if seq not in ["VALID", "UNCERTAIN"]:
+            # Clean sequence before counting
+            seq_clean = seq.replace(" ", "").replace("\n", "")
+            sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
+
+    # Return the most common sequence if it appears at least twice
+    if sequence_counts:
+        most_common = max(sequence_counts.items(), key=lambda x: x[1])
+        if most_common[1] >= 2:
+            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
+            return most_common[0]
+
+    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    return None
+
+
 def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
     """Validate and potentially correct a sequence using Gemini by checking against known mutations."""

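The consensus rule above is easiest to see on toy data. A standalone sketch (not the packaged function) of the core "accept a sequence only if at least two attempts agree" idea:

# Standalone sketch of the adaptive-consensus rule on toy replies:
def toy_consensus(replies: list[str]) -> str | None:
    cleaned = [r.replace(" ", "").replace("\n", "") for r in replies]
    counts: dict[str, int] = {}
    for s in cleaned:
        counts[s] = counts.get(s, 0) + 1
    best, n = max(counts.items(), key=lambda kv: kv[1])
    return best if n >= 2 else None  # require at least two matching attempts

assert toy_consensus(["MKV", "MKV", "MAV"]) == "MKV"
assert toy_consensus(["MKV", "MAV", "MGV"]) is None
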
@@ -1616,7 +2130,7 @@ def _validate_sequence_against_mutations(sequence: str, variants: List[Variant],
     if not local_issues:
         return None  # No obvious issues found

-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation")
+    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")

     prompt = f"""
 You are validating a protein sequence that was extracted from a scientific paper.
@@ -1641,26 +2155,14 @@ Return ONLY the corrected sequence if changes are needed, or "VALID" if no chang
 If you cannot determine the correct sequence, return "UNCERTAIN".
 """

-    try:
-        response = model.generate_content(prompt)
-        result = _extract_text(response).strip()
-
-        if result == "VALID":
-            return None  # No changes needed
-        elif result == "UNCERTAIN":
-            log.warning("Gemini could not validate sequence against mutations")
-            return None
-        elif result.startswith("M") and len(result) > 50:
-            # Gemini returned a corrected sequence
-            log.info(f"Gemini suggested sequence correction (length {len(result)})")
-            return result
-        else:
-            log.warning(f"Unexpected validation response: {result[:100]}...")
-            return None
-
-    except Exception as e:
-        log.warning(f"Failed to validate sequence: {e}")
-        return None
+    # Use triple validation
+    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
+
+    if result == "VALID" or result is None:
+        return None  # No changes needed
+    else:
+        log.info(f"Gemini suggested sequence correction (length {len(result)})")
+        return result


 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
@@ -1827,10 +2329,18 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
 - Extract the variant_id exactly as written where the sequence appears
 - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations

+SEQUENCE EXTRACTION RULES:
+- Copy sequences EXACTLY as they appear in the text
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids
+- Preserve the exact length and character sequence
+- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
+- Double-check that consecutive identical amino acids are copied correctly
+
 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq     - amino-acid sequence (uppercase), or null
-  * dna_seq    - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
+  * aa_seq     - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
+  * dna_seq    - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT

 Respond ONLY with **minified JSON** that matches the schema below.
 NO markdown, no code fences, no commentary.
@@ -1846,8 +2356,258 @@ TEXT (may be truncated):
 ```
 """.strip()

-def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
-    """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
+def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
+    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        model: The Gemini model instance
+        prompt: The prompt to send to Gemini
+        schema_hint: The JSON schema hint
+        debug_dir: Optional debug directory
+
+    Returns:
+        The validated sequence JSON data or None if no consensus
+    """
+    responses = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
+            resp = model.generate_content(prompt)
+            raw = _extract_text(resp).strip()
+
+            # Save debug info
+            if debug_dir:
+                debug_path = Path(debug_dir)
+                debug_path.mkdir(parents=True, exist_ok=True)
+                response_file = debug_path / f"sequences_attempt_{attempt + 1}_{int(time.time())}.txt"
+                with open(response_file, 'w') as f:
+                    f.write(f"=== SEQUENCE EXTRACTION ATTEMPT {attempt + 1} ===\n")
+                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                    f.write(f"Length: {len(raw)} characters\n")
+                    f.write("="*80 + "\n\n")
+                    f.write(raw)
+
+            # Parse JSON response (similar to generate_json_with_retry logic)
+            fence_re = re.compile(r"```json|```", re.I)
+            if raw.startswith("```"):
+                raw = fence_re.sub("", raw).strip()
+
+            # Try to parse as JSON
+            try:
+                parsed = json.loads(raw)
+            except json.JSONDecodeError:
+                # Look for JSON array or object in the response
+                json_start = -1
+                json_end = -1
+                bracket_stack = []
+                in_string = False
+                escape_next = False
+
+                for i, char in enumerate(raw):
+                    if escape_next:
+                        escape_next = False
+                        continue
+
+                    if char == '\\':
+                        escape_next = True
+                        continue
+
+                    if char == '"' and not escape_next:
+                        in_string = not in_string
+                        continue
+
+                    if in_string:
+                        continue
+
+                    if char in '[{':
+                        if json_start == -1:
+                            json_start = i
+                        bracket_stack.append(char)
+                    elif char in ']}':
+                        if bracket_stack:
+                            opening = bracket_stack.pop()
+                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
+                                if not bracket_stack:  # Found complete JSON
+                                    json_end = i + 1
+                                    break
+
+                if json_start >= 0 and json_end > json_start:
+                    json_str = raw[json_start:json_end]
+                    parsed = json.loads(json_str)
+                else:
+                    if '[]' in raw:
+                        parsed = []
+                    else:
+                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
+
+            # Store both the original and normalized response
+            normalized_response = _normalize_sequence_response(parsed)
+            responses.append((parsed, normalized_response))
+
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+
+        except Exception as e:
+            log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
+            responses.append(None)
+
+        # Check for early consensus after 2 attempts
+        if len(responses) == 2:
+            if (responses[0] and responses[1] and
+                _sequences_match(responses[0][1], responses[1][1])):
+                log.info("Sequence extraction consensus reached after 2 attempts")
+                return responses[0][0]  # Return original parsed data
+            else:
+                log.info("Sequence extraction mismatch after 2 attempts - trying third")
+
+    # After all attempts, use adaptive validation
+    valid_responses = [r for r in responses if r is not None]
+
+    if not valid_responses:
+        log.error(f"All {max_attempts} sequence extraction attempts failed")
+        return None
+
+    # First, try to find exact consensus (any matching pair)
+    for i in range(len(valid_responses)):
+        for j in range(i + 1, len(valid_responses)):
+            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
+                log.info(f"Sequence extraction consensus found: attempts with matching content")
+                return valid_responses[i][0]  # Return original parsed data
+
+    # If no exact consensus, use adaptive validation
+    log.info("No exact consensus found, applying adaptive validation...")
+
+    # Find sequences that appear consistently across multiple attempts
+    consistent_sequences = _find_consistent_sequences(valid_responses)
+
+    if consistent_sequences:
+        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
+        return consistent_sequences
+
+    # If still no consensus, use the attempt with the most sequences
+    best_response = max(valid_responses,
+                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
+
+    if best_response and len(best_response[1]) > 0:
+        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
+        return best_response[0]
+
+    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
+    return None
+
+
+def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
+    """Find sequences that appear consistently across multiple extraction attempts.
+
+    Args:
+        valid_responses: List of (original_data, normalized_data) tuples
+
+    Returns:
+        List of consistent sequences with confidence scores, or None if none found
+    """
+    if not valid_responses:
+        return None
+
+    # Count how many times each sequence appears
+    sequence_counts = {}
+    sequence_full_data = {}
+
+    for original, normalized in valid_responses:
+        if not isinstance(normalized, list):
+            continue
+
+        for seq in normalized:
+            variant_id = seq.get("variant_id", "")
+            aa_seq = seq.get("aa_seq", "")
+            # Clean sequence before using in key
+            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
+
+            # Create a unique key for this sequence
+            key = f"{variant_id}|{aa_seq_clean}"
+
+            if key not in sequence_counts:
+                sequence_counts[key] = 0
+                sequence_full_data[key] = []
+
+            sequence_counts[key] += 1
+
+            # Find the full data for this sequence from the original response
+            if isinstance(original, list):
+                for orig_seq in original:
+                    if (orig_seq.get("variant_id") == variant_id and
+                        orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
+                        sequence_full_data[key].append(orig_seq)
+                        break
+
+    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
+    min_appearances = max(2, len(valid_responses) // 2)
+    consistent_sequences = []
+
+    for key, count in sequence_counts.items():
+        if count >= min_appearances:
+            # Use the first occurrence of the full data
+            if sequence_full_data[key]:
+                seq_data = sequence_full_data[key][0].copy()
+                # Add confidence based on how many times it appeared
+                seq_data["confidence"] = count / len(valid_responses)
+                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
+                consistent_sequences.append(seq_data)
+
+    return consistent_sequences if consistent_sequences else None
+
+
+def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
+    """Normalize sequence response for comparison."""
+    if not isinstance(data, list):
+        return []
+
+    normalized = []
+    for item in data:
+        if isinstance(item, dict):
+            # Extract key fields for comparison
+            normalized_item = {
+                "variant_id": item.get("variant_id", ""),
+                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
+                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
+                "confidence": item.get("confidence", 0.0)
+            }
+            normalized.append(normalized_item)
+
+    # Sort by variant_id for consistent comparison
+    return sorted(normalized, key=lambda x: x["variant_id"])
+
+
+def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
+    """Check if two sequence response lists match on key fields."""
+    if len(seq1) != len(seq2):
+        return False
+
+    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
+        # Compare variant IDs
+        if s1.get("variant_id") != s2.get("variant_id"):
+            return False
+
+        # Compare amino acid sequences (most critical)
+        aa1 = s1.get("aa_seq", "")
+        aa2 = s2.get("aa_seq", "")
+        if aa1 and aa2 and aa1 != aa2:
+            return False
+        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
+            return False
+
+        # Compare DNA sequences if present
+        dna1 = s1.get("dna_seq", "")
+        dna2 = s2.get("dna_seq", "")
+        if dna1 and dna2 and dna1 != dna2:
+            return False
+
+    return True
+
+
+def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
+    """Prompt Gemini and convert its JSON reply into SequenceBlock objects with triple validation."""
     base_prompt = _SEQ_EXTRACTION_PROMPT.format(
         schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
     )
@@ -1864,8 +2624,50 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt

-    data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
-    return _parse_sequences(data)
+    # Add mutation validation context if we have lineage variants with mutations
+    if lineage_variants:
+        mutation_context = _build_mutation_validation_context(lineage_variants)
+        if mutation_context:
+            prompt = f"""{prompt}
+
+CRITICAL MUTATION VALIDATION:
+{mutation_context}
+
+IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
+For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
+"""
+
+    # Save the complete prompt for debugging
+    if debug_dir:
+        debug_path = Path(debug_dir)
+        debug_path.mkdir(parents=True, exist_ok=True)
+        prompt_file = debug_path / f"sequence_extraction_prompt_{int(time.time())}.txt"
+        with open(prompt_file, 'w') as f:
+            f.write(f"=== SEQUENCE EXTRACTION PROMPT ===\n")
+            f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(f"Text length: {len(text)} characters\n")
+            f.write(f"Truncated to: {len(text[:MAX_CHARS])} characters\n")
+            f.write(f"Total prompt length: {len(prompt)} characters\n")
+            f.write("="*80 + "\n\n")
+            f.write(prompt)
+        log.info(f"Saved sequence extraction prompt to {prompt_file}")
+
+    # Use triple validation for sequence extraction
+    log.info("Extracting sequences with triple validation to ensure accuracy")
+    data = _extract_sequences_with_triple_validation(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir)
+
+    if not data:
+        log.warning("Failed to get consistent sequence extraction after triple validation")
+        return []
+
+    extracted_sequences = _parse_sequences(data)
+
+    # Post-process: validate sequences against mutations if we have lineage info
+    if lineage_variants:
+        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
+        return validated_sequences
+
+    return extracted_sequences

 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
 _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codon
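A hedged sketch of calling the extended `extract_sequences` signature above; the `model` client and `variants` list are assumed to come from earlier pipeline stages, not shown here:

# Hypothetical call site; `variants` would come from get_lineage().
blocks = extract_sequences(full_text, model, debug_dir="debug", lineage_variants=variants)
for b in blocks:
    log.info("%s: %d aa", b.variant_id, len(b.aa_seq or ""))
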
@@ -1916,6 +2718,167 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
     )
     return blocks

+def _build_mutation_validation_context(lineage_variants: List[Variant]) -> str:
+    """Build mutation context for sequence validation."""
+    mutation_info = []
+
+    for variant in lineage_variants:
+        if variant.mutations and variant.parent_id:
+            mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+            mutation_info.append(f"Variant '{variant.variant_id}' (parent: '{variant.parent_id}') has mutations: {mutations_str}")
+
+    if not mutation_info:
+        return ""
+
+    context = "Known mutation relationships:\n" + "\n".join(mutation_info[:10])  # Limit to first 10 for context
+    if len(mutation_info) > 10:
+        context += f"\n... and {len(mutation_info) - 10} more variants with mutations"
+
+    return context
+
+def _validate_sequences_against_mutations(sequences: List[SequenceBlock], lineage_variants: List[Variant], model, debug_dir: str | Path | None = None) -> List[SequenceBlock]:
+    """Validate extracted sequences against known mutations and fix inconsistencies."""
+    # Create lookups for easier access
+    seq_lookup = {seq.variant_id: seq for seq in sequences}
+    variant_lookup = {var.variant_id: var for var in lineage_variants}
+
+    validation_issues = []
+    corrected_sequences = []
+
+    for seq in sequences:
+        variant = variant_lookup.get(seq.variant_id)
+        if not variant or not variant.parent_id or not variant.mutations or not seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        parent_seq = seq_lookup.get(variant.parent_id)
+        if not parent_seq or not parent_seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        # Check if mutations are consistent
+        issues = _check_mutation_consistency(seq.aa_seq, parent_seq.aa_seq, variant.mutations, seq.variant_id, variant.parent_id)
+
+        if issues:
+            validation_issues.extend(issues)
+            log.warning(f"Sequence validation issues for {seq.variant_id}: {'; '.join(issues)}")
+
+            # Try to get corrected sequence from Gemini
+            corrected_seq = _get_corrected_sequence_from_gemini(seq, parent_seq, variant, issues, model, debug_dir)
+            if corrected_seq:
+                corrected_sequences.append(corrected_seq)
+                log.info(f"Corrected sequence for {seq.variant_id} using Gemini validation")
+            else:
+                corrected_sequences.append(seq)  # Keep original if correction fails
+        else:
+            corrected_sequences.append(seq)
+
+    if validation_issues:
+        log.warning(f"Found {len(validation_issues)} sequence validation issues across {len([s for s in sequences if s.variant_id in [v.variant_id for v in lineage_variants if v.mutations]])} variants with mutations")
+
+    return corrected_sequences
+
+def _check_mutation_consistency(child_seq: str, parent_seq: str, mutations, child_id: str, parent_id: str) -> List[str]:
+    """Check if mutations are consistent between parent and child sequences."""
+    import re
+
+    issues = []
+
+    # Parse mutations (handle both string and list formats)
+    if isinstance(mutations, list):
+        mutation_strs = mutations
+    else:
+        mutation_strs = [m.strip() for m in str(mutations).split(',') if m.strip()]
+
+    for mut_str in mutation_strs:
+        # Parse mutation like "A100V"
+        match = re.match(r'^([A-Z])(\d+)([A-Z])$', mut_str.strip())
+        if not match:
+            continue  # Skip non-standard mutation formats
+
+        orig_aa, pos_str, new_aa = match.groups()
+        pos = int(pos_str) - 1  # Convert to 0-based indexing
+
+        # Check bounds
+        if pos >= len(parent_seq) or pos >= len(child_seq):
+            issues.append(f"Mutation {mut_str} position out of bounds")
+            continue
+
+        # Check parent sequence has expected original amino acid
+        if parent_seq[pos] != orig_aa:
+            issues.append(f"Mutation {mut_str}: parent {parent_id} has {parent_seq[pos]} at position {pos+1}, expected {orig_aa}")
+
+        # Check child sequence has expected new amino acid
+        if child_seq[pos] != new_aa:
+            issues.append(f"Mutation {mut_str}: child {child_id} has {child_seq[pos]} at position {pos+1}, expected {new_aa}")
+
+    return issues
+
+def _get_corrected_sequence_from_gemini(seq: SequenceBlock, parent_seq: SequenceBlock, variant: Variant, issues: List[str], model, debug_dir: str | Path | None = None) -> SequenceBlock | None:
+    """Use Gemini to get a corrected sequence based on mutation validation issues."""
+    if not model:
+        return None
+
+    mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+    issues_str = "; ".join(issues)
+
+    prompt = f"""You extracted a sequence for variant "{seq.variant_id}" but there are mutation validation issues:
+
+ISSUES: {issues_str}
+
+PARENT SEQUENCE ({variant.parent_id}):
+{parent_seq.aa_seq}
+
+EXTRACTED SEQUENCE ({seq.variant_id}):
+{seq.aa_seq}
+
+EXPECTED MUTATIONS: {mutations_str}
+
+Based on the parent sequence and the expected mutations, provide the CORRECT sequence for {seq.variant_id}.
+Apply each mutation to the parent sequence in order.
+
+For example, if parent has "A" at position 100 and mutation is "A100V", then child should have "V" at position 100.
+
+IMPORTANT SEQUENCE RULES:
+- Copy the sequence EXACTLY - do not add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Preserve the exact length of the sequence
+- Only change the specific positions indicated by the mutations
+- Double-check that consecutive identical amino acids are copied correctly
+
+Return ONLY the corrected amino acid sequence (no explanation, no formatting).
+If you cannot determine the correct sequence, return "UNCERTAIN".
+"""
+
+    try:
+        if debug_dir:
+            import time
+            timestamp = int(time.time())
+            prompt_file = Path(debug_dir) / f"sequence_validation_{seq.variant_id}_{timestamp}.txt"
+            _dump(prompt, prompt_file)
+
+        # Use triple validation for sequence correction
+        log.info(f"Correcting sequence for {seq.variant_id} with triple validation")
+        corrected_seq = _extract_plain_sequence_with_triple_validation(prompt, model, f"correction for {seq.variant_id}")
+
+        if debug_dir and corrected_seq:
+            response_file = Path(debug_dir) / f"sequence_validation_response_{seq.variant_id}_{timestamp}.txt"
+            _dump(corrected_seq, response_file)
+
+        if corrected_seq and corrected_seq not in ["UNCERTAIN", "VALID"] and _clean_seq(corrected_seq, _VALID_AA):
+            return SequenceBlock(
+                variant_id=seq.variant_id,
+                aa_seq=corrected_seq,
+                dna_seq=seq.dna_seq,
+                confidence=0.8,  # Lower confidence for corrected sequences
+                truncated=seq.truncated
+            )
+
+    except Exception as e:
+        log.warning(f"Failed to get corrected sequence for {seq.variant_id}: {e}")
+
+    return None
+
 # --- 7.5 Convenience wrapper -------------------------------------------------
 def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
     # Phase 1: Identify where sequences might be located
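A worked toy example of the "A100V"-style consistency rule enforced above (sequences invented; positions in the mutation string are 1-based):

# Toy check mirroring _check_mutation_consistency for mutation "A3V":
parent = "MKAV"   # 'A' at 1-based position 3
child = "MKVV"    # 'V' at 1-based position 3
pos = 3 - 1       # convert to 0-based indexing, as the function does
assert parent[pos] == "A" and child[pos] == "V"   # consistent: no issue reported
# If child were "MKGV", the check would flag: child has 'G' at position 3, expected 'V'.
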
@@ -1973,6 +2936,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:

         # Fallback to text search if page extraction didn't work
         if not focused_text:
+            log.info("Page extraction did not return text, falling back to text search")
             focused_text = _extract_text_at_locations(
                 text, [best_location],
                 context_chars=max(min_length, 30000),
@@ -1982,6 +2946,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
         if focused_text and len(focused_text) < len(text):
             log.info("Reduced text from %d to %d chars using validated location",
                      len(text), len(focused_text))
+        else:
+            log.warning("Failed to reduce text size - focused_text length: %d, full text length: %d",
+                        len(focused_text) if focused_text else 0, len(text))
         # Build lineage context if available
         lineage_context = None
         if lineage_variants:
@@ -1993,7 +2960,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                 variant_info.append(info)
             lineage_context = "\n".join(variant_info)

-        return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+        return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
     else:
         log.warning("Location validation failed or returned invalid location: %s",
                     validation.get("reason", "Unknown"))
@@ -2011,7 +2978,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
             variant_info.append(info)
         lineage_context = "\n".join(variant_info)

-    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)

 # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
 """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -2077,6 +3044,7 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
         log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
         return {}

+
 def extract_enzyme_info_with_gemini(
     text: str,
     variants: List[Variant],
@@ -2146,7 +3114,7 @@ If you cannot determine certain fields, set them to null.
             # Validate it looks like a protein sequence
             if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
                 # Sanity check the sequence against known mutations
-                validated_seq = _validate_sequence_against_mutations(seq, variants, lineage_text, model)
+                validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
                 if validated_seq:
                     seq = validated_seq
                     log.info(f"Sequence validated and potentially corrected by Gemini")
@@ -2714,7 +3682,7 @@ def run_pipeline(

     # 1. Prepare raw text ------------------------------------------------------
     # Always load both caption text (for identification) and full text (for extraction)
-    pdf_paths = [p for p in (si_path, manuscript) if p]
+    pdf_paths = [p for p in (manuscript, si_path) if p]
     caption_text = limited_caption_concat(*pdf_paths)
     full_text = limited_concat(*pdf_paths)