debase 0.1.11__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -589,17 +589,28 @@ TEXT:
  {text}
  """.strip()

- _CAMPAIGN_MAPPING_PROMPT = """
- Given these identified campaigns and the lineage data location, determine which campaign this data belongs to:
+ _CAMPAIGN_BEST_LOCATION_PROMPT = """
+ Given this specific campaign and the available data locations, select the BEST location to extract the complete lineage data for this campaign.

- Campaigns:
- {campaigns}
+ Campaign:
+ - ID: {campaign_id}
+ - Name: {campaign_name}
+ - Description: {description}
+ - Lineage identifiers: {identifiers}

- Data location: {location}
- Caption/context: {context}
+ Available locations with context:
+ {locations_with_context}

- Based on the caption, enzyme names, or reaction details, which campaign does this data belong to?
- Return ONLY the campaign_id as a string.
+ Select the location that most likely contains the COMPLETE lineage data (all variants, mutations, and parent relationships) for THIS SPECIFIC campaign.
+
+ Consider:
+ 1. Tables are usually more structured and complete than figures
+ 2. Look for locations that mention this campaign's specific identifiers or enzyme names
+ 3. Some locations may contain data for multiple campaigns - that's fine, we can filter later
+ 4. Prioritize completeness over visual clarity
+
+ Return a JSON object with:
+ {{"location": "selected location identifier", "confidence": 0-100, "reason": "explanation"}}
  """.strip()

  # ---- 6.1 Prompt templates -------------------------------------------------
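Note: the doubled braces in the new template are how str.format() emits literal JSON braces, so the model is shown the exact reply shape it should produce. A minimal standalone sketch (the template string here is a stand-in, not the module's):

    import json

    # Stand-in template: {campaign_id} is substituted, {{...}} survives as literal braces
    TEMPLATE = 'Campaign: {campaign_id}\nReturn a JSON object with:\n{{"location": "...", "confidence": 0-100, "reason": "..."}}'
    prompt = TEMPLATE.format(campaign_id="pgb_evolution")  # hypothetical campaign ID

    # A well-formed reply then parses directly:
    reply = '{"location": "Table S4", "confidence": 85, "reason": "lists all variants"}'
    data = json.loads(reply)
    assert 0 <= data["confidence"] <= 100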
@@ -756,9 +767,43 @@ def identify_evolution_locations(
      max_results: int = 5,
      debug_dir: str | Path | None = None,
      campaigns: Optional[List[Campaign]] = None,
+     pdf_paths: Optional[List[Path]] = None,
  ) -> List[dict]:
      """Ask Gemini where in the paper the lineage is probably described."""
-     prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + text[:15_000]
+     # Extract table of contents from PDFs if available
+     toc_text = ""
+     if pdf_paths:
+         toc_sections = []
+         for pdf_path in pdf_paths:
+             # Extract first few pages looking for TOC
+             doc = _open_doc(pdf_path)
+             try:
+                 for page_num in range(min(5, len(doc))):
+                     page_text = doc[page_num].get_text()
+                     if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                         # Found TOC page
+                         lines = page_text.split('\n')
+                         toc_lines = []
+                         for line in lines:
+                             line = line.strip()
+                             # TOC entries typically have page numbers
+                             if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                 re.search(r'\s{2,}S?\d+\s*$', line) or
+                                 re.match(r'^\d+\.\s+\w+', line)):
+                                 toc_lines.append(line)
+                         if toc_lines:
+                             pdf_name = pdf_path.name
+                             toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                             break
+             finally:
+                 doc.close()
+
+         if toc_sections:
+             toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+     # Include TOC before the main text
+     combined_text = toc_text + text if toc_text else text
+     prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
      locs: List[dict] = []
      try:
          locs = generate_json_with_retry(
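Note: the three regexes above cover the common TOC line shapes — dot leaders, wide spacing before a trailing (possibly S-prefixed) page number, and numbered section headings. A standalone check with made-up lines:

    import re

    samples = [
        "Table S1. Primer sequences......S3",  # dot leader + SI page number
        "General procedures      S2",          # two or more spaces before the page number
        "3. Results and discussion",           # numbered section heading
        "This sentence is ordinary prose.",    # should not match
    ]
    for line in samples:
        is_toc = bool(re.search(r'\.{2,}\s*S?\d+\s*$', line)
                      or re.search(r'\s{2,}S?\d+\s*$', line)
                      or re.match(r'^\d+\.\s+\w+', line))
        print(f"{is_toc!s:5} {line}")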
@@ -770,69 +815,7 @@ def identify_evolution_locations(
      except Exception as exc: # pragma: no cover
          log.warning("identify_evolution_locations(): %s", exc)

-     # If we have campaigns, try to map locations to campaigns
-     if campaigns and locs:
-         for loc in locs:
-             # Extract more context around the location
-             location_str = loc.get('location', '')
-             context = loc.get('reason', '')
-
-             # Ask Gemini to map this location to a campaign
-             if campaigns:
-                 try:
-                     campaigns_json = json.dumps([{
-                         "campaign_id": c.campaign_id,
-                         "campaign_name": c.campaign_name,
-                         "lineage_hint": c.notes
-                     } for c in campaigns])
-
-                     mapping_prompt = _CAMPAIGN_MAPPING_PROMPT.format(
-                         campaigns=campaigns_json,
-                         location=location_str,
-                         context=context
-                     )
-
-                     # Save mapping prompt to debug if provided
-                     if debug_dir:
-                         debug_path = Path(debug_dir)
-                         debug_path.mkdir(parents=True, exist_ok=True)
-                         mapping_file = debug_path / f"campaign_mapping_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                         _dump(f"=== CAMPAIGN MAPPING PROMPT ===\nLocation: {location_str}\n{'='*80}\n\n{mapping_prompt}", mapping_file)
-
-                     response = model.generate_content(mapping_prompt)
-                     response_text = _extract_text(response).strip()
-
-                     # Extract just the campaign_id from the response
-                     # Look for the campaign_id pattern in the response
-                     campaign_id = None
-                     for campaign in campaigns:
-                         if hasattr(campaign, 'campaign_id') and campaign.campaign_id in response_text:
-                             campaign_id = campaign.campaign_id
-                             break
-
-                     # If not found, try to extract the last line or quoted string
-                     if not campaign_id:
-                         # Try to find quoted string
-                         quoted_match = re.search(r'"([^"]+)"', response_text)
-                         if quoted_match:
-                             campaign_id = quoted_match.group(1)
-                         else:
-                             # Take the last non-empty line
-                             lines = [line.strip() for line in response_text.split('\n') if line.strip()]
-                             if lines:
-                                 campaign_id = lines[-1].strip('"')
-
-                     # Save mapping response to debug if provided
-                     if debug_dir:
-                         response_file = debug_path / f"campaign_mapping_response_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                         _dump(f"=== CAMPAIGN MAPPING RESPONSE ===\nLocation: {location_str}\nFull response:\n{response_text}\nExtracted campaign_id: {campaign_id}\n{'='*80}", response_file)
-
-                     # Add campaign_id to location
-                     if campaign_id:
-                         loc['campaign_id'] = campaign_id
-                         log.info(f"Mapped {location_str} to campaign: {campaign_id}")
-                 except Exception as exc:
-                     log.warning(f"Failed to map location to campaign: {exc}")
+     # No longer mapping locations to campaigns here - we'll ask for best location per campaign instead

      return locs if isinstance(locs, list) else []

@@ -878,6 +861,7 @@ def extract_complete_lineage(
      debug_dir: str | Path | None = None,
      campaign_id: Optional[str] = None,
      campaign_info: Optional[Campaign] = None,
+     pdf_paths: Optional[List[Path]] = None,
  ) -> List[Variant]:
      """Prompt Gemini for the full lineage and return a list[Variant]."""
      # Build campaign context
@@ -899,10 +883,44 @@ IMPORTANT:
  4. Include parent variants only if they are direct ancestors in this campaign's lineage.
  """

+     # Extract table of contents from PDFs if available
+     toc_text = ""
+     if pdf_paths:
+         toc_sections = []
+         for pdf_path in pdf_paths:
+             # Extract first few pages looking for TOC
+             doc = _open_doc(pdf_path)
+             try:
+                 for page_num in range(min(5, len(doc))):
+                     page_text = doc[page_num].get_text()
+                     if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                         # Found TOC page
+                         lines = page_text.split('\n')
+                         toc_lines = []
+                         for line in lines:
+                             line = line.strip()
+                             # TOC entries typically have page numbers
+                             if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                 re.search(r'\s{2,}S?\d+\s*$', line) or
+                                 re.match(r'^\d+\.\s+\w+', line)):
+                                 toc_lines.append(line)
+                         if toc_lines:
+                             pdf_name = pdf_path.name
+                             toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                             break
+             finally:
+                 doc.close()
+
+         if toc_sections:
+             toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+     # Include TOC in the prompt text
+     combined_text = toc_text + text if toc_text else text
+
      prompt = _LINEAGE_EXTRACT_PROMPT.format(
          campaign_context=campaign_context,
          schema=_LINEAGE_SCHEMA_HINT,
-         text=text[:MAX_CHARS],
+         text=combined_text[:MAX_CHARS],
      )
      raw = generate_json_with_retry(
          model,
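Note: this TOC-scanning block is now duplicated verbatim in identify_evolution_locations and extract_complete_lineage; a shared helper would keep the two in sync. A sketch under the assumption that _open_doc (the module's own PDF opener) returns a PyMuPDF-style document; the helper name is hypothetical:

    import re
    from pathlib import Path
    from typing import List

    def _extract_toc_text(pdf_paths: List[Path], max_pages: int = 5) -> str:
        """Hypothetical shared helper: collect TOC-looking lines from the
        first pages of each PDF, mirroring the duplicated block above."""
        toc_sections = []
        for pdf_path in pdf_paths:
            doc = _open_doc(pdf_path)  # assumed: the module's PyMuPDF opener
            try:
                for page_num in range(min(max_pages, len(doc))):
                    page_text = doc[page_num].get_text()
                    if any(ind in page_text.lower() for ind in ('table of contents', 'contents', 'summary')):
                        toc_lines = [
                            ln.strip() for ln in page_text.split('\n')
                            if re.search(r'\.{2,}\s*S?\d+\s*$', ln.strip())
                            or re.search(r'\s{2,}S?\d+\s*$', ln.strip())
                            or re.match(r'^\d+\.\s+\w+', ln.strip())
                        ]
                        if toc_lines:
                            toc_sections.append(f"\n--- Table of Contents from {pdf_path.name} ---\n" + '\n'.join(toc_lines))
                            break
            finally:
                doc.close()
        return "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n" if toc_sections else ""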
@@ -1044,15 +1062,27 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
      # 1. Line contains dots (...) followed by page number
      # 2. Line ends with just a page number
      # 3. Line has "Table S12:" or similar followed by title and page
-     if '...' in line or re.search(r'\.\s*\d+\s*$', line) or re.search(r':\s*[^:]+\s+\d+\s*$', line):
+     # 4. Pattern appears at start of line followed by description and page number
+     if ('...' in line or
+         re.search(r'\.\s*\d+\s*$', line) or
+         re.search(r':\s*[^:]+\s+\d+\s*$', line) or
+         (line.strip().startswith(pattern) and re.search(r'\s+\d+\s*$', line))):
          return True

      # Check if this is in a contents/TOC section
-     # Look backwards up to 500 chars for "Contents" or "Table of Contents"
-     context_start = max(0, position - 500)
+     # Look backwards up to 1000 chars for "Contents" or "Table of Contents"
+     context_start = max(0, position - 1000)
      context = text[context_start:position].lower()
      if 'contents' in context or 'table of contents' in context:
          return True
+
+     # Check if we're in the first ~5000 chars of the document (likely TOC area)
+     # This helps catch TOC entries that don't have obvious formatting
+     if position < 5000:
+         # Be more strict for early document positions
+         # Check if line looks like a TOC entry (has page number at end)
+         if re.search(r'\s+\d+\s*$', line):
+             return True

      return False

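Note: the new position-based guard is deliberately aggressive — within the first ~5000 characters, a trailing bare number alone now classifies a line as a TOC entry. Isolated for illustration:

    import re

    def looks_like_early_toc(line: str, position: int) -> bool:
        # Sketch of the added guard: early in the document, a trailing
        # page number is enough to flag the line as a TOC entry.
        return position < 5000 and bool(re.search(r'\s+\d+\s*$', line))

    print(looks_like_early_toc("Table S3. Evolution lineage 12", 1800))   # True: early, trailing number
    print(looks_like_early_toc("Table S3. Evolution lineage 12", 60000))  # False: same line, deep in the text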
@@ -1185,13 +1215,39 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
                  log.warning("No sequences found in any of %d occurrences of '%s'",
                              len(all_positions), location_str)
          else:
-             # For lineage extraction, use the original logic
-             start = max(0, pos - context_chars)
-             end = min(len(text), pos + len(used_pattern) + context_chars)
-             section_text = text[start:end]
-             extracted_sections.append(section_text)
-             log.info("Found '%s' using pattern '%s' at position %d, extracted %d chars",
-                      location_str, used_pattern, pos, len(section_text))
+             # For lineage extraction, find ALL occurrences of the pattern
+             all_positions = []
+             search_pos = 0
+
+             # Find all occurrences of this pattern (not just the first)
+             while search_pos < len(text_lower):
+                 temp_pos = text_lower.find(used_pattern.lower(), search_pos)
+                 if temp_pos == -1:
+                     break
+
+                 # Check if this is a TOC entry
+                 if _is_toc_entry(text, temp_pos, used_pattern):
+                     log.debug("Skipping TOC entry for pattern '%s' at position %d", used_pattern, temp_pos)
+                     search_pos = temp_pos + len(used_pattern)
+                     continue
+
+                 all_positions.append(temp_pos)
+                 search_pos = temp_pos + len(used_pattern)
+
+                 if len(all_positions) >= 10: # Limit to 10 occurrences
+                     break
+
+             log.info("Found %d non-TOC occurrences of pattern '%s' for location '%s'",
+                      len(all_positions), used_pattern, location_str)
+
+             # Extract context around each occurrence
+             for idx, pos in enumerate(all_positions):
+                 start = max(0, pos - context_chars)
+                 end = min(len(text), pos + len(used_pattern) + context_chars)
+                 section_text = text[start:end]
+                 extracted_sections.append(section_text)
+                 log.info("Occurrence %d/%d: Found '%s' at position %d, extracted %d chars",
+                          idx + 1, len(all_positions), location_str, pos, len(section_text))
      else:
          log.warning("Location '%s' not found in text (tried %d patterns)", location_str, len(page_patterns))

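Note: the scan is a standard str.find() cursor walk capped at ten non-TOC hits; the same logic reads naturally as a generator. A sketch that assumes the module's _is_toc_entry is in scope:

    from typing import Iterator

    def iter_occurrences(text: str, pattern: str, limit: int = 10) -> Iterator[int]:
        """Yield start offsets of case-insensitive, non-TOC matches, capped at `limit`."""
        text_lower, needle = text.lower(), pattern.lower()
        pos, found = 0, 0
        while found < limit:
            pos = text_lower.find(needle, pos)
            if pos == -1:
                break
            if not _is_toc_entry(text, pos, pattern):  # module's own TOC filter
                yield pos
                found += 1
            pos += len(needle)  # advance past this hit either way

    # e.g. positions = list(iter_occurrences(full_text, "Table S4"))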
@@ -1229,41 +1285,113 @@ def get_lineage(
          log.info(f" - {camp.campaign_name}: {camp.description}")

      # Use captions for identification - they're concise and focused
-     locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=campaigns)
+     locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)

      all_variants = []

-     if locations:
+     if locations and campaigns:
          # Log location information
          location_summary = []
          for loc in locations[:5]:
              if isinstance(loc, dict):
-                 campaign_info = f", campaign: {loc.get('campaign_id', 'unknown')}" if 'campaign_id' in loc else ""
-                 location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)}{campaign_info})")
+                 location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
              else:
                  location_summary.append(str(loc))
          log.info("Gemini identified %d potential lineage locations: %s",
                   len(locations), ", ".join(location_summary))

-         # Group locations by campaign
-         locations_by_campaign = {}
+         # Extract context around each location for better decision making
+         locations_with_context = []
          for loc in locations:
-             campaign_id = loc.get('campaign_id', 'default') if isinstance(loc, dict) else 'default'
-             if campaign_id not in locations_by_campaign:
-                 locations_by_campaign[campaign_id] = []
-             locations_by_campaign[campaign_id].append(loc)
+             location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+             # Extract 1000 chars of context around the location
+             context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
+             locations_with_context.append({
+                 'location': loc,
+                 'context': context_text[:1000] # First 1000 chars of extracted context
+             })

-         # Process each campaign's locations
-         for campaign_id, campaign_locations in locations_by_campaign.items():
-             log.info(f"Processing campaign: {campaign_id}")
+         # For each campaign, ask Gemini to select the best location
+         for campaign in campaigns:
+             log.info(f"Processing campaign: {campaign.campaign_id}")

-             # Sort locations by confidence to get the highest confidence one
-             sorted_locations = sorted(campaign_locations,
-                                       key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                       reverse=True)
+             # Build locations context string
+             locations_str = ""
+             for i, loc_ctx in enumerate(locations_with_context):
+                 loc = loc_ctx['location']
+                 context = loc_ctx['context']
+                 location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                 location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
+                 confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
+                 reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+
+                 locations_str += f"\n{i+1}. {location_str} (Type: {location_type}, Confidence: {confidence})\n"
+                 locations_str += f" Reason: {reason}\n"
+                 locations_str += f" Context (first 500 chars):\n {context[:500]}...\n"

-             # Use only the highest confidence location to avoid duplicates
-             primary_location = sorted_locations[0] if sorted_locations else None
+             # Ask Gemini to select best location for this campaign
+             best_location_prompt = _CAMPAIGN_BEST_LOCATION_PROMPT.format(
+                 campaign_id=campaign.campaign_id,
+                 campaign_name=campaign.campaign_name,
+                 description=campaign.description,
+                 identifiers=campaign.notes or "No specific identifiers provided",
+                 locations_with_context=locations_str
+             )
+
+             primary_location = None
+             try:
+                 # Save prompt to debug if provided
+                 if debug_dir:
+                     debug_path = Path(debug_dir)
+                     debug_path.mkdir(parents=True, exist_ok=True)
+                     prompt_file = debug_path / f"best_location_{campaign.campaign_id}_{int(time.time())}.txt"
+                     _dump(f"=== BEST LOCATION PROMPT ===\nCampaign: {campaign.campaign_id}\n{'='*80}\n\n{best_location_prompt}", prompt_file)
+
+                 response = model.generate_content(best_location_prompt)
+                 response_text = _extract_text(response).strip()
+
+                 # Parse JSON response
+                 if response_text.startswith("```"):
+                     response_text = response_text.split("```")[1].strip()
+                     if response_text.startswith("json"):
+                         response_text = response_text[4:].strip()
+
+                 best_loc_data = json.loads(response_text)
+                 selected_location = best_loc_data.get('location', '')
+                 confidence = best_loc_data.get('confidence', 0)
+                 reason = best_loc_data.get('reason', '')
+
+                 # Save response to debug if provided
+                 if debug_dir:
+                     response_file = debug_path / f"best_location_response_{campaign.campaign_id}_{int(time.time())}.txt"
+                     _dump(f"=== BEST LOCATION RESPONSE ===\nCampaign: {campaign.campaign_id}\nSelected: {selected_location}\nConfidence: {confidence}\nReason: {reason}\n{'='*80}", response_file)
+
+                 log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")
+
+                 # Find the actual location object
+                 for loc in locations:
+                     loc_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                     if loc_str == selected_location:
+                         primary_location = loc
+                         break
+
+                 if not primary_location:
+                     log.warning(f"Could not find selected location '{selected_location}' in locations list")
+                     # Fall back to highest confidence location
+                     primary_location = sorted(locations,
+                                               key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                               reverse=True)[0] if locations else None
+
+             except Exception as e:
+                 log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
+                 # Fall back to highest confidence location
+                 primary_location = sorted(locations,
+                                           key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                           reverse=True)[0] if locations else None
+
+             if not primary_location:
+                 log.warning(f"No location found for campaign {campaign.campaign_id}")
+                 continue

              # Track if we successfully extracted from figure
              extracted_from_figure = False
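Note: the fence-stripping-then-json.loads sequence is the same pattern this module already uses for variant matching; pulled out on its own it is only a few lines (a sketch, with a hypothetical helper name):

    import json

    def parse_json_reply(reply: str) -> dict:
        """Sketch of the fence-tolerant parsing used above: strip a Markdown
        code fence and an optional 'json' language tag, then parse."""
        text = reply.strip()
        if text.startswith("```"):
            text = text.split("```")[1].strip()  # keeps the fenced body, drops both fences
            if text.startswith("json"):
                text = text[4:].strip()
        return json.loads(text)

    print(parse_json_reply('```json\n{"location": "Table S4", "confidence": 90, "reason": "complete"}\n```'))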
@@ -1297,12 +1425,11 @@ def get_lineage(
                      log.info("Saved lineage figure to: %s", figure_file)

                  # Extract lineage from the figure
-                 campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
                  variants = extract_lineage_from_figure(
                      figure_bytes, model,
                      debug_dir=debug_dir,
-                     campaign_id=campaign_id,
-                     campaign_info=campaign_obj
+                     campaign_id=campaign.campaign_id,
+                     campaign_info=campaign
                  )
                  if variants:
                      all_variants.extend(variants)
@@ -1327,22 +1454,22 @@ def get_lineage(
                  log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                           len(full_text), len(focused_text),
                           primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
-                          campaign_id)
+                          campaign.campaign_id)

-             # Find the campaign object
-             campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
+             # Extract lineage for this campaign
              campaign_variants = extract_complete_lineage(
                  focused_text, model,
                  debug_dir=debug_dir,
-                 campaign_id=campaign_id,
-                 campaign_info=campaign_obj
+                 campaign_id=campaign.campaign_id,
+                 campaign_info=campaign,
+                 pdf_paths=pdf_paths
              )
              all_variants.extend(campaign_variants)

          return all_variants, campaigns
      else:
          log.info("Gemini did not identify specific lineage locations")
-         variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir)
+         variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir, pdf_paths=pdf_paths)
          return variants, campaigns

  # === 7. SEQUENCE EXTRACTION === ----------------------------------------------
@@ -1398,18 +1525,31 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
      return []

  # --- 7.2 Page-based extraction helper ---------------------------------------
- def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int]) -> str:
-     """Extract text from a specific page number in the PDFs."""
+ def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
+     """Extract text from a specific page number in the PDFs.
+
+     Args:
+         pdf_paths: List of PDF paths
+         page_num: Page number (can be "S1", "S2", etc for SI pages)
+         skip_si_toc: If True, skip first 2 pages of SI to avoid TOC
+     """
      # Convert page number to int and handle S-prefix
      page_str = str(page_num).strip().upper()
      if page_str.startswith('S'):
          # Supplementary page - look in the SI PDF (second PDF)
          actual_page = int(page_str[1:]) - 1 # 0-indexed
          pdf_index = 1 if len(pdf_paths) > 1 else 0
+         is_si_page = True
      else:
          # Regular page - look in the main PDF
          actual_page = int(page_str) - 1 # 0-indexed
          pdf_index = 0
+         is_si_page = False
+
+     # Skip first 2 pages of SI to avoid table of contents
+     if skip_si_toc and is_si_page and actual_page < 2:
+         log.info("Skipping SI page %s (first 2 pages are typically TOC)", page_str)
+         return ""

      if pdf_index >= len(pdf_paths):
          log.warning("Page %s requested but not enough PDFs provided", page_str)
@@ -1543,8 +1683,14 @@ IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
  - Only extract dna_seq if NO amino acid sequence is available for that variant
  - This reduces redundancy since protein sequences are usually more relevant

+ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
+ - Papers often use different naming conventions in different sections
+ - DO NOT normalize or simplify variant IDs
+ - Extract the variant_id exactly as written where the sequence appears
+ - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
+
  For each variant return:
- * variant_id - the label used in the paper (e.g. "R4-10")
+ * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
  * aa_seq - amino-acid sequence (uppercase), or null
  * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)

@@ -1584,7 +1730,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
      return _parse_sequences(data)

  # --- 7.4 JSON -> dataclass helpers -------------------------------------------
- _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY")
+ _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*") # Include * for stop codon
  _VALID_DNA = set("ACGT")

  def _contains_sequence(text: str, min_length: int = 50) -> bool:
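Note: with '*' admitted to _VALID_AA, validating a protein sequence stays a one-line set comparison. A minimal sketch:

    _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # as above: 20 amino acids plus stop

    def is_valid_aa_seq(seq: str) -> bool:
        """Sketch: accept uppercase protein sequences, including a stop codon '*'."""
        return bool(seq) and set(seq.upper()) <= _VALID_AA

    print(is_valid_aa_seq("MSTNPKPQRKTKRNTNRRPQDVK*"))  # True, trailing stop allowed
    print(is_valid_aa_seq("MST1NPK"))                   # False, digits rejected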
@@ -1974,43 +2120,53 @@ def _merge_lineage_and_sequences(
          }
          for s in seqs
      ])
+
+     # Log sequence data info
+     if len(df_seq) > 0:
+         seq_with_aa = (~df_seq['aa_seq'].isna()).sum()
+         seq_with_dna = (~df_seq['dna_seq'].isna()).sum()
+         log.info(f"Sequence data: {len(df_seq)} entries, {seq_with_aa} with aa_seq, {seq_with_dna} with dna_seq")

-     # 2. Outer merge keeps every lineage entry and adds sequence cols when present
+     # 2. First try direct merge
      df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
-
-     # 2a. If we have unmatched sequences and a model, use Gemini to match them
-     log.info(f"Model available: {model is not None}, Sequences found: {len(df_seq)}")
-     if model and len(df_seq) > 0:
-         # Log initial state
-         log.info(f"Merge attempt: {len(df_lin)} lineage entries, {len(df_seq)} sequences")
-         log.info(f"Lineage IDs: {df_lin['variant_id'].tolist()[:5]}...")
-         log.info(f"Sequence IDs: {df_seq['variant_id'].tolist()[:5]}...")
-
-         # Find lineage entries without sequences
+
+     # Log merge results
+     merged_aa = (~df['aa_seq'].isna()).sum()
+     merged_dna = (~df['dna_seq'].isna()).sum()
+     log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
+
+     # 3. If we have unmatched sequences and a model, use Gemini to match
+     if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+         # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
          missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
-         unmatched_lineage = df[missing_seq]['variant_id'].tolist()
+         unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()

-         # Find sequences that weren't matched
+         # Find unmatched sequences
          matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
          unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]

-         if len(unmatched_lineage) > 0 and len(unmatched_seqs) > 0:
-             log.info(f"Found {len(unmatched_lineage)} lineage entries without sequences and {len(unmatched_seqs)} unmatched sequences")
-             log.info(f"Using Gemini to match variants")
+         if unmatched_lineage_ids and len(unmatched_seqs) > 0:
+             log.info(f"Found {len(unmatched_lineage_ids)} lineage entries without sequences")
+             log.info(f"Found {len(unmatched_seqs)} unmatched sequences")
+             log.info("Using Gemini to match variants")

-             # Build prompt for Gemini to match variants
-             prompt = f"""Match enzyme variant IDs between two lists. The same variant may be labeled differently in different parts of the paper.
+             # Build prompt for Gemini
+             prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+
+ Papers often use different naming conventions for the same variant:
+ - Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
+ - Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
+
+ Match variants by analyzing generation numbers, prefixes, and patterns.

  Lineage variant IDs (need sequences):
- {json.dumps(unmatched_lineage)}
+ {json.dumps(unmatched_lineage_ids)}

  Sequence variant IDs (have sequences):
  {json.dumps(unmatched_seqs['variant_id'].tolist())}

- These lists contain variant identifiers from the same paper but may use different naming conventions.
- Match each lineage variant ID to its corresponding sequence variant ID based on any patterns or relationships you can identify.
-
- Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+ Return ONLY a JSON object mapping lineage IDs to sequence IDs.
+ Format: {{"lineage_id": "sequence_id", ...}}
  """

              try:
@@ -2024,85 +2180,82 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
                      text = text[4:].strip()

                  matches = json.loads(text)
-                 log.info(f"Gemini returned matches: {matches}")
+                 log.info(f"Gemini returned {len(matches)} matches")

-                 # Debug: Log what sequences we actually have
-                 log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
+                 # Create a mapping of sequence IDs to their data for efficient lookup
+                 seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}

-                 # Apply the matches
+                 # Apply matches and update variant IDs
                  for lineage_id, seq_id in matches.items():
-                     if lineage_id in unmatched_lineage:
-                         # Find the sequence data - be flexible with matching
-                         seq_data = None
+                     if lineage_id in unmatched_lineage_ids and seq_id in seq_data_map:
+                         # Get the sequence data
+                         seq_data = seq_data_map[seq_id]

-                         # First try exact match
-                         seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
-                         if len(seq_matches) > 0:
-                             seq_data = seq_matches.iloc[0]
-                         else:
-                             # Try to find by checking various matching strategies
-                             for idx, row in unmatched_seqs.iterrows():
-                                 variant_id = row['variant_id']
-                                 # Check if one is contained in the other
-                                 if seq_id in variant_id or variant_id in seq_id:
-                                     seq_data = row
-                                     break
-                                 # Check if they share the same core identifier (e.g., G0, G1, etc.)
-                                 seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
-                                 variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
-                                 if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
-                                     seq_data = row
-                                     break
-
-                         if seq_data is not None:
-                             # Update the dataframe
-                             mask = df['variant_id'] == lineage_id
-                             if mask.any():
-                                 # Log before update
-                                 log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
-
-                                 df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
-                                 df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
-                                 df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
-                                 df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+                         # Update the row with the matched sequence ID and data
+                         mask = df['variant_id'] == lineage_id
+                         if mask.any():
+                             # Update variant_id to use the sequence variant name
+                             df.loc[mask, 'variant_id'] = seq_id
+
+                             # Update parent_id if it matches any of the mapped lineage IDs
+                             parent_mask = df['parent_id'] == lineage_id
+                             if parent_mask.any():
+                                 df.loc[parent_mask, 'parent_id'] = seq_id
+
+                             # Update sequence data
+                             # For pandas Series from iterrows(), use proper indexing
+                             aa_seq_val = seq_data['aa_seq'] if 'aa_seq' in seq_data else None
+                             dna_seq_val = seq_data['dna_seq'] if 'dna_seq' in seq_data else None
+
+                             # Always update sequence fields to preserve DNA even when aa_seq is null
+                             df.loc[mask, 'aa_seq'] = aa_seq_val
+                             df.loc[mask, 'dna_seq'] = dna_seq_val

-                                 # Log after update
-                                 log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
-                                 log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
-                             else:
-                                 log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
-                         else:
-                             log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
+                             df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
+                             df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+
+                             # Log sequence info - check both aa_seq and dna_seq
+                             aa_len = len(seq_data['aa_seq']) if pd.notna(seq_data.get('aa_seq')) and seq_data.get('aa_seq') else 0
+                             dna_len = len(seq_data['dna_seq']) if pd.notna(seq_data.get('dna_seq')) and seq_data.get('dna_seq') else 0
+                             log.info(f"Matched {lineage_id} -> {seq_id} (aa_seq: {aa_len} chars, dna_seq: {dna_len} chars)")
+
+                 # Update any remaining parent_id references to matched variants
+                 for lineage_id, seq_id in matches.items():
+                     parent_mask = df['parent_id'] == lineage_id
+                     if parent_mask.any():
+                         df.loc[parent_mask, 'parent_id'] = seq_id

-                 # Log the final state after all matches
-                 matched_count = (~df['aa_seq'].isna()).sum()
-                 log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
+                 # Log final state - count variants with any sequence (aa or dna)
+                 aa_count = (~df['aa_seq'].isna()).sum()
+                 dna_count = (~df['dna_seq'].isna()).sum()
+                 any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+                 log.info(f"After Gemini matching: {any_seq_count}/{len(df)} variants have sequences (aa: {aa_count}, dna: {dna_count})")

              except Exception as e:
                  log.warning(f"Failed to match variants using Gemini: {e}")

-     # 3. If generation missing after user input, try inference
+     # 4. If generation missing, try inference
      if df["generation"].isna().any():
-         _infer_generations(lineage) # mutates in place
-         df = df.drop(columns=["generation"]).merge(
-             pd.DataFrame(
-                 {"variant_id": [v.variant_id for v in lineage], "generation": [v.generation for v in lineage]}
-             ),
-             on="variant_id",
-             how="left",
-         )
-
-     # 4. Attach DOI column for provenance
+         _infer_generations(lineage)
+         # Need to update the generations based on the potentially updated variant IDs
+         gen_map = {v.variant_id: v.generation for v in lineage}
+         # Also create a map for any variant IDs that were replaced
+         for idx, row in df.iterrows():
+             variant_id = row['variant_id']
+             if variant_id in gen_map:
+                 df.at[idx, 'generation'] = gen_map[variant_id]
+
+     # 5. Attach DOI column
      df["doi"] = doi

-     # 5. Sort rows: primary by generation, then by variant_id
+     # 6. Sort by generation, then variant_id
      df = df.sort_values(["generation", "variant_id"], kind="mergesort")

-     # Debug: Log final merge state
-     seq_count = (~df['aa_seq'].isna()).sum()
-     log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
-     if seq_count > 0:
-         log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
+     # 7. Log final state
+     aa_count = (~df['aa_seq'].isna()).sum()
+     dna_count = (~df['dna_seq'].isna()).sum()
+     any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+     log.info(f"Final result: {len(df)} variants, {any_seq_count} with sequences (aa: {aa_count}, dna: {dna_count})")

      return df

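Note: because matched rows are renamed to the sequence-section IDs, parent_id references must be rewritten as well, or the lineage graph would point at stale names. A toy pandas illustration (all IDs invented):

    import pandas as pd

    # Toy frame: lineage uses numeric IDs; the model matched them to descriptive names.
    df = pd.DataFrame({
        "variant_id": ["5295", "5308"],
        "parent_id":  [None, "5295"],
    })
    matches = {"5295": "ApPgb-G0", "5308": "ApPgb-G1"}  # hypothetical mapping

    for lineage_id, seq_id in matches.items():
        df.loc[df["variant_id"] == lineage_id, "variant_id"] = seq_id
        df.loc[df["parent_id"] == lineage_id, "parent_id"] = seq_id  # keep edges consistent

    print(df)  # parent of ApPgb-G1 is now ApPgb-G0, not the stale "5295"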
@@ -2114,28 +2267,27 @@ def merge_and_score(
      doi: Optional[str] = None,
      model=None,
  ) -> pd.DataFrame:
-     """User-facing helper imported by the pipeline orchestrator.
-
-     * Ensures lineage + sequence lists are non-empty.
-     * Performs a shallow validation.
-     * Returns a ready-to-export pandas DataFrame.
+     """Merge lineage and sequence data into a single DataFrame.
+
+     Args:
+         lineage: List of Variant objects from lineage extraction
+         seqs: List of SequenceBlock objects from sequence extraction
+         doi: DOI of the paper for provenance
+         model: Gemini model for smart matching (optional)
+
+     Returns:
+         DataFrame with merged lineage and sequence data
      """
-
      if not lineage:
          raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")

-     # If no sequences found, still build a DataFrame so caller can decide what to do.
      df = _merge_lineage_and_sequences(lineage, seqs, doi, model)

-     # Basic sanity: warn if many missing sequences
+     # Warn if many sequences are missing
      missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
      if missing_rate > 0.5:
          log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))

-     # Debug log before returning
-     seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
-     log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
-
      return df

  # -------------------------------------------------------------------- end 8 ---
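Note: a usage sketch of the documented contract — lineage and seqs are the lists produced upstream by the lineage and sequence extraction steps; the DOI value is illustrative:

    try:
        df = merge_and_score(lineage, seqs, doi="10.1000/xyz123", model=None)  # model=None skips Gemini matching
    except ValueError:
        df = None  # lineage extraction produced nothing to merge
    if df is not None and df["aa_seq"].isna().mean() > 0.5:
        pass  # the >50% missing-sequence warning has already been logged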
@@ -2320,18 +2472,17 @@ def run_pipeline(
      # Save final data with sequences using same filename (overwrites lineage-only)
      sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"

-     # Debug: Log what we're about to save
-     seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
-     log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
-     if seq_count > 0 and 'aa_seq' in df_final:
-         with_seq = df_final[~df_final['aa_seq'].isna()]
-         log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
-
+     # Save the final CSV
      df_final.to_csv(sequence_path, index=False)
+
+     # Log summary statistics
+     seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
      log.info(
-         "Overwrote with final results -> %s (%.1f kB)",
+         "Saved final CSV -> %s (%.1f kB, %d variants, %d with sequences)",
          sequence_path,
          sequence_path.stat().st_size / 1024,
+         len(df_final),
+         seq_count
      )

      log.info(