debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -589,17 +589,28 @@ TEXT:
 {text}
 """.strip()
 
-_CAMPAIGN_MAPPING_PROMPT = """
-Given these identified campaigns and the lineage data location, determine which campaign this data belongs to:
+_CAMPAIGN_BEST_LOCATION_PROMPT = """
+Given this specific campaign and the available data locations, select the BEST location to extract the complete lineage data for this campaign.
 
-Campaigns:
-{campaigns}
+Campaign:
+- ID: {campaign_id}
+- Name: {campaign_name}
+- Description: {description}
+- Lineage identifiers: {identifiers}
 
-Data location: {location}
-Caption/context: {context}
+Available locations with context:
+{locations_with_context}
 
-Based on the caption, enzyme names, or reaction details, which campaign does this data belong to?
-Return ONLY the campaign_id as a string.
+Select the location that most likely contains the COMPLETE lineage data (all variants, mutations, and parent relationships) for THIS SPECIFIC campaign.
+
+Consider:
+1. Tables are usually more structured and complete than figures
+2. Look for locations that mention this campaign's specific identifiers or enzyme names
+3. Some locations may contain data for multiple campaigns - that's fine, we can filter later
+4. Prioritize completeness over visual clarity
+
+Return a JSON object with:
+{{"location": "selected location identifier", "confidence": 0-100, "reason": "explanation"}}
 
 """.strip()
 
 # ---- 6.1 Prompt templates -------------------------------------------------
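Since the replacement prompt now asks for structured JSON, the doubled braces in the template matter: `str.format` collapses `{{` and `}}` to literal braces while still substituting the named fields. A minimal sketch (hypothetical template and values) of how the rendering behaves:

```python
# Brace escaping in str.format: {{ }} become literal JSON braces (hypothetical values).
template = 'Campaign: {campaign_id}\nReturn a JSON object with:\n{{"location": "...", "confidence": 0-100}}'
print(template.format(campaign_id="lactamase_evolution"))
# Campaign: lactamase_evolution
# Return a JSON object with:
# {"location": "...", "confidence": 0-100}
```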
@@ -756,9 +767,43 @@ def identify_evolution_locations(
     max_results: int = 5,
     debug_dir: str | Path | None = None,
     campaigns: Optional[List[Campaign]] = None,
+    pdf_paths: Optional[List[Path]] = None,
 ) -> List[dict]:
     """Ask Gemini where in the paper the lineage is probably described."""
-    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + text[:15_000]
+    # Extract table of contents from PDFs if available
+    toc_text = ""
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            doc = _open_doc(pdf_path)
+            try:
+                for page_num in range(min(5, len(doc))):
+                    page_text = doc[page_num].get_text()
+                    if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                        # Found TOC page
+                        lines = page_text.split('\n')
+                        toc_lines = []
+                        for line in lines:
+                            line = line.strip()
+                            # TOC entries typically have page numbers
+                            if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                    re.search(r'\s{2,}S?\d+\s*$', line) or
+                                    re.match(r'^\d+\.\s+\w+', line)):
+                                toc_lines.append(line)
+                        if toc_lines:
+                            pdf_name = pdf_path.name
+                            toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                        break
+            finally:
+                doc.close()
+
+        if toc_sections:
+            toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+    # Include TOC before the main text
+    combined_text = toc_text + text if toc_text else text
+    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
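The three regexes above encode the TOC-line heuristic: dot leaders before a (possibly S-prefixed) page number, a wide whitespace gap before one, or a numbered heading. A standalone check with made-up lines shows what each pattern catches:

```python
import re

# The same three TOC heuristics as above, applied to invented sample lines.
TOC_PATTERNS = [
    re.compile(r'\.{2,}\s*S?\d+\s*$'),  # dot leaders: "Figure S2 ...... S5"
    re.compile(r'\s{2,}S?\d+\s*$'),     # wide gap:    "General methods    S2"
    re.compile(r'^\d+\.\s+\w+'),        # numbering:   "2. Directed evolution"
]

samples = [
    "Supplementary Figure 2 ........ S5",
    "General methods    S2",
    "2. Directed evolution of the lineage",
    "The reaction proceeded smoothly.",
]
for line in samples:
    print(any(p.search(line) for p in TOC_PATTERNS), line)
# True for the first three, False for ordinary prose.
```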
@@ -770,69 +815,7 @@ def identify_evolution_locations(
     except Exception as exc: # pragma: no cover
         log.warning("identify_evolution_locations(): %s", exc)
 
-    # If we have campaigns, try to map locations to campaigns
-    if campaigns and locs:
-        for loc in locs:
-            # Extract more context around the location
-            location_str = loc.get('location', '')
-            context = loc.get('reason', '')
-
-            # Ask Gemini to map this location to a campaign
-            if campaigns:
-                try:
-                    campaigns_json = json.dumps([{
-                        "campaign_id": c.campaign_id,
-                        "campaign_name": c.campaign_name,
-                        "lineage_hint": c.notes
-                    } for c in campaigns])
-
-                    mapping_prompt = _CAMPAIGN_MAPPING_PROMPT.format(
-                        campaigns=campaigns_json,
-                        location=location_str,
-                        context=context
-                    )
-
-                    # Save mapping prompt to debug if provided
-                    if debug_dir:
-                        debug_path = Path(debug_dir)
-                        debug_path.mkdir(parents=True, exist_ok=True)
-                        mapping_file = debug_path / f"campaign_mapping_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                        _dump(f"=== CAMPAIGN MAPPING PROMPT ===\nLocation: {location_str}\n{'='*80}\n\n{mapping_prompt}", mapping_file)
-
-                    response = model.generate_content(mapping_prompt)
-                    response_text = _extract_text(response).strip()
-
-                    # Extract just the campaign_id from the response
-                    # Look for the campaign_id pattern in the response
-                    campaign_id = None
-                    for campaign in campaigns:
-                        if hasattr(campaign, 'campaign_id') and campaign.campaign_id in response_text:
-                            campaign_id = campaign.campaign_id
-                            break
-
-                    # If not found, try to extract the last line or quoted string
-                    if not campaign_id:
-                        # Try to find quoted string
-                        quoted_match = re.search(r'"([^"]+)"', response_text)
-                        if quoted_match:
-                            campaign_id = quoted_match.group(1)
-                        else:
-                            # Take the last non-empty line
-                            lines = [line.strip() for line in response_text.split('\n') if line.strip()]
-                            if lines:
-                                campaign_id = lines[-1].strip('"')
-
-                    # Save mapping response to debug if provided
-                    if debug_dir:
-                        response_file = debug_path / f"campaign_mapping_response_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                        _dump(f"=== CAMPAIGN MAPPING RESPONSE ===\nLocation: {location_str}\nFull response:\n{response_text}\nExtracted campaign_id: {campaign_id}\n{'='*80}", response_file)
-
-                    # Add campaign_id to location
-                    if campaign_id:
-                        loc['campaign_id'] = campaign_id
-                        log.info(f"Mapped {location_str} to campaign: {campaign_id}")
-                except Exception as exc:
-                    log.warning(f"Failed to map location to campaign: {exc}")
+    # No longer mapping locations to campaigns here - we'll ask for best location per campaign instead
 
     return locs if isinstance(locs, list) else []
 
@@ -840,7 +823,14 @@ def identify_evolution_locations(
 
 def _parse_variants(data: Dict[str, Any], campaign_id: Optional[str] = None) -> List[Variant]:
     """Convert raw JSON to a list[Variant] with basic validation."""
-    variants_json = data.get("variants", []) if isinstance(data, dict) else []
+    if isinstance(data, list):
+        # Direct array of variants
+        variants_json = data
+    elif isinstance(data, dict):
+        # Object with "variants" key
+        variants_json = data.get("variants", [])
+    else:
+        variants_json = []
     parsed: List[Variant] = []
     for item in variants_json:
         try:
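The new branching accepts either response shape the model may return: a bare JSON array of variants, or an object wrapping a `variants` key. The same normalization in isolation (hypothetical payloads):

```python
from typing import Any, List

def normalize_variants_payload(data: Any) -> List[dict]:
    """Mirror of the branching above: accept a bare array or a {'variants': [...]} object."""
    if isinstance(data, list):
        return data                      # direct array of variants
    if isinstance(data, dict):
        return data.get("variants", [])  # object with "variants" key
    return []                            # anything else: nothing to parse

print(normalize_variants_payload([{"variant_id": "WT"}]))                # [{'variant_id': 'WT'}]
print(normalize_variants_payload({"variants": [{"variant_id": "WT"}]}))  # [{'variant_id': 'WT'}]
print(normalize_variants_payload("unexpected"))                          # []
```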
@@ -878,6 +868,7 @@ def extract_complete_lineage(
     debug_dir: str | Path | None = None,
     campaign_id: Optional[str] = None,
     campaign_info: Optional[Campaign] = None,
+    pdf_paths: Optional[List[Path]] = None,
 ) -> List[Variant]:
     """Prompt Gemini for the full lineage and return a list[Variant]."""
     # Build campaign context
@@ -899,10 +890,44 @@ IMPORTANT:
 4. Include parent variants only if they are direct ancestors in this campaign's lineage.
 """
 
+    # Extract table of contents from PDFs if available
+    toc_text = ""
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            doc = _open_doc(pdf_path)
+            try:
+                for page_num in range(min(5, len(doc))):
+                    page_text = doc[page_num].get_text()
+                    if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                        # Found TOC page
+                        lines = page_text.split('\n')
+                        toc_lines = []
+                        for line in lines:
+                            line = line.strip()
+                            # TOC entries typically have page numbers
+                            if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                    re.search(r'\s{2,}S?\d+\s*$', line) or
+                                    re.match(r'^\d+\.\s+\w+', line)):
+                                toc_lines.append(line)
+                        if toc_lines:
+                            pdf_name = pdf_path.name
+                            toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                        break
+            finally:
+                doc.close()
+
+        if toc_sections:
+            toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+    # Include TOC in the prompt text
+    combined_text = toc_text + text if toc_text else text
+
     prompt = _LINEAGE_EXTRACT_PROMPT.format(
         campaign_context=campaign_context,
         schema=_LINEAGE_SCHEMA_HINT,
-        text=text[:MAX_CHARS],
+        text=combined_text[:MAX_CHARS],
     )
     raw = generate_json_with_retry(
         model,
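The TOC-scanning block above is duplicated verbatim from `identify_evolution_locations`. A possible shared helper, sketched under the assumption that the module's `_open_doc`, `re`, `Path`, and typing imports are in scope (the name `_extract_toc_text` is hypothetical, not part of the release):

```python
def _extract_toc_text(pdf_paths: Optional[List[Path]]) -> str:
    """Scan the first five pages of each PDF for a table of contents.

    Hypothetical refactor: same heuristics as the two inline blocks above.
    """
    if not pdf_paths:
        return ""
    toc_sections = []
    for pdf_path in pdf_paths:
        doc = _open_doc(pdf_path)
        try:
            for page_num in range(min(5, len(doc))):
                page_text = doc[page_num].get_text()
                if not any(ind in page_text.lower() for ind in ('table of contents', 'contents', 'summary')):
                    continue  # not a TOC-looking page
                toc_lines = [
                    ln.strip() for ln in page_text.split('\n')
                    if re.search(r'\.{2,}\s*S?\d+\s*$', ln.strip())
                    or re.search(r'\s{2,}S?\d+\s*$', ln.strip())
                    or re.match(r'^\d+\.\s+\w+', ln.strip())
                ]
                if toc_lines:
                    toc_sections.append(f"\n--- Table of Contents from {pdf_path.name} ---\n" + '\n'.join(toc_lines))
                break  # stop after the first TOC-indicator page, as the inline code does
        finally:
            doc.close()
    if not toc_sections:
        return ""
    return "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
```

Both call sites could then reduce to `toc_text = _extract_toc_text(pdf_paths)`.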
@@ -1044,15 +1069,27 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
     # 1. Line contains dots (...) followed by page number
     # 2. Line ends with just a page number
     # 3. Line has "Table S12:" or similar followed by title and page
-    if '...' in line or re.search(r'\.\s*\d+\s*$', line) or re.search(r':\s*[^:]+\s+\d+\s*$', line):
+    # 4. Pattern appears at start of line followed by description and page number
+    if ('...' in line or
+            re.search(r'\.\s*\d+\s*$', line) or
+            re.search(r':\s*[^:]+\s+\d+\s*$', line) or
+            (line.strip().startswith(pattern) and re.search(r'\s+\d+\s*$', line))):
         return True
 
     # Check if this is in a contents/TOC section
-    # Look backwards up to 500 chars for "Contents" or "Table of Contents"
-    context_start = max(0, position - 500)
+    # Look backwards up to 1000 chars for "Contents" or "Table of Contents"
+    context_start = max(0, position - 1000)
     context = text[context_start:position].lower()
     if 'contents' in context or 'table of contents' in context:
         return True
+
+    # Check if we're in the first ~5000 chars of the document (likely TOC area)
+    # This helps catch TOC entries that don't have obvious formatting
+    if position < 5000:
+        # Be more strict for early document positions
+        # Check if line looks like a TOC entry (has page number at end)
+        if re.search(r'\s+\d+\s*$', line):
+            return True
 
     return False
@@ -1185,13 +1222,39 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
                     log.warning("No sequences found in any of %d occurrences of '%s'",
                                 len(all_positions), location_str)
             else:
-                # For lineage extraction, use the original logic
-                start = max(0, pos - context_chars)
-                end = min(len(text), pos + len(used_pattern) + context_chars)
-                section_text = text[start:end]
-                extracted_sections.append(section_text)
-                log.info("Found '%s' using pattern '%s' at position %d, extracted %d chars",
-                         location_str, used_pattern, pos, len(section_text))
+                # For lineage extraction, find ALL occurrences of the pattern
+                all_positions = []
+                search_pos = 0
+
+                # Find all occurrences of this pattern (not just the first)
+                while search_pos < len(text_lower):
+                    temp_pos = text_lower.find(used_pattern.lower(), search_pos)
+                    if temp_pos == -1:
+                        break
+
+                    # Check if this is a TOC entry
+                    if _is_toc_entry(text, temp_pos, used_pattern):
+                        log.debug("Skipping TOC entry for pattern '%s' at position %d", used_pattern, temp_pos)
+                        search_pos = temp_pos + len(used_pattern)
+                        continue
+
+                    all_positions.append(temp_pos)
+                    search_pos = temp_pos + len(used_pattern)
+
+                    if len(all_positions) >= 10:  # Limit to 10 occurrences
+                        break
+
+                log.info("Found %d non-TOC occurrences of pattern '%s' for location '%s'",
+                         len(all_positions), used_pattern, location_str)
+
+                # Extract context around each occurrence
+                for idx, pos in enumerate(all_positions):
+                    start = max(0, pos - context_chars)
+                    end = min(len(text), pos + len(used_pattern) + context_chars)
+                    section_text = text[start:end]
+                    extracted_sections.append(section_text)
+                    log.info("Occurrence %d/%d: Found '%s' at position %d, extracted %d chars",
+                             idx + 1, len(all_positions), location_str, pos, len(section_text))
         else:
             log.warning("Location '%s' not found in text (tried %d patterns)", location_str, len(page_patterns))
 
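The lineage branch now mirrors the sequence branch: scan for every occurrence (capped at ten), skip TOC hits, then excerpt around each. The core scan in isolation (toy inputs, TOC filter omitted):

```python
def find_all_occurrences(haystack: str, needle: str, limit: int = 10) -> list:
    """Case-insensitive, non-overlapping scan for `needle`, capped at `limit` hits."""
    positions, pos = [], 0
    hay, pat = haystack.lower(), needle.lower()
    while pos < len(hay) and len(positions) < limit:
        hit = hay.find(pat, pos)
        if hit == -1:
            break
        positions.append(hit)
        pos = hit + len(pat)  # resume after this hit
    return positions

print(find_all_occurrences("Table S3 ... see Table S3 again; TABLE S3", "Table S3"))
# [0, 17, 33]
```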
@@ -1227,43 +1290,142 @@ def get_lineage(
         log.info(f"Identified {len(campaigns)} distinct campaigns")
         for camp in campaigns:
             log.info(f" - {camp.campaign_name}: {camp.description}")
+    else:
+        log.warning("No campaigns identified, creating default campaign for enzyme characterization")
+        # Create a default campaign when none are found
+        default_campaign = Campaign(
+            campaign_id="default_characterization",
+            campaign_name="Enzyme Characterization Study",
+            description="Default campaign for papers that characterize existing enzyme variants without describing new directed evolution",
+            model_substrate="Unknown",
+            model_product="Unknown",
+            data_locations=["Full manuscript text"]
+        )
+        campaigns = [default_campaign]
+        log.info(f"Created default campaign: {default_campaign.campaign_name}")
 
     # Use captions for identification - they're concise and focused
-    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=campaigns)
+    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
 
     all_variants = []
 
-    if locations:
+    if campaigns:
+        # If we have campaigns but no specific locations, use general extraction
+        if not locations:
+            log.info("No specific lineage locations found, extracting from full text with campaign context")
+            # Extract lineage for each campaign using full text
+            for campaign in campaigns:
+                log.info(f"Processing campaign: {campaign.campaign_id}")
+                campaign_variants = extract_campaign_lineage(
+                    full_text, model, campaign_id=campaign.campaign_id,
+                    debug_dir=debug_dir, pdf_paths=pdf_paths,
+                    campaign_info=campaign
+                )
+                all_variants.extend(campaign_variants)
+            return all_variants, campaigns
+        # Original logic for when we have both locations and campaigns
         # Log location information
         location_summary = []
         for loc in locations[:5]:
             if isinstance(loc, dict):
-                campaign_info = f", campaign: {loc.get('campaign_id', 'unknown')}" if 'campaign_id' in loc else ""
-                location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)}{campaign_info})")
+                location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
             else:
                 location_summary.append(str(loc))
         log.info("Gemini identified %d potential lineage locations: %s",
                  len(locations), ", ".join(location_summary))
 
-        # Group locations by campaign
-        locations_by_campaign = {}
+        # Extract context around each location for better decision making
+        locations_with_context = []
         for loc in locations:
-            campaign_id = loc.get('campaign_id', 'default') if isinstance(loc, dict) else 'default'
-            if campaign_id not in locations_by_campaign:
-                locations_by_campaign[campaign_id] = []
-            locations_by_campaign[campaign_id].append(loc)
+            location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+            # Extract 1000 chars of context around the location
+            context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
+            locations_with_context.append({
+                'location': loc,
+                'context': context_text[:1000]  # First 1000 chars of extracted context
+            })
 
-        # Process each campaign's locations
-        for campaign_id, campaign_locations in locations_by_campaign.items():
-            log.info(f"Processing campaign: {campaign_id}")
+        # For each campaign, ask Gemini to select the best location
+        for campaign in campaigns:
+            log.info(f"Processing campaign: {campaign.campaign_id}")
 
-            # Sort locations by confidence to get the highest confidence one
-            sorted_locations = sorted(campaign_locations,
-                                      key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                      reverse=True)
+            # Build locations context string
+            locations_str = ""
+            for i, loc_ctx in enumerate(locations_with_context):
+                loc = loc_ctx['location']
+                context = loc_ctx['context']
+                location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
+                confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
+                reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+
+                locations_str += f"\n{i+1}. {location_str} (Type: {location_type}, Confidence: {confidence})\n"
+                locations_str += f" Reason: {reason}\n"
+                locations_str += f" Context (first 500 chars):\n {context[:500]}...\n"
 
-            # Use only the highest confidence location to avoid duplicates
-            primary_location = sorted_locations[0] if sorted_locations else None
+            # Ask Gemini to select best location for this campaign
+            best_location_prompt = _CAMPAIGN_BEST_LOCATION_PROMPT.format(
+                campaign_id=campaign.campaign_id,
+                campaign_name=campaign.campaign_name,
+                description=campaign.description,
+                identifiers=campaign.notes or "No specific identifiers provided",
+                locations_with_context=locations_str
+            )
+
+            primary_location = None
+            try:
+                # Save prompt to debug if provided
+                if debug_dir:
+                    debug_path = Path(debug_dir)
+                    debug_path.mkdir(parents=True, exist_ok=True)
+                    prompt_file = debug_path / f"best_location_{campaign.campaign_id}_{int(time.time())}.txt"
+                    _dump(f"=== BEST LOCATION PROMPT ===\nCampaign: {campaign.campaign_id}\n{'='*80}\n\n{best_location_prompt}", prompt_file)
+
+                response = model.generate_content(best_location_prompt)
+                response_text = _extract_text(response).strip()
+
+                # Parse JSON response
+                if response_text.startswith("```"):
+                    response_text = response_text.split("```")[1].strip()
+                    if response_text.startswith("json"):
+                        response_text = response_text[4:].strip()
+
+                best_loc_data = json.loads(response_text)
+                selected_location = best_loc_data.get('location', '')
+                confidence = best_loc_data.get('confidence', 0)
+                reason = best_loc_data.get('reason', '')
+
+                # Save response to debug if provided
+                if debug_dir:
+                    response_file = debug_path / f"best_location_response_{campaign.campaign_id}_{int(time.time())}.txt"
+                    _dump(f"=== BEST LOCATION RESPONSE ===\nCampaign: {campaign.campaign_id}\nSelected: {selected_location}\nConfidence: {confidence}\nReason: {reason}\n{'='*80}", response_file)
+
+                log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")
+
+                # Find the actual location object
+                for loc in locations:
+                    loc_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                    if loc_str == selected_location:
+                        primary_location = loc
+                        break
+
+                if not primary_location:
+                    log.warning(f"Could not find selected location '{selected_location}' in locations list")
+                    # Fall back to highest confidence location
+                    primary_location = sorted(locations,
+                                              key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                              reverse=True)[0] if locations else None
+
+            except Exception as e:
+                log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
+                # Fall back to highest confidence location
+                primary_location = sorted(locations,
+                                          key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                          reverse=True)[0] if locations else None
+
+            if not primary_location:
+                log.warning(f"No location found for campaign {campaign.campaign_id}")
+                continue
 
             # Track if we successfully extracted from figure
             extracted_from_figure = False
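The fence-stripping-then-parse sequence above recurs in several places in this release. A compact standalone equivalent (hypothetical helper, hedged sketch of the same logic):

```python
import json

def parse_fenced_json(reply: str) -> dict:
    """Strip an optional fenced code block before parsing (mirrors the inline logic above)."""
    text = reply.strip()
    if text.startswith("```"):
        text = text.split("```")[1].strip()  # keep the fenced body
        if text.startswith("json"):
            text = text[4:].strip()          # drop the language tag
    return json.loads(text)

reply = '```json\n{"location": "Table S4", "confidence": 85, "reason": "complete lineage"}\n```'
print(parse_fenced_json(reply)["location"])  # Table S4
```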
@@ -1297,12 +1459,11 @@ def get_lineage(
                         log.info("Saved lineage figure to: %s", figure_file)
 
                 # Extract lineage from the figure
-                campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
                 variants = extract_lineage_from_figure(
                     figure_bytes, model,
                     debug_dir=debug_dir,
-                    campaign_id=campaign_id,
-                    campaign_info=campaign_obj
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign
                 )
                 if variants:
                     all_variants.extend(variants)
@@ -1327,22 +1488,22 @@ def get_lineage(
                 log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                          len(full_text), len(focused_text),
                          primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
-                         campaign_id)
+                         campaign.campaign_id)
 
-                # Find the campaign object
-                campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
+                # Extract lineage for this campaign
                 campaign_variants = extract_complete_lineage(
                     focused_text, model,
                     debug_dir=debug_dir,
-                    campaign_id=campaign_id,
-                    campaign_info=campaign_obj
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
                 )
                 all_variants.extend(campaign_variants)
 
         return all_variants, campaigns
     else:
         log.info("Gemini did not identify specific lineage locations")
-        variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir)
+        variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir, pdf_paths=pdf_paths)
         return variants, campaigns
 
 # === 7. SEQUENCE EXTRACTION === ----------------------------------------------
@@ -1398,18 +1559,31 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
     return []
 
 # --- 7.2 Page-based extraction helper ---------------------------------------
-def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int]) -> str:
-    """Extract text from a specific page number in the PDFs."""
+def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
+    """Extract text from a specific page number in the PDFs.
+
+    Args:
+        pdf_paths: List of PDF paths
+        page_num: Page number (can be "S1", "S2", etc for SI pages)
+        skip_si_toc: If True, skip first 2 pages of SI to avoid TOC
+    """
     # Convert page number to int and handle S-prefix
     page_str = str(page_num).strip().upper()
     if page_str.startswith('S'):
         # Supplementary page - look in the SI PDF (second PDF)
         actual_page = int(page_str[1:]) - 1  # 0-indexed
         pdf_index = 1 if len(pdf_paths) > 1 else 0
+        is_si_page = True
     else:
         # Regular page - look in the main PDF
         actual_page = int(page_str) - 1  # 0-indexed
         pdf_index = 0
+        is_si_page = False
+
+    # Skip first 2 pages of SI to avoid table of contents
+    if skip_si_toc and is_si_page and actual_page < 2:
+        log.info("Skipping SI page %s (first 2 pages are typically TOC)", page_str)
+        return ""
 
     if pdf_index >= len(pdf_paths):
         log.warning("Page %s requested but not enough PDFs provided", page_str)
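The page-reference convention here: bare numbers index the manuscript PDF, "S"-prefixed numbers index the SI PDF, and with `skip_si_toc` enabled the first two SI pages are treated as TOC and skipped. The resolution logic in isolation (toy function, no PDFs involved):

```python
def resolve_page_ref(page_num, n_pdfs: int = 2):
    """Toy mirror of the S-prefix handling: returns (pdf_index, zero_based_page, skipped_as_toc)."""
    page_str = str(page_num).strip().upper()
    if page_str.startswith('S'):
        actual_page = int(page_str[1:]) - 1  # "S3" -> page index 2 of the SI PDF
        pdf_index = 1 if n_pdfs > 1 else 0
        skipped = actual_page < 2            # first two SI pages: probably TOC
    else:
        actual_page = int(page_str) - 1
        pdf_index = 0
        skipped = False
    return pdf_index, actual_page, skipped

print(resolve_page_ref("S1"))  # (1, 0, True)
print(resolve_page_ref("S5"))  # (1, 4, False)
print(resolve_page_ref(3))     # (0, 2, False)
```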
@@ -1543,8 +1717,14 @@ IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
 - Only extract dna_seq if NO amino acid sequence is available for that variant
 - This reduces redundancy since protein sequences are usually more relevant
 
+CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
+- Papers often use different naming conventions in different sections
+- DO NOT normalize or simplify variant IDs
+- Extract the variant_id exactly as written where the sequence appears
+- Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
+
 For each variant return:
-  * variant_id - the label used in the paper (e.g. "R4-10")
+  * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
   * aa_seq - amino-acid sequence (uppercase), or null
   * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
 
@@ -1584,7 +1764,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
     return _parse_sequences(data)
 
 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
-_VALID_AA = set("ACDEFGHIKLMNPQRSTVWY")
+_VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codon
 _VALID_DNA = set("ACGT")
 
 def _contains_sequence(text: str, min_length: int = 50) -> bool:
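Widening `_VALID_AA` means a trailing `*` (a common stop-codon marker in translated sequences) no longer causes an otherwise valid protein sequence to be rejected. A sketch of how such alphabets are typically applied (hypothetical classifier; note DNA must be tested first, since {A, C, G, T} is a subset of the amino-acid alphabet):

```python
VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # 20 canonical residues plus the stop marker
VALID_DNA = set("ACGT")

def classify_sequence(seq: str) -> str:
    """Rough alphabet-based classifier (test the smaller DNA alphabet first)."""
    s = seq.upper().replace(" ", "").replace("\n", "")
    if s and set(s) <= VALID_DNA:
        return "dna"
    if s and set(s) <= VALID_AA:
        return "protein"
    return "unknown"

print(classify_sequence("ATGGCTAGC"))    # dna
print(classify_sequence("MSTNPKPQRK*"))  # protein - trailing stop now accepted
print(classify_sequence("MX-??"))        # unknown
```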
@@ -1793,6 +1973,173 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
         log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
         return {}
 
+def extract_enzyme_info_with_gemini(
+    text: str,
+    variants: List[Variant],
+    model,
+) -> Dict[str, str]:
+    """Use Gemini to extract enzyme names or sequences when PDB IDs are not available.
+
+    Returns:
+        Dict mapping variant IDs to sequences
+    """
+    # Build variant info for context
+    variant_info = []
+    for v in variants[:10]:  # Limit to first 10 variants for context
+        info = {
+            "id": v.variant_id,
+            "mutations": v.mutations[:5] if v.mutations else [],  # Limit mutations shown
+            "parent": v.parent_id,
+            "generation": v.generation
+        }
+        variant_info.append(info)
+
+    prompt = f"""You are analyzing a scientific paper about enzyme engineering. No PDB IDs were found in the paper, and I need to obtain protein sequences for the enzyme variants described.
+
+Here are the variants found in the paper:
+{json.dumps(variant_info, indent=2)}
+
+Please analyze the paper text and:
+1. Identify the common name of the enzyme being studied (e.g., "P450 BM3", "cytochrome P450 BM3", "CYP102A1")
+2. If possible, extract or find the wild-type sequence
+3. Provide any UniProt IDs or accession numbers mentioned
+
+Paper text (first 5000 characters):
+{text[:5000]}
+
+Return your response as a JSON object with this structure:
+{{
+    "enzyme_name": "common name of the enzyme",
+    "systematic_name": "systematic name if applicable (e.g., CYP102A1)",
+    "uniprot_id": "UniProt ID if found",
+    "wild_type_sequence": "sequence if found in paper or if you know it",
+    "additional_names": ["list", "of", "alternative", "names"]
+}}
+
+If you cannot determine certain fields, set them to null.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text_response = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text_response.startswith("```"):
+            text_response = text_response.split("```")[1].strip()
+            if text_response.startswith("json"):
+                text_response = text_response[4:].strip()
+            text_response = text_response.split("```")[0].strip()
+
+        enzyme_info = json.loads(text_response)
+        log.info(f"Gemini extracted enzyme info: {enzyme_info.get('enzyme_name', 'Unknown')}")
+
+        sequences = {}
+
+        # If Gemini provided a sequence directly, use it
+        if enzyme_info.get("wild_type_sequence"):
+            # Clean the sequence
+            seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
+            # Validate it looks like a protein sequence
+            if seq and all(c in "ACDEFGHIKLMNPQRSTVWY" for c in seq) and len(seq) > 50:
+                # Map to the first variant or wild-type
+                wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
+                if wt_variant:
+                    sequences[wt_variant.variant_id] = seq
+                else:
+                    sequences[variants[0].variant_id] = seq
+                log.info(f"Using sequence from Gemini: {len(seq)} residues")
+
+        # If no sequence but we have names, try to fetch from UniProt
+        if not sequences:
+            names_to_try = []
+            if enzyme_info.get("enzyme_name"):
+                names_to_try.append(enzyme_info["enzyme_name"])
+            if enzyme_info.get("systematic_name"):
+                names_to_try.append(enzyme_info["systematic_name"])
+            if enzyme_info.get("uniprot_id"):
+                names_to_try.append(enzyme_info["uniprot_id"])
+            if enzyme_info.get("additional_names"):
+                names_to_try.extend(enzyme_info["additional_names"])
+
+            # Try each name with UniProt
+            for name in names_to_try:
+                if name:
+                    uniprot_seqs = fetch_sequence_by_name(name)
+                    if uniprot_seqs:
+                        # Map the first sequence to appropriate variant
+                        seq = list(uniprot_seqs.values())[0]
+                        wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
+                        if wt_variant:
+                            sequences[wt_variant.variant_id] = seq
+                        else:
+                            sequences[variants[0].variant_id] = seq
+                        log.info(f"Found sequence via UniProt search for '{name}': {len(seq)} residues")
+                        break
+
+        return sequences
+
+    except Exception as e:
+        log.warning(f"Failed to extract enzyme info with Gemini: {e}")
+        return {}
+
+
+def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
+    """Fetch protein sequences from UniProt by enzyme name or ID.
+
+    Args:
+        enzyme_name: Name, ID, or accession of the enzyme
+
+    Returns:
+        Dict mapping identifiers to sequences
+    """
+    import requests
+
+    clean_name = enzyme_name.strip()
+
+    # First try as accession number
+    if len(clean_name) <= 10 and (clean_name[0].isalpha() and clean_name[1:].replace("_", "").isalnum()):
+        # Looks like a UniProt accession
+        url = f"https://rest.uniprot.org/uniprotkb/{clean_name}"
+        try:
+            response = requests.get(url, timeout=10)
+            if response.status_code == 200:
+                data = response.json()
+                sequence = data.get('sequence', {}).get('value', '')
+                if sequence:
+                    return {clean_name: sequence}
+        except:
+            pass
+
+    # Try search API
+    url = "https://rest.uniprot.org/uniprotkb/search"
+    params = {
+        "query": f'(protein_name:"{clean_name}" OR gene:"{clean_name}" OR id:"{clean_name}")',
+        "format": "json",
+        "size": "5",
+        "fields": "accession,id,protein_name,gene_names,sequence"
+    }
+
+    try:
+        response = requests.get(url, params=params, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+
+        results = data.get('results', [])
+        sequences = {}
+
+        for result in results[:1]:  # Just take the first match
+            sequence = result.get('sequence', {}).get('value', '')
+            if sequence:
+                sequences[clean_name] = sequence
+                break
+
+        return sequences
+
+    except Exception as e:
+        log.warning(f"Failed to fetch sequence for '{enzyme_name}': {e}")
+        return {}
+
+
 def match_pdb_to_variants(
     pdb_sequences: Dict[str, str],
     variants: List[Variant],
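For reference, a quick check of the new UniProt helper (assumes network access and the `requests` dependency; P14779 is the UniProt accession for cytochrome P450 BM3, CYP102A1):

```python
# Accession-shaped input hits the direct /uniprotkb/{accession} endpoint:
seqs = fetch_sequence_by_name("P14779")
for acc, seq in seqs.items():
    print(acc, len(seq), seq[:30] + "...")

# A free-text name is longer than 10 chars, so it falls through to the search API:
seqs = fetch_sequence_by_name("cytochrome P450 BM3")
print(list(seqs) or "no match")
```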
@@ -1964,53 +2311,70 @@ def _merge_lineage_and_sequences(
         for v in lineage
     ])
 
-    df_seq = pd.DataFrame([
-        {
-            "variant_id": s.variant_id,
-            "aa_seq": s.aa_seq,
-            "dna_seq": s.dna_seq,
-            "seq_confidence": s.confidence,
-            "truncated": s.truncated,
-        }
-        for s in seqs
-    ])
+    if seqs:
+        df_seq = pd.DataFrame([
+            {
+                "variant_id": s.variant_id,
+                "aa_seq": s.aa_seq,
+                "dna_seq": s.dna_seq,
+                "seq_confidence": s.confidence,
+                "truncated": s.truncated,
+                "seq_source": s.metadata.get("source", None) if s.metadata else None,
+            }
+            for s in seqs
+        ])
+    else:
+        # Create empty DataFrame with correct columns for merging
+        df_seq = pd.DataFrame(columns=[
+            "variant_id", "aa_seq", "dna_seq", "seq_confidence", "truncated", "seq_source"
+        ])
+
+    # Log sequence data info
+    if len(df_seq) > 0:
+        seq_with_aa = (~df_seq['aa_seq'].isna()).sum()
+        seq_with_dna = (~df_seq['dna_seq'].isna()).sum()
+        log.info(f"Sequence data: {len(df_seq)} entries, {seq_with_aa} with aa_seq, {seq_with_dna} with dna_seq")
 
-    # 2. Outer merge keeps every lineage entry and adds sequence cols when present
+    # 2. First try direct merge
     df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
-
-    # 2a. If we have unmatched sequences and a model, use Gemini to match them
-    log.info(f"Model available: {model is not None}, Sequences found: {len(df_seq)}")
-    if model and len(df_seq) > 0:
-        # Log initial state
-        log.info(f"Merge attempt: {len(df_lin)} lineage entries, {len(df_seq)} sequences")
-        log.info(f"Lineage IDs: {df_lin['variant_id'].tolist()[:5]}...")
-        log.info(f"Sequence IDs: {df_seq['variant_id'].tolist()[:5]}...")
-
-        # Find lineage entries without sequences
+
+    # Log merge results
+    merged_aa = (~df['aa_seq'].isna()).sum()
+    merged_dna = (~df['dna_seq'].isna()).sum()
+    log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
+
+    # 3. If we have unmatched sequences and a model, use Gemini to match
+    if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+        # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
         missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
-        unmatched_lineage = df[missing_seq]['variant_id'].tolist()
+        unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
 
-        # Find sequences that weren't matched
+        # Find unmatched sequences
         matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
         unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]
 
-        if len(unmatched_lineage) > 0 and len(unmatched_seqs) > 0:
-            log.info(f"Found {len(unmatched_lineage)} lineage entries without sequences and {len(unmatched_seqs)} unmatched sequences")
-            log.info(f"Using Gemini to match variants")
+        if unmatched_lineage_ids and len(unmatched_seqs) > 0:
+            log.info(f"Found {len(unmatched_lineage_ids)} lineage entries without sequences")
+            log.info(f"Found {len(unmatched_seqs)} unmatched sequences")
+            log.info("Using Gemini to match variants")
 
-            # Build prompt for Gemini to match variants
-            prompt = f"""Match enzyme variant IDs between two lists. The same variant may be labeled differently in different parts of the paper.
+            # Build prompt for Gemini
+            prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+
+Papers often use different naming conventions for the same variant:
+- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
+- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
+
+Match variants by analyzing generation numbers, prefixes, and patterns.
 
 Lineage variant IDs (need sequences):
-{json.dumps(unmatched_lineage)}
+{json.dumps(unmatched_lineage_ids)}
 
 Sequence variant IDs (have sequences):
 {json.dumps(unmatched_seqs['variant_id'].tolist())}
 
-These lists contain variant identifiers from the same paper but may use different naming conventions.
-Match each lineage variant ID to its corresponding sequence variant ID based on any patterns or relationships you can identify.
-
-Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+Return ONLY a JSON object mapping lineage IDs to sequence IDs.
+Format: {{"lineage_id": "sequence_id", ...}}
 """
 
             try:
@@ -2024,85 +2388,82 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
                         text = text[4:].strip()
 
                 matches = json.loads(text)
-                log.info(f"Gemini returned matches: {matches}")
+                log.info(f"Gemini returned {len(matches)} matches")
 
-                # Debug: Log what sequences we actually have
-                log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
+                # Create a mapping of sequence IDs to their data for efficient lookup
+                seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
 
-                # Apply the matches
+                # Apply matches and update variant IDs
                 for lineage_id, seq_id in matches.items():
-                    if lineage_id in unmatched_lineage:
-                        # Find the sequence data - be flexible with matching
-                        seq_data = None
-
-                        # First try exact match
-                        seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
-                        if len(seq_matches) > 0:
-                            seq_data = seq_matches.iloc[0]
-                        else:
-                            # Try to find by checking various matching strategies
-                            for idx, row in unmatched_seqs.iterrows():
-                                variant_id = row['variant_id']
-                                # Check if one is contained in the other
-                                if seq_id in variant_id or variant_id in seq_id:
-                                    seq_data = row
-                                    break
-                                # Check if they share the same core identifier (e.g., G0, G1, etc.)
-                                seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
-                                variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
-                                if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
-                                    seq_data = row
-                                    break
+                    if lineage_id in unmatched_lineage_ids and seq_id in seq_data_map:
+                        # Get the sequence data
+                        seq_data = seq_data_map[seq_id]
 
-                        if seq_data is not None:
-                            # Update the dataframe
-                            mask = df['variant_id'] == lineage_id
-                            if mask.any():
-                                # Log before update
-                                log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
-
-                                df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
-                                df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
-                                df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
-                                df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+                        # Update the row with the matched sequence ID and data
+                        mask = df['variant_id'] == lineage_id
+                        if mask.any():
+                            # Update variant_id to use the sequence variant name
+                            df.loc[mask, 'variant_id'] = seq_id
+
+                            # Update parent_id if it matches any of the mapped lineage IDs
+                            parent_mask = df['parent_id'] == lineage_id
+                            if parent_mask.any():
+                                df.loc[parent_mask, 'parent_id'] = seq_id
+
+                            # Update sequence data
+                            # For pandas Series from iterrows(), use proper indexing
+                            aa_seq_val = seq_data['aa_seq'] if 'aa_seq' in seq_data else None
+                            dna_seq_val = seq_data['dna_seq'] if 'dna_seq' in seq_data else None
+
+                            # Always update sequence fields to preserve DNA even when aa_seq is null
+                            df.loc[mask, 'aa_seq'] = aa_seq_val
+                            df.loc[mask, 'dna_seq'] = dna_seq_val
 
-                                # Log after update
-                                log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
-                                log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
-                            else:
-                                log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
-                        else:
-                            log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
+                            df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
+                            df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+
+                            # Log sequence info - check both aa_seq and dna_seq
+                            aa_len = len(seq_data['aa_seq']) if pd.notna(seq_data.get('aa_seq')) and seq_data.get('aa_seq') else 0
+                            dna_len = len(seq_data['dna_seq']) if pd.notna(seq_data.get('dna_seq')) and seq_data.get('dna_seq') else 0
+                            log.info(f"Matched {lineage_id} -> {seq_id} (aa_seq: {aa_len} chars, dna_seq: {dna_len} chars)")
+
+                # Update any remaining parent_id references to matched variants
+                for lineage_id, seq_id in matches.items():
+                    parent_mask = df['parent_id'] == lineage_id
+                    if parent_mask.any():
+                        df.loc[parent_mask, 'parent_id'] = seq_id
 
-                # Log the final state after all matches
-                matched_count = (~df['aa_seq'].isna()).sum()
-                log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
+                # Log final state - count variants with any sequence (aa or dna)
+                aa_count = (~df['aa_seq'].isna()).sum()
+                dna_count = (~df['dna_seq'].isna()).sum()
+                any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+                log.info(f"After Gemini matching: {any_seq_count}/{len(df)} variants have sequences (aa: {aa_count}, dna: {dna_count})")
 
             except Exception as e:
                 log.warning(f"Failed to match variants using Gemini: {e}")
 
-    # 3. If generation missing after user input, try inference
+    # 4. If generation missing, try inference
     if df["generation"].isna().any():
-        _infer_generations(lineage)  # mutates in place
-        df = df.drop(columns=["generation"]).merge(
-            pd.DataFrame(
-                {"variant_id": [v.variant_id for v in lineage], "generation": [v.generation for v in lineage]}
-            ),
-            on="variant_id",
-            how="left",
-        )
-
-    # 4. Attach DOI column for provenance
+        _infer_generations(lineage)
+        # Need to update the generations based on the potentially updated variant IDs
+        gen_map = {v.variant_id: v.generation for v in lineage}
+        # Also create a map for any variant IDs that were replaced
+        for idx, row in df.iterrows():
+            variant_id = row['variant_id']
+            if variant_id in gen_map:
+                df.at[idx, 'generation'] = gen_map[variant_id]
+
+    # 5. Attach DOI column
     df["doi"] = doi
 
-    # 5. Sort rows: primary by generation, then by variant_id
+    # 6. Sort by generation, then variant_id
     df = df.sort_values(["generation", "variant_id"], kind="mergesort")
 
-    # Debug: Log final merge state
-    seq_count = (~df['aa_seq'].isna()).sum()
-    log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
-    if seq_count > 0:
-        log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
+    # 7. Log final state
+    aa_count = (~df['aa_seq'].isna()).sum()
+    dna_count = (~df['dna_seq'].isna()).sum()
+    any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+    log.info(f"Final result: {len(df)} variants, {any_seq_count} with sequences (aa: {aa_count}, dna: {dna_count})")
 
     return df
 
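The merge itself remains a plain left join on `variant_id`; lineage rows with no exact match keep NaN in the sequence columns and become the candidates handed to Gemini. A toy illustration of that split:

```python
import pandas as pd

df_lin = pd.DataFrame({"variant_id": ["WT", "G1-A", "G2-B"], "generation": [0, 1, 2]})
df_seq = pd.DataFrame({"variant_id": ["WT", "G2-B"], "aa_seq": ["MSTN...", "MSTK..."]})

df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
unmatched = df[df["aa_seq"].isna()]["variant_id"].tolist()
print(unmatched)  # ['G1-A'] - only these IDs go into the Gemini matching prompt
```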
@@ -2114,28 +2475,27 @@ def merge_and_score(
     doi: Optional[str] = None,
     model=None,
 ) -> pd.DataFrame:
-    """User-facing helper imported by the pipeline orchestrator.
-
-    * Ensures lineage + sequence lists are non-empty.
-    * Performs a shallow validation.
-    * Returns a ready-to-export pandas DataFrame.
+    """Merge lineage and sequence data into a single DataFrame.
+
+    Args:
+        lineage: List of Variant objects from lineage extraction
+        seqs: List of SequenceBlock objects from sequence extraction
+        doi: DOI of the paper for provenance
+        model: Gemini model for smart matching (optional)
+
+    Returns:
+        DataFrame with merged lineage and sequence data
     """
-
     if not lineage:
         raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")
 
-    # If no sequences found, still build a DataFrame so caller can decide what to do.
     df = _merge_lineage_and_sequences(lineage, seqs, doi, model)
 
-    # Basic sanity: warn if many missing sequences
+    # Warn if many sequences are missing
    missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
     if missing_rate > 0.5:
         log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
 
-    # Debug log before returning
-    seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
-    log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
-
     return df
 
 # -------------------------------------------------------------------- end 8 ---
@@ -2245,7 +2605,7 @@ def run_pipeline(
         early_df = _lineage_to_dataframe(lineage)
         output_csv_path = Path(output_csv)
         # Save lineage-only data with specific filename
-        lineage_path = output_csv_path.parent / "enzyme_lineage_data.csv"
+        lineage_path = output_csv_path.parent / "enzyme_lineage_name.csv"
         early_df.to_csv(lineage_path, index=False)
         log.info(
             "Saved lineage-only CSV -> %s",
@@ -2309,6 +2669,36 @@ def run_pipeline(
                 log.warning(f"No sequences found in PDB {pdb_id}")
         else:
             log.warning("No PDB IDs found in paper")
+
+        # 4b. If still no sequences, try Gemini extraction as last resort
+        if not sequences or all(not s.aa_seq for s in sequences):
+            log.info("No sequences from PDB, attempting Gemini-based extraction...")
+
+            gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
+
+            if gemini_sequences:
+                # Convert to SequenceBlock objects
+                gemini_seq_blocks = []
+                for variant_id, seq in gemini_sequences.items():
+                    # Find the matching variant
+                    variant = next((v for v in lineage if v.variant_id == variant_id), None)
+                    if variant:
+                        seq_block = SequenceBlock(
+                            variant_id=variant.variant_id,
+                            aa_seq=seq,
+                            dna_seq=None,
+                            confidence=0.9,  # High confidence but slightly lower than PDB
+                            truncated=False,
+                            metadata={"source": "Gemini/UniProt"}
+                        )
+                        gemini_seq_blocks.append(seq_block)
+                        log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")
+
+                if gemini_seq_blocks:
+                    sequences = gemini_seq_blocks
+                    log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
+                else:
+                    log.warning("Failed to extract sequences via Gemini")
 
     # 5. Merge & score (Section 8) --------------------------------------------
     doi = extract_doi(manuscript)
@@ -2320,18 +2710,17 @@ def run_pipeline(
     # Save final data with sequences using same filename (overwrites lineage-only)
     sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
 
-    # Debug: Log what we're about to save
-    seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
-    log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
-    if seq_count > 0 and 'aa_seq' in df_final:
-        with_seq = df_final[~df_final['aa_seq'].isna()]
-        log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
-
+    # Save the final CSV
     df_final.to_csv(sequence_path, index=False)
+
+    # Log summary statistics
+    seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
     log.info(
-        "Overwrote with final results -> %s (%.1f kB)",
+        "Saved final CSV -> %s (%.1f kB, %d variants, %d with sequences)",
         sequence_path,
         sequence_path.stat().st_size / 1024,
+        len(df_final),
+        seq_count
     )
 
     log.info(