debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +623 -234
- debase/lineage_format.py +113 -11
- debase/reaction_info_extractor.py +21 -7
- debase/substrate_scope_extractor.py +516 -67
- debase/wrapper.py +301 -67
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/METADATA +1 -1
- debase-0.1.17.dist-info/RECORD +17 -0
- debase-0.1.11.dist-info/RECORD +0 -17
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/WHEEL +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/entry_points.txt +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/top_level.txt +0 -0
@@ -589,17 +589,28 @@ TEXT:
 {text}
 """.strip()
 
-
-Given
+_CAMPAIGN_BEST_LOCATION_PROMPT = """
+Given this specific campaign and the available data locations, select the BEST location to extract the complete lineage data for this campaign.
 
-
-{
+Campaign:
+- ID: {campaign_id}
+- Name: {campaign_name}
+- Description: {description}
+- Lineage identifiers: {identifiers}
 
-
-
+Available locations with context:
+{locations_with_context}
 
-
-
+Select the location that most likely contains the COMPLETE lineage data (all variants, mutations, and parent relationships) for THIS SPECIFIC campaign.
+
+Consider:
+1. Tables are usually more structured and complete than figures
+2. Look for locations that mention this campaign's specific identifiers or enzyme names
+3. Some locations may contain data for multiple campaigns - that's fine, we can filter later
+4. Prioritize completeness over visual clarity
+
+Return a JSON object with:
+{{"location": "selected location identifier", "confidence": 0-100, "reason": "explanation"}}
 """.strip()
 
 # ---- 6.1 Prompt templates -------------------------------------------------
@@ -756,9 +767,43 @@ def identify_evolution_locations(
     max_results: int = 5,
     debug_dir: str | Path | None = None,
     campaigns: Optional[List[Campaign]] = None,
+    pdf_paths: Optional[List[Path]] = None,
 ) -> List[dict]:
     """Ask Gemini where in the paper the lineage is probably described."""
-
+    # Extract table of contents from PDFs if available
+    toc_text = ""
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            doc = _open_doc(pdf_path)
+            try:
+                for page_num in range(min(5, len(doc))):
+                    page_text = doc[page_num].get_text()
+                    if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                        # Found TOC page
+                        lines = page_text.split('\n')
+                        toc_lines = []
+                        for line in lines:
+                            line = line.strip()
+                            # TOC entries typically have page numbers
+                            if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                    re.search(r'\s{2,}S?\d+\s*$', line) or
+                                    re.match(r'^\d+\.\s+\w+', line)):
+                                toc_lines.append(line)
+                        if toc_lines:
+                            pdf_name = pdf_path.name
+                            toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                        break
+            finally:
+                doc.close()
+
+        if toc_sections:
+            toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+    # Include TOC before the main text
+    combined_text = toc_text + text if toc_text else text
+    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
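The three regular expressions added here encode common table-of-contents line shapes: dotted leaders, wide space runs before an (optionally S-prefixed) page number, and numbered section headings. A standalone sketch with invented sample lines shows what each one catches:

    import re

    samples = [
        "Table S1: Lineage of variants......S4",   # dotted leader + SI page number
        "Supplementary Figure 2        S12",       # run of spaces + SI page number
        "3. Results and discussion",               # numbered section heading
        "This sentence should not match.",
    ]
    for line in samples:
        hit = (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
               re.search(r'\s{2,}S?\d+\s*$', line) or
               re.match(r'^\d+\.\s+\w+', line))
        print(bool(hit), line)
    # Prints True, True, True, False, in that order.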
@@ -770,69 +815,7 @@ def identify_evolution_locations(
     except Exception as exc:  # pragma: no cover
         log.warning("identify_evolution_locations(): %s", exc)
 
-    #
-    if campaigns and locs:
-        for loc in locs:
-            # Extract more context around the location
-            location_str = loc.get('location', '')
-            context = loc.get('reason', '')
-
-            # Ask Gemini to map this location to a campaign
-            if campaigns:
-                try:
-                    campaigns_json = json.dumps([{
-                        "campaign_id": c.campaign_id,
-                        "campaign_name": c.campaign_name,
-                        "lineage_hint": c.notes
-                    } for c in campaigns])
-
-                    mapping_prompt = _CAMPAIGN_MAPPING_PROMPT.format(
-                        campaigns=campaigns_json,
-                        location=location_str,
-                        context=context
-                    )
-
-                    # Save mapping prompt to debug if provided
-                    if debug_dir:
-                        debug_path = Path(debug_dir)
-                        debug_path.mkdir(parents=True, exist_ok=True)
-                        mapping_file = debug_path / f"campaign_mapping_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                        _dump(f"=== CAMPAIGN MAPPING PROMPT ===\nLocation: {location_str}\n{'='*80}\n\n{mapping_prompt}", mapping_file)
-
-                    response = model.generate_content(mapping_prompt)
-                    response_text = _extract_text(response).strip()
-
-                    # Extract just the campaign_id from the response
-                    # Look for the campaign_id pattern in the response
-                    campaign_id = None
-                    for campaign in campaigns:
-                        if hasattr(campaign, 'campaign_id') and campaign.campaign_id in response_text:
-                            campaign_id = campaign.campaign_id
-                            break
-
-                    # If not found, try to extract the last line or quoted string
-                    if not campaign_id:
-                        # Try to find quoted string
-                        quoted_match = re.search(r'"([^"]+)"', response_text)
-                        if quoted_match:
-                            campaign_id = quoted_match.group(1)
-                        else:
-                            # Take the last non-empty line
-                            lines = [line.strip() for line in response_text.split('\n') if line.strip()]
-                            if lines:
-                                campaign_id = lines[-1].strip('"')
-
-                    # Save mapping response to debug if provided
-                    if debug_dir:
-                        response_file = debug_path / f"campaign_mapping_response_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                        _dump(f"=== CAMPAIGN MAPPING RESPONSE ===\nLocation: {location_str}\nFull response:\n{response_text}\nExtracted campaign_id: {campaign_id}\n{'='*80}", response_file)
-
-                    # Add campaign_id to location
-                    if campaign_id:
-                        loc['campaign_id'] = campaign_id
-                        log.info(f"Mapped {location_str} to campaign: {campaign_id}")
-                except Exception as exc:
-                    log.warning(f"Failed to map location to campaign: {exc}")
+    # No longer mapping locations to campaigns here - we'll ask for best location per campaign instead
 
     return locs if isinstance(locs, list) else []
 
@@ -840,7 +823,14 @@ def identify_evolution_locations(
 
 def _parse_variants(data: Dict[str, Any], campaign_id: Optional[str] = None) -> List[Variant]:
     """Convert raw JSON to a list[Variant] with basic validation."""
-
+    if isinstance(data, list):
+        # Direct array of variants
+        variants_json = data
+    elif isinstance(data, dict):
+        # Object with "variants" key
+        variants_json = data.get("variants", [])
+    else:
+        variants_json = []
     parsed: List[Variant] = []
     for item in variants_json:
         try:
@@ -878,6 +868,7 @@ def extract_complete_lineage(
     debug_dir: str | Path | None = None,
     campaign_id: Optional[str] = None,
     campaign_info: Optional[Campaign] = None,
+    pdf_paths: Optional[List[Path]] = None,
 ) -> List[Variant]:
     """Prompt Gemini for the full lineage and return a list[Variant]."""
     # Build campaign context
@@ -899,10 +890,44 @@ IMPORTANT:
 4. Include parent variants only if they are direct ancestors in this campaign's lineage.
 """
 
+    # Extract table of contents from PDFs if available
+    toc_text = ""
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            doc = _open_doc(pdf_path)
+            try:
+                for page_num in range(min(5, len(doc))):
+                    page_text = doc[page_num].get_text()
+                    if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                        # Found TOC page
+                        lines = page_text.split('\n')
+                        toc_lines = []
+                        for line in lines:
+                            line = line.strip()
+                            # TOC entries typically have page numbers
+                            if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                    re.search(r'\s{2,}S?\d+\s*$', line) or
+                                    re.match(r'^\d+\.\s+\w+', line)):
+                                toc_lines.append(line)
+                        if toc_lines:
+                            pdf_name = pdf_path.name
+                            toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                        break
+            finally:
+                doc.close()
+
+        if toc_sections:
+            toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+    # Include TOC in the prompt text
+    combined_text = toc_text + text if toc_text else text
+
     prompt = _LINEAGE_EXTRACT_PROMPT.format(
         campaign_context=campaign_context,
         schema=_LINEAGE_SCHEMA_HINT,
-        text=
+        text=combined_text[:MAX_CHARS],
     )
     raw = generate_json_with_retry(
         model,
@@ -1044,15 +1069,27 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
     # 1. Line contains dots (...) followed by page number
     # 2. Line ends with just a page number
     # 3. Line has "Table S12:" or similar followed by title and page
-
+    # 4. Pattern appears at start of line followed by description and page number
+    if ('...' in line or
+            re.search(r'\.\s*\d+\s*$', line) or
+            re.search(r':\s*[^:]+\s+\d+\s*$', line) or
+            (line.strip().startswith(pattern) and re.search(r'\s+\d+\s*$', line))):
         return True
 
     # Check if this is in a contents/TOC section
-    # Look backwards up to
-    context_start = max(0, position -
+    # Look backwards up to 1000 chars for "Contents" or "Table of Contents"
+    context_start = max(0, position - 1000)
     context = text[context_start:position].lower()
     if 'contents' in context or 'table of contents' in context:
         return True
+
+    # Check if we're in the first ~5000 chars of the document (likely TOC area)
+    # This helps catch TOC entries that don't have obvious formatting
+    if position < 5000:
+        # Be more strict for early document positions
+        # Check if line looks like a TOC entry (has page number at end)
+        if re.search(r'\s+\d+\s*$', line):
+            return True
 
     return False
 
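The added line-level conditions can be exercised on their own; a hedged sketch of just that test, with invented sample lines:

    import re

    def looks_like_toc_line(line: str, pattern: str) -> bool:
        # Mirrors the line-level checks added to _is_toc_entry (sketch only).
        return bool('...' in line or
                    re.search(r'\.\s*\d+\s*$', line) or
                    re.search(r':\s*[^:]+\s+\d+\s*$', line) or
                    (line.strip().startswith(pattern) and re.search(r'\s+\d+\s*$', line)))

    print(looks_like_toc_line("Table S12: Evolution lineage   14", "Table S12"))        # True
    print(looks_like_toc_line("Table S12 lists every variant we tested.", "Table S12")) # False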
@@ -1185,13 +1222,39 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
                 log.warning("No sequences found in any of %d occurrences of '%s'",
                             len(all_positions), location_str)
             else:
-                # For lineage extraction,
-
-
-
-
-
-
+                # For lineage extraction, find ALL occurrences of the pattern
+                all_positions = []
+                search_pos = 0
+
+                # Find all occurrences of this pattern (not just the first)
+                while search_pos < len(text_lower):
+                    temp_pos = text_lower.find(used_pattern.lower(), search_pos)
+                    if temp_pos == -1:
+                        break
+
+                    # Check if this is a TOC entry
+                    if _is_toc_entry(text, temp_pos, used_pattern):
+                        log.debug("Skipping TOC entry for pattern '%s' at position %d", used_pattern, temp_pos)
+                        search_pos = temp_pos + len(used_pattern)
+                        continue
+
+                    all_positions.append(temp_pos)
+                    search_pos = temp_pos + len(used_pattern)
+
+                    if len(all_positions) >= 10:  # Limit to 10 occurrences
+                        break
+
+                log.info("Found %d non-TOC occurrences of pattern '%s' for location '%s'",
+                         len(all_positions), used_pattern, location_str)
+
+                # Extract context around each occurrence
+                for idx, pos in enumerate(all_positions):
+                    start = max(0, pos - context_chars)
+                    end = min(len(text), pos + len(used_pattern) + context_chars)
+                    section_text = text[start:end]
+                    extracted_sections.append(section_text)
+                    log.info("Occurrence %d/%d: Found '%s' at position %d, extracted %d chars",
+                             idx + 1, len(all_positions), location_str, pos, len(section_text))
         else:
             log.warning("Location '%s' not found in text (tried %d patterns)", location_str, len(page_patterns))
 
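The scan above is a standard find-all loop with an in-loop filter; stripped of the TOC check and logging, the pattern reduces to this sketch (not package code):

    def find_all_occurrences(haystack: str, needle: str, limit: int = 10) -> list:
        """Return up to `limit` start offsets of `needle`, scanning left to right."""
        haystack, needle = haystack.lower(), needle.lower()
        positions, search_pos = [], 0
        while search_pos < len(haystack):
            pos = haystack.find(needle, search_pos)
            if pos == -1:
                break
            positions.append(pos)           # the real loop filters TOC hits here first
            search_pos = pos + len(needle)  # advance past this hit
            if len(positions) >= limit:
                break
        return positions

    print(find_all_occurrences("Table S3 ... see Table S3 again", "table s3"))  # [0, 17]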
@@ -1227,43 +1290,142 @@ def get_lineage(
         log.info(f"Identified {len(campaigns)} distinct campaigns")
         for camp in campaigns:
             log.info(f"  - {camp.campaign_name}: {camp.description}")
+    else:
+        log.warning("No campaigns identified, creating default campaign for enzyme characterization")
+        # Create a default campaign when none are found
+        default_campaign = Campaign(
+            campaign_id="default_characterization",
+            campaign_name="Enzyme Characterization Study",
+            description="Default campaign for papers that characterize existing enzyme variants without describing new directed evolution",
+            model_substrate="Unknown",
+            model_product="Unknown",
+            data_locations=["Full manuscript text"]
+        )
+        campaigns = [default_campaign]
+        log.info(f"Created default campaign: {default_campaign.campaign_name}")
 
     # Use captions for identification - they're concise and focused
-    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=
+    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
 
     all_variants = []
 
-    if
+    if campaigns:
+        # If we have campaigns but no specific locations, use general extraction
+        if not locations:
+            log.info("No specific lineage locations found, extracting from full text with campaign context")
+            # Extract lineage for each campaign using full text
+            for campaign in campaigns:
+                log.info(f"Processing campaign: {campaign.campaign_id}")
+                campaign_variants = extract_campaign_lineage(
+                    full_text, model, campaign_id=campaign.campaign_id,
+                    debug_dir=debug_dir, pdf_paths=pdf_paths,
+                    campaign_info=campaign
+                )
+                all_variants.extend(campaign_variants)
+            return all_variants, campaigns
+        # Original logic for when we have both locations and campaigns
         # Log location information
         location_summary = []
         for loc in locations[:5]:
             if isinstance(loc, dict):
-
-                location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)}{campaign_info})")
+                location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
             else:
                 location_summary.append(str(loc))
        log.info("Gemini identified %d potential lineage locations: %s",
                 len(locations), ", ".join(location_summary))
 
-        #
-
+        # Extract context around each location for better decision making
+        locations_with_context = []
         for loc in locations:
-
-
-
-
+            location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+            # Extract 1000 chars of context around the location
+            context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
+            locations_with_context.append({
+                'location': loc,
+                'context': context_text[:1000]  # First 1000 chars of extracted context
+            })
 
-        #
-        for
-            log.info(f"Processing campaign: {campaign_id}")
+        # For each campaign, ask Gemini to select the best location
+        for campaign in campaigns:
+            log.info(f"Processing campaign: {campaign.campaign_id}")
 
-            #
-
-
-
+            # Build locations context string
+            locations_str = ""
+            for i, loc_ctx in enumerate(locations_with_context):
+                loc = loc_ctx['location']
+                context = loc_ctx['context']
+                location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
+                confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
+                reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+
+                locations_str += f"\n{i+1}. {location_str} (Type: {location_type}, Confidence: {confidence})\n"
+                locations_str += f"   Reason: {reason}\n"
+                locations_str += f"   Context (first 500 chars):\n   {context[:500]}...\n"
 
-            #
-
+            # Ask Gemini to select best location for this campaign
+            best_location_prompt = _CAMPAIGN_BEST_LOCATION_PROMPT.format(
+                campaign_id=campaign.campaign_id,
+                campaign_name=campaign.campaign_name,
+                description=campaign.description,
+                identifiers=campaign.notes or "No specific identifiers provided",
+                locations_with_context=locations_str
+            )
+
+            primary_location = None
+            try:
+                # Save prompt to debug if provided
+                if debug_dir:
+                    debug_path = Path(debug_dir)
+                    debug_path.mkdir(parents=True, exist_ok=True)
+                    prompt_file = debug_path / f"best_location_{campaign.campaign_id}_{int(time.time())}.txt"
+                    _dump(f"=== BEST LOCATION PROMPT ===\nCampaign: {campaign.campaign_id}\n{'='*80}\n\n{best_location_prompt}", prompt_file)
+
+                response = model.generate_content(best_location_prompt)
+                response_text = _extract_text(response).strip()
+
+                # Parse JSON response
+                if response_text.startswith("```"):
+                    response_text = response_text.split("```")[1].strip()
+                    if response_text.startswith("json"):
+                        response_text = response_text[4:].strip()
+
+                best_loc_data = json.loads(response_text)
+                selected_location = best_loc_data.get('location', '')
+                confidence = best_loc_data.get('confidence', 0)
+                reason = best_loc_data.get('reason', '')
+
+                # Save response to debug if provided
+                if debug_dir:
+                    response_file = debug_path / f"best_location_response_{campaign.campaign_id}_{int(time.time())}.txt"
+                    _dump(f"=== BEST LOCATION RESPONSE ===\nCampaign: {campaign.campaign_id}\nSelected: {selected_location}\nConfidence: {confidence}\nReason: {reason}\n{'='*80}", response_file)
+
+                log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")
+
+                # Find the actual location object
+                for loc in locations:
+                    loc_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                    if loc_str == selected_location:
+                        primary_location = loc
+                        break
+
+                if not primary_location:
+                    log.warning(f"Could not find selected location '{selected_location}' in locations list")
+                    # Fall back to highest confidence location
+                    primary_location = sorted(locations,
+                                              key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                              reverse=True)[0] if locations else None
+
+            except Exception as e:
+                log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
+                # Fall back to highest confidence location
+                primary_location = sorted(locations,
+                                          key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                          reverse=True)[0] if locations else None
+
+            if not primary_location:
+                log.warning(f"No location found for campaign {campaign.campaign_id}")
+                continue
 
             # Track if we successfully extracted from figure
             extracted_from_figure = False
@@ -1297,12 +1459,11 @@ def get_lineage(
                     log.info("Saved lineage figure to: %s", figure_file)
 
                 # Extract lineage from the figure
-                campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
                 variants = extract_lineage_from_figure(
                     figure_bytes, model,
                     debug_dir=debug_dir,
-                    campaign_id=campaign_id,
-                    campaign_info=
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign
                 )
                 if variants:
                     all_variants.extend(variants)
@@ -1327,22 +1488,22 @@ def get_lineage(
             log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                      len(full_text), len(focused_text),
                      primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
-                     campaign_id)
+                     campaign.campaign_id)
 
-            #
-            campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
+            # Extract lineage for this campaign
             campaign_variants = extract_complete_lineage(
                 focused_text, model,
                 debug_dir=debug_dir,
-                campaign_id=campaign_id,
-                campaign_info=
+                campaign_id=campaign.campaign_id,
+                campaign_info=campaign,
+                pdf_paths=pdf_paths
             )
             all_variants.extend(campaign_variants)
 
        return all_variants, campaigns
    else:
        log.info("Gemini did not identify specific lineage locations")
-        variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir)
+        variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir, pdf_paths=pdf_paths)
        return variants, campaigns
 
 # === 7. SEQUENCE EXTRACTION === ----------------------------------------------
@@ -1398,18 +1559,31 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
         return []
 
 # --- 7.2 Page-based extraction helper ---------------------------------------
-def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int]) -> str:
-    """Extract text from a specific page number in the PDFs.
+def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
+    """Extract text from a specific page number in the PDFs.
+
+    Args:
+        pdf_paths: List of PDF paths
+        page_num: Page number (can be "S1", "S2", etc for SI pages)
+        skip_si_toc: If True, skip first 2 pages of SI to avoid TOC
+    """
     # Convert page number to int and handle S-prefix
     page_str = str(page_num).strip().upper()
     if page_str.startswith('S'):
         # Supplementary page - look in the SI PDF (second PDF)
         actual_page = int(page_str[1:]) - 1  # 0-indexed
         pdf_index = 1 if len(pdf_paths) > 1 else 0
+        is_si_page = True
     else:
         # Regular page - look in the main PDF
         actual_page = int(page_str) - 1  # 0-indexed
         pdf_index = 0
+        is_si_page = False
+
+    # Skip first 2 pages of SI to avoid table of contents
+    if skip_si_toc and is_si_page and actual_page < 2:
+        log.info("Skipping SI page %s (first 2 pages are typically TOC)", page_str)
+        return ""
 
     if pdf_index >= len(pdf_paths):
         log.warning("Page %s requested but not enough PDFs provided", page_str)
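The S-prefix convention resolves a cited page to a (PDF index, 0-based page) pair; a compact sketch of just that mapping, including the new skip rule, assuming the SI file is the second PDF when two are supplied:

    def resolve_page(page_num, n_pdfs: int, skip_si_toc: bool = True):
        """'S3' -> (SI pdf, page 2); '7' -> (main pdf, page 6); None if skipped."""
        page_str = str(page_num).strip().upper()
        if page_str.startswith('S'):
            actual_page = int(page_str[1:]) - 1          # 0-indexed
            pdf_index = 1 if n_pdfs > 1 else 0
            if skip_si_toc and actual_page < 2:          # S1/S2 are usually the SI TOC
                return None
        else:
            actual_page = int(page_str) - 1
            pdf_index = 0
        return pdf_index, actual_page

    print(resolve_page("S3", n_pdfs=2))  # (1, 2)
    print(resolve_page("S1", n_pdfs=2))  # None (skipped as likely TOC)
    print(resolve_page(7, n_pdfs=2))     # (0, 6)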
@@ -1543,8 +1717,14 @@ IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
 - Only extract dna_seq if NO amino acid sequence is available for that variant
 - This reduces redundancy since protein sequences are usually more relevant
 
+CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
+- Papers often use different naming conventions in different sections
+- DO NOT normalize or simplify variant IDs
+- Extract the variant_id exactly as written where the sequence appears
+- Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
+
 For each variant return:
-  * variant_id - the label
+  * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
   * aa_seq - amino-acid sequence (uppercase), or null
   * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
 
@@ -1584,7 +1764,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
     return _parse_sequences(data)
 
 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
-_VALID_AA = set("ACDEFGHIKLMNPQRSTVWY")
+_VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codon
 _VALID_DNA = set("ACGT")
 
 def _contains_sequence(text: str, min_length: int = 50) -> bool:
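Admitting `*` means a translated ORF that ends at a stop codon no longer fails validation. A one-screen sketch of the kind of membership check `_VALID_AA` feeds into (sketch only, not the package's validator):

    _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # '*' marks a stop codon

    def is_protein_seq(seq: str) -> bool:
        seq = seq.strip().upper()
        return bool(seq) and all(c in _VALID_AA for c in seq)

    print(is_protein_seq("MKTAYIAKQR*"))  # True (False before this change)
    print(is_protein_seq("MKTAB"))        # False ('B' is not a canonical residue)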
@@ -1793,6 +1973,173 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
         log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
         return {}
 
+def extract_enzyme_info_with_gemini(
+    text: str,
+    variants: List[Variant],
+    model,
+) -> Dict[str, str]:
+    """Use Gemini to extract enzyme names or sequences when PDB IDs are not available.
+
+    Returns:
+        Dict mapping variant IDs to sequences
+    """
+    # Build variant info for context
+    variant_info = []
+    for v in variants[:10]:  # Limit to first 10 variants for context
+        info = {
+            "id": v.variant_id,
+            "mutations": v.mutations[:5] if v.mutations else [],  # Limit mutations shown
+            "parent": v.parent_id,
+            "generation": v.generation
+        }
+        variant_info.append(info)
+
+    prompt = f"""You are analyzing a scientific paper about enzyme engineering. No PDB IDs were found in the paper, and I need to obtain protein sequences for the enzyme variants described.
+
+Here are the variants found in the paper:
+{json.dumps(variant_info, indent=2)}
+
+Please analyze the paper text and:
+1. Identify the common name of the enzyme being studied (e.g., "P450 BM3", "cytochrome P450 BM3", "CYP102A1")
+2. If possible, extract or find the wild-type sequence
+3. Provide any UniProt IDs or accession numbers mentioned
+
+Paper text (first 5000 characters):
+{text[:5000]}
+
+Return your response as a JSON object with this structure:
+{{
+    "enzyme_name": "common name of the enzyme",
+    "systematic_name": "systematic name if applicable (e.g., CYP102A1)",
+    "uniprot_id": "UniProt ID if found",
+    "wild_type_sequence": "sequence if found in paper or if you know it",
+    "additional_names": ["list", "of", "alternative", "names"]
+}}
+
+If you cannot determine certain fields, set them to null.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text_response = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text_response.startswith("```"):
+            text_response = text_response.split("```")[1].strip()
+            if text_response.startswith("json"):
+                text_response = text_response[4:].strip()
+            text_response = text_response.split("```")[0].strip()
+
+        enzyme_info = json.loads(text_response)
+        log.info(f"Gemini extracted enzyme info: {enzyme_info.get('enzyme_name', 'Unknown')}")
+
+        sequences = {}
+
+        # If Gemini provided a sequence directly, use it
+        if enzyme_info.get("wild_type_sequence"):
+            # Clean the sequence
+            seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
+            # Validate it looks like a protein sequence
+            if seq and all(c in "ACDEFGHIKLMNPQRSTVWY" for c in seq) and len(seq) > 50:
+                # Map to the first variant or wild-type
+                wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
+                if wt_variant:
+                    sequences[wt_variant.variant_id] = seq
+                else:
+                    sequences[variants[0].variant_id] = seq
+                log.info(f"Using sequence from Gemini: {len(seq)} residues")
+
+        # If no sequence but we have names, try to fetch from UniProt
+        if not sequences:
+            names_to_try = []
+            if enzyme_info.get("enzyme_name"):
+                names_to_try.append(enzyme_info["enzyme_name"])
+            if enzyme_info.get("systematic_name"):
+                names_to_try.append(enzyme_info["systematic_name"])
+            if enzyme_info.get("uniprot_id"):
+                names_to_try.append(enzyme_info["uniprot_id"])
+            if enzyme_info.get("additional_names"):
+                names_to_try.extend(enzyme_info["additional_names"])
+
+            # Try each name with UniProt
+            for name in names_to_try:
+                if name:
+                    uniprot_seqs = fetch_sequence_by_name(name)
+                    if uniprot_seqs:
+                        # Map the first sequence to appropriate variant
+                        seq = list(uniprot_seqs.values())[0]
+                        wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
+                        if wt_variant:
+                            sequences[wt_variant.variant_id] = seq
+                        else:
+                            sequences[variants[0].variant_id] = seq
+                        log.info(f"Found sequence via UniProt search for '{name}': {len(seq)} residues")
+                        break
+
+        return sequences
+
+    except Exception as e:
+        log.warning(f"Failed to extract enzyme info with Gemini: {e}")
+        return {}
+
+
+def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
+    """Fetch protein sequences from UniProt by enzyme name or ID.
+
+    Args:
+        enzyme_name: Name, ID, or accession of the enzyme
+
+    Returns:
+        Dict mapping identifiers to sequences
+    """
+    import requests
+
+    clean_name = enzyme_name.strip()
+
+    # First try as accession number
+    if len(clean_name) <= 10 and (clean_name[0].isalpha() and clean_name[1:].replace("_", "").isalnum()):
+        # Looks like a UniProt accession
+        url = f"https://rest.uniprot.org/uniprotkb/{clean_name}"
+        try:
+            response = requests.get(url, timeout=10)
+            if response.status_code == 200:
+                data = response.json()
+                sequence = data.get('sequence', {}).get('value', '')
+                if sequence:
+                    return {clean_name: sequence}
+        except:
+            pass
+
+    # Try search API
+    url = "https://rest.uniprot.org/uniprotkb/search"
+    params = {
+        "query": f'(protein_name:"{clean_name}" OR gene:"{clean_name}" OR id:"{clean_name}")',
+        "format": "json",
+        "size": "5",
+        "fields": "accession,id,protein_name,gene_names,sequence"
+    }
+
+    try:
+        response = requests.get(url, params=params, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+
+        results = data.get('results', [])
+        sequences = {}
+
+        for result in results[:1]:  # Just take the first match
+            sequence = result.get('sequence', {}).get('value', '')
+            if sequence:
+                sequences[clean_name] = sequence
+                break
+
+        return sequences
+
+    except Exception as e:
+        log.warning(f"Failed to fetch sequence for '{enzyme_name}': {e}")
+        return {}
+
+
 def match_pdb_to_variants(
     pdb_sequences: Dict[str, str],
     variants: List[Variant],
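For orientation, the new fallback chain is: PDB lookup, then Gemini-extracted names, then the UniProt REST API. Usage of the new helper might look like this (requires network access; P14779 is the UniProt accession commonly cited for cytochrome P450 BM3 / CYP102A1, used here purely for illustration):

    seqs = fetch_sequence_by_name("P14779")               # direct accession lookup
    for ident, seq in seqs.items():
        print(ident, len(seq), seq[:30] + "...")

    seqs = fetch_sequence_by_name("cytochrome P450 BM3")  # falls through to the search endpoint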
@@ -1964,53 +2311,70 @@ def _merge_lineage_and_sequences(
         for v in lineage
     ])
 
-
-
-
-
-
-
-
-
-
-
+    if seqs:
+        df_seq = pd.DataFrame([
+            {
+                "variant_id": s.variant_id,
+                "aa_seq": s.aa_seq,
+                "dna_seq": s.dna_seq,
+                "seq_confidence": s.confidence,
+                "truncated": s.truncated,
+                "seq_source": s.metadata.get("source", None) if s.metadata else None,
+            }
+            for s in seqs
+        ])
+    else:
+        # Create empty DataFrame with correct columns for merging
+        df_seq = pd.DataFrame(columns=[
+            "variant_id", "aa_seq", "dna_seq", "seq_confidence", "truncated", "seq_source"
+        ])
+
+    # Log sequence data info
+    if len(df_seq) > 0:
+        seq_with_aa = (~df_seq['aa_seq'].isna()).sum()
+        seq_with_dna = (~df_seq['dna_seq'].isna()).sum()
+        log.info(f"Sequence data: {len(df_seq)} entries, {seq_with_aa} with aa_seq, {seq_with_dna} with dna_seq")
 
-    # 2.
+    # 2. First try direct merge
     df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
-
-    #
-
-
-
-
-
-
-    # Find lineage entries without sequences
+
+    # Log merge results
+    merged_aa = (~df['aa_seq'].isna()).sum()
+    merged_dna = (~df['dna_seq'].isna()).sum()
+    log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
+
+    # 3. If we have unmatched sequences and a model, use Gemini to match
+    if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+        # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
         missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
-
+        unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
 
-        # Find sequences
+        # Find unmatched sequences
         matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
         unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]
 
-        if
-            log.info(f"Found {len(
-            log.info(f"
+        if unmatched_lineage_ids and len(unmatched_seqs) > 0:
+            log.info(f"Found {len(unmatched_lineage_ids)} lineage entries without sequences")
+            log.info(f"Found {len(unmatched_seqs)} unmatched sequences")
+            log.info("Using Gemini to match variants")
 
-            # Build prompt for Gemini
-            prompt = f"""Match enzyme variant IDs between two lists
+            # Build prompt for Gemini
+            prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+
+Papers often use different naming conventions for the same variant:
+- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
+- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
+
+Match variants by analyzing generation numbers, prefixes, and patterns.
 
 Lineage variant IDs (need sequences):
-{json.dumps(
+{json.dumps(unmatched_lineage_ids)}
 
 Sequence variant IDs (have sequences):
 {json.dumps(unmatched_seqs['variant_id'].tolist())}
 
-
-
-
-Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+Return ONLY a JSON object mapping lineage IDs to sequence IDs.
+Format: {{"lineage_id": "sequence_id", ...}}
 """
 
             try:
@@ -2024,85 +2388,82 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
                 text = text[4:].strip()
 
             matches = json.loads(text)
-            log.info(f"Gemini returned matches
+            log.info(f"Gemini returned {len(matches)} matches")
 
-            #
-
+            # Create a mapping of sequence IDs to their data for efficient lookup
+            seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
 
-            # Apply
+            # Apply matches and update variant IDs
             for lineage_id, seq_id in matches.items():
-                if lineage_id in
-                    #
-                    seq_data =
-
-                    # First try exact match
-                    seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
-                    if len(seq_matches) > 0:
-                        seq_data = seq_matches.iloc[0]
-                    else:
-                        # Try to find by checking various matching strategies
-                        for idx, row in unmatched_seqs.iterrows():
-                            variant_id = row['variant_id']
-                            # Check if one is contained in the other
-                            if seq_id in variant_id or variant_id in seq_id:
-                                seq_data = row
-                                break
-                            # Check if they share the same core identifier (e.g., G0, G1, etc.)
-                            seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
-                            variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
-                            if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
-                                seq_data = row
-                                break
+                if lineage_id in unmatched_lineage_ids and seq_id in seq_data_map:
+                    # Get the sequence data
+                    seq_data = seq_data_map[seq_id]
 
-
-
-
-
-
-
-
-
-
-                    df.loc[
-
+                    # Update the row with the matched sequence ID and data
+                    mask = df['variant_id'] == lineage_id
+                    if mask.any():
+                        # Update variant_id to use the sequence variant name
+                        df.loc[mask, 'variant_id'] = seq_id
+
+                        # Update parent_id if it matches any of the mapped lineage IDs
+                        parent_mask = df['parent_id'] == lineage_id
+                        if parent_mask.any():
+                            df.loc[parent_mask, 'parent_id'] = seq_id
+
+                        # Update sequence data
+                        # For pandas Series from iterrows(), use proper indexing
+                        aa_seq_val = seq_data['aa_seq'] if 'aa_seq' in seq_data else None
+                        dna_seq_val = seq_data['dna_seq'] if 'dna_seq' in seq_data else None
+
+                        # Always update sequence fields to preserve DNA even when aa_seq is null
+                        df.loc[mask, 'aa_seq'] = aa_seq_val
+                        df.loc[mask, 'dna_seq'] = dna_seq_val
 
-
-
-
-
-
-
-                    log.
+                        df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
+                        df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+
+                        # Log sequence info - check both aa_seq and dna_seq
+                        aa_len = len(seq_data['aa_seq']) if pd.notna(seq_data.get('aa_seq')) and seq_data.get('aa_seq') else 0
+                        dna_len = len(seq_data['dna_seq']) if pd.notna(seq_data.get('dna_seq')) and seq_data.get('dna_seq') else 0
+                        log.info(f"Matched {lineage_id} -> {seq_id} (aa_seq: {aa_len} chars, dna_seq: {dna_len} chars)")
+
+            # Update any remaining parent_id references to matched variants
+            for lineage_id, seq_id in matches.items():
+                parent_mask = df['parent_id'] == lineage_id
+                if parent_mask.any():
+                    df.loc[parent_mask, 'parent_id'] = seq_id
 
-            # Log
-
-
+            # Log final state - count variants with any sequence (aa or dna)
+            aa_count = (~df['aa_seq'].isna()).sum()
+            dna_count = (~df['dna_seq'].isna()).sum()
+            any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+            log.info(f"After Gemini matching: {any_seq_count}/{len(df)} variants have sequences (aa: {aa_count}, dna: {dna_count})")
 
         except Exception as e:
             log.warning(f"Failed to match variants using Gemini: {e}")
 
-    #
+    # 4. If generation missing, try inference
     if df["generation"].isna().any():
-        _infer_generations(lineage)
-
-
-
-
-
-
-
-        #
+        _infer_generations(lineage)
+        # Need to update the generations based on the potentially updated variant IDs
+        gen_map = {v.variant_id: v.generation for v in lineage}
+        # Also create a map for any variant IDs that were replaced
+        for idx, row in df.iterrows():
+            variant_id = row['variant_id']
+            if variant_id in gen_map:
+                df.at[idx, 'generation'] = gen_map[variant_id]
+
+    # 5. Attach DOI column
     df["doi"] = doi
 
-    #
+    # 6. Sort by generation, then variant_id
     df = df.sort_values(["generation", "variant_id"], kind="mergesort")
 
-    #
-
-
-
-
+    # 7. Log final state
+    aa_count = (~df['aa_seq'].isna()).sum()
+    dna_count = (~df['dna_seq'].isna()).sum()
+    any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+    log.info(f"Final result: {len(df)} variants, {any_seq_count} with sequences (aa: {aa_count}, dna: {dna_count})")
 
     return df
 
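The rewritten matcher replaces the old substring / G-number fuzzy scan with a single dict lookup, then renames both `variant_id` and any `parent_id` references so the lineage stays internally consistent. The rename-and-backfill step in isolation (toy data, invented IDs):

    import pandas as pd

    df = pd.DataFrame({
        "variant_id": ["5308", "5295"],
        "parent_id":  [None, "5308"],
        "aa_seq":     [None, None],
    })
    matches = {"5308": "ApPgb-G0", "5295": "ApPgb-G1"}        # lineage id -> sequence id
    seq_by_id = {"ApPgb-G0": "MKT...", "ApPgb-G1": "MKS..."}  # hypothetical sequences

    for lineage_id, seq_id in matches.items():
        mask = df["variant_id"] == lineage_id
        df.loc[mask, "variant_id"] = seq_id                          # adopt the sequence-section name
        df.loc[df["parent_id"] == lineage_id, "parent_id"] = seq_id  # keep parent links consistent
        df.loc[mask, "aa_seq"] = seq_by_id[seq_id]

    # df now names both rows by their sequence-section IDs, with parent_id updated to match.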
@@ -2114,28 +2475,27 @@ def merge_and_score(
     doi: Optional[str] = None,
     model=None,
 ) -> pd.DataFrame:
-    """
-
-
-
-
+    """Merge lineage and sequence data into a single DataFrame.
+
+    Args:
+        lineage: List of Variant objects from lineage extraction
+        seqs: List of SequenceBlock objects from sequence extraction
+        doi: DOI of the paper for provenance
+        model: Gemini model for smart matching (optional)
+
+    Returns:
+        DataFrame with merged lineage and sequence data
     """
-
     if not lineage:
         raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")
 
-    # If no sequences found, still build a DataFrame so caller can decide what to do.
     df = _merge_lineage_and_sequences(lineage, seqs, doi, model)
 
-    #
+    # Warn if many sequences are missing
     missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
     if missing_rate > 0.5:
         log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
 
-    # Debug log before returning
-    seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
-    log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
-
     return df
 
 # -------------------------------------------------------------------- end 8 ---
@@ -2245,7 +2605,7 @@ def run_pipeline(
         early_df = _lineage_to_dataframe(lineage)
         output_csv_path = Path(output_csv)
         # Save lineage-only data with specific filename
-        lineage_path = output_csv_path.parent / "
+        lineage_path = output_csv_path.parent / "enzyme_lineage_name.csv"
         early_df.to_csv(lineage_path, index=False)
         log.info(
             "Saved lineage-only CSV -> %s",
@@ -2309,6 +2669,36 @@ def run_pipeline(
                     log.warning(f"No sequences found in PDB {pdb_id}")
         else:
             log.warning("No PDB IDs found in paper")
+
+        # 4b. If still no sequences, try Gemini extraction as last resort
+        if not sequences or all(not s.aa_seq for s in sequences):
+            log.info("No sequences from PDB, attempting Gemini-based extraction...")
+
+            gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
+
+            if gemini_sequences:
+                # Convert to SequenceBlock objects
+                gemini_seq_blocks = []
+                for variant_id, seq in gemini_sequences.items():
+                    # Find the matching variant
+                    variant = next((v for v in lineage if v.variant_id == variant_id), None)
+                    if variant:
+                        seq_block = SequenceBlock(
+                            variant_id=variant.variant_id,
+                            aa_seq=seq,
+                            dna_seq=None,
+                            confidence=0.9,  # High confidence but slightly lower than PDB
+                            truncated=False,
+                            metadata={"source": "Gemini/UniProt"}
+                        )
+                        gemini_seq_blocks.append(seq_block)
+                        log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")
+
+                if gemini_seq_blocks:
+                    sequences = gemini_seq_blocks
+                    log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
+                else:
+                    log.warning("Failed to extract sequences via Gemini")
 
         # 5. Merge & score (Section 8) --------------------------------------------
         doi = extract_doi(manuscript)
@@ -2320,18 +2710,17 @@ def run_pipeline(
         # Save final data with sequences using same filename (overwrites lineage-only)
         sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
 
-        #
-        seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
-        log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
-        if seq_count > 0 and 'aa_seq' in df_final:
-            with_seq = df_final[~df_final['aa_seq'].isna()]
-            log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
-
+        # Save the final CSV
         df_final.to_csv(sequence_path, index=False)
+
+        # Log summary statistics
+        seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
         log.info(
-            "
+            "Saved final CSV -> %s (%.1f kB, %d variants, %d with sequences)",
             sequence_path,
             sequence_path.stat().st_size / 1024,
+            len(df_final),
+            seq_count
         )
 
         log.info(