debase 0.1.11__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +373 -222
- debase/reaction_info_extractor.py +3 -3
- debase/substrate_scope_extractor.py +516 -67
- {debase-0.1.11.dist-info → debase-0.1.16.dist-info}/METADATA +1 -1
- debase-0.1.16.dist-info/RECORD +16 -0
- debase/PIPELINE_FLOW.md +0 -100
- debase-0.1.11.dist-info/RECORD +0 -17
- {debase-0.1.11.dist-info → debase-0.1.16.dist-info}/WHEEL +0 -0
- {debase-0.1.11.dist-info → debase-0.1.16.dist-info}/entry_points.txt +0 -0
- {debase-0.1.11.dist-info → debase-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.11.dist-info → debase-0.1.16.dist-info}/top_level.txt +0 -0
@@ -589,17 +589,28 @@ TEXT:
 {text}
 """.strip()
 
-
-Given
+_CAMPAIGN_BEST_LOCATION_PROMPT = """
+Given this specific campaign and the available data locations, select the BEST location to extract the complete lineage data for this campaign.
 
-
-{
+Campaign:
+- ID: {campaign_id}
+- Name: {campaign_name}
+- Description: {description}
+- Lineage identifiers: {identifiers}
 
-
-
+Available locations with context:
+{locations_with_context}
 
-
-
+Select the location that most likely contains the COMPLETE lineage data (all variants, mutations, and parent relationships) for THIS SPECIFIC campaign.
+
+Consider:
+1. Tables are usually more structured and complete than figures
+2. Look for locations that mention this campaign's specific identifiers or enzyme names
+3. Some locations may contain data for multiple campaigns - that's fine, we can filter later
+4. Prioritize completeness over visual clarity
+
+Return a JSON object with:
+{{"location": "selected location identifier", "confidence": 0-100, "reason": "explanation"}}
 """.strip()
 
 # ---- 6.1 Prompt templates -------------------------------------------------
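A note on the doubled braces in the new prompt: str.format treats {{ and }} as literal braces, which keeps the JSON example intact while the campaign fields are substituted. A minimal sketch (the campaign ID here is invented):

template = 'Campaign: {campaign_id}\nReturn: {{"location": "...", "confidence": 0-100}}'
print(template.format(campaign_id="pgb-campaign"))
# Campaign: pgb-campaign
# Return: {"location": "...", "confidence": 0-100}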
@@ -756,9 +767,43 @@ def identify_evolution_locations(
     max_results: int = 5,
     debug_dir: str | Path | None = None,
     campaigns: Optional[List[Campaign]] = None,
+    pdf_paths: Optional[List[Path]] = None,
 ) -> List[dict]:
     """Ask Gemini where in the paper the lineage is probably described."""
-
+    # Extract table of contents from PDFs if available
+    toc_text = ""
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            doc = _open_doc(pdf_path)
+            try:
+                for page_num in range(min(5, len(doc))):
+                    page_text = doc[page_num].get_text()
+                    if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                        # Found TOC page
+                        lines = page_text.split('\n')
+                        toc_lines = []
+                        for line in lines:
+                            line = line.strip()
+                            # TOC entries typically have page numbers
+                            if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                re.search(r'\s{2,}S?\d+\s*$', line) or
+                                re.match(r'^\d+\.\s+\w+', line)):
+                                toc_lines.append(line)
+                        if toc_lines:
+                            pdf_name = pdf_path.name
+                            toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                        break
+            finally:
+                doc.close()
+
+        if toc_sections:
+            toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+    # Include TOC before the main text
+    combined_text = toc_text + text if toc_text else text
+    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
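The TOC scan above relies on the module's _open_doc helper; assuming that wraps PyMuPDF's fitz.open, the heuristic can be sketched standalone like this (function names here are illustrative, not part of the package):

import re
import fitz  # PyMuPDF

def looks_like_toc_line(line: str) -> bool:
    # A line counts as a TOC entry when it ends in a (possibly S-prefixed)
    # page number after dots or wide spacing, or starts a numbered section.
    line = line.strip()
    return bool(
        re.search(r'\.{2,}\s*S?\d+\s*$', line)   # "Figure S2 ..... S14"
        or re.search(r'\s{2,}S?\d+\s*$', line)   # "Figure S2      S14"
        or re.match(r'^\d+\.\s+\w+', line)       # "3. Results"
    )

def first_toc_page_lines(pdf_path: str) -> list[str]:
    # Scan at most the first five pages for a contents-like page and return
    # its TOC-looking lines, mirroring the loop in the diff above.
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(min(5, len(doc))):
            text = doc[page_num].get_text()
            if 'contents' in text.lower():
                return [l for l in text.split('\n') if looks_like_toc_line(l)]
        return []
    finally:
        doc.close()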
@@ -770,69 +815,7 @@ def identify_evolution_locations(
     except Exception as exc:  # pragma: no cover
         log.warning("identify_evolution_locations(): %s", exc)
 
-    #
-    if campaigns and locs:
-        for loc in locs:
-            # Extract more context around the location
-            location_str = loc.get('location', '')
-            context = loc.get('reason', '')
-
-            # Ask Gemini to map this location to a campaign
-            if campaigns:
-                try:
-                    campaigns_json = json.dumps([{
-                        "campaign_id": c.campaign_id,
-                        "campaign_name": c.campaign_name,
-                        "lineage_hint": c.notes
-                    } for c in campaigns])
-
-                    mapping_prompt = _CAMPAIGN_MAPPING_PROMPT.format(
-                        campaigns=campaigns_json,
-                        location=location_str,
-                        context=context
-                    )
-
-                    # Save mapping prompt to debug if provided
-                    if debug_dir:
-                        debug_path = Path(debug_dir)
-                        debug_path.mkdir(parents=True, exist_ok=True)
-                        mapping_file = debug_path / f"campaign_mapping_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                        _dump(f"=== CAMPAIGN MAPPING PROMPT ===\nLocation: {location_str}\n{'='*80}\n\n{mapping_prompt}", mapping_file)
-
-                    response = model.generate_content(mapping_prompt)
-                    response_text = _extract_text(response).strip()
-
-                    # Extract just the campaign_id from the response
-                    # Look for the campaign_id pattern in the response
-                    campaign_id = None
-                    for campaign in campaigns:
-                        if hasattr(campaign, 'campaign_id') and campaign.campaign_id in response_text:
-                            campaign_id = campaign.campaign_id
-                            break
-
-                    # If not found, try to extract the last line or quoted string
-                    if not campaign_id:
-                        # Try to find quoted string
-                        quoted_match = re.search(r'"([^"]+)"', response_text)
-                        if quoted_match:
-                            campaign_id = quoted_match.group(1)
-                        else:
-                            # Take the last non-empty line
-                            lines = [line.strip() for line in response_text.split('\n') if line.strip()]
-                            if lines:
-                                campaign_id = lines[-1].strip('"')
-
-                    # Save mapping response to debug if provided
-                    if debug_dir:
-                        response_file = debug_path / f"campaign_mapping_response_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                        _dump(f"=== CAMPAIGN MAPPING RESPONSE ===\nLocation: {location_str}\nFull response:\n{response_text}\nExtracted campaign_id: {campaign_id}\n{'='*80}", response_file)
-
-                    # Add campaign_id to location
-                    if campaign_id:
-                        loc['campaign_id'] = campaign_id
-                        log.info(f"Mapped {location_str} to campaign: {campaign_id}")
-                except Exception as exc:
-                    log.warning(f"Failed to map location to campaign: {exc}")
+    # No longer mapping locations to campaigns here - we'll ask for best location per campaign instead
 
     return locs if isinstance(locs, list) else []
 
@@ -878,6 +861,7 @@ def extract_complete_lineage(
     debug_dir: str | Path | None = None,
     campaign_id: Optional[str] = None,
     campaign_info: Optional[Campaign] = None,
+    pdf_paths: Optional[List[Path]] = None,
 ) -> List[Variant]:
     """Prompt Gemini for the full lineage and return a list[Variant]."""
     # Build campaign context
@@ -899,10 +883,44 @@ IMPORTANT:
 4. Include parent variants only if they are direct ancestors in this campaign's lineage.
 """
 
+    # Extract table of contents from PDFs if available
+    toc_text = ""
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            doc = _open_doc(pdf_path)
+            try:
+                for page_num in range(min(5, len(doc))):
+                    page_text = doc[page_num].get_text()
+                    if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                        # Found TOC page
+                        lines = page_text.split('\n')
+                        toc_lines = []
+                        for line in lines:
+                            line = line.strip()
+                            # TOC entries typically have page numbers
+                            if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                re.search(r'\s{2,}S?\d+\s*$', line) or
+                                re.match(r'^\d+\.\s+\w+', line)):
+                                toc_lines.append(line)
+                        if toc_lines:
+                            pdf_name = pdf_path.name
+                            toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                        break
+            finally:
+                doc.close()
+
+        if toc_sections:
+            toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+    # Include TOC in the prompt text
+    combined_text = toc_text + text if toc_text else text
+
     prompt = _LINEAGE_EXTRACT_PROMPT.format(
         campaign_context=campaign_context,
         schema=_LINEAGE_SCHEMA_HINT,
-        text=
+        text=combined_text[:MAX_CHARS],
     )
     raw = generate_json_with_retry(
         model,
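This TOC block is a verbatim copy of the one added to identify_evolution_locations; if the duplication ever becomes a maintenance burden, both call sites could share one helper along these lines (hypothetical name, sketch only):

def _extract_toc_text(pdf_paths) -> str:
    # Collect TOC-looking lines from the first pages of each PDF and wrap
    # them in the same header the two inlined blocks build by hand.
    toc_sections = []
    for pdf_path in pdf_paths or []:
        lines = first_toc_page_lines(str(pdf_path))  # sketch from the earlier note
        if lines:
            toc_sections.append(f"\n--- Table of Contents from {pdf_path.name} ---\n" + '\n'.join(lines))
    if not toc_sections:
        return ""
    return "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"

# Each call site would then reduce to:
# combined_text = _extract_toc_text(pdf_paths) + text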
@@ -1044,15 +1062,27 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
     # 1. Line contains dots (...) followed by page number
     # 2. Line ends with just a page number
     # 3. Line has "Table S12:" or similar followed by title and page
-
+    # 4. Pattern appears at start of line followed by description and page number
+    if ('...' in line or
+        re.search(r'\.\s*\d+\s*$', line) or
+        re.search(r':\s*[^:]+\s+\d+\s*$', line) or
+        (line.strip().startswith(pattern) and re.search(r'\s+\d+\s*$', line))):
         return True
 
     # Check if this is in a contents/TOC section
-    # Look backwards up to
-    context_start = max(0, position -
+    # Look backwards up to 1000 chars for "Contents" or "Table of Contents"
+    context_start = max(0, position - 1000)
     context = text[context_start:position].lower()
     if 'contents' in context or 'table of contents' in context:
         return True
+
+    # Check if we're in the first ~5000 chars of the document (likely TOC area)
+    # This helps catch TOC entries that don't have obvious formatting
+    if position < 5000:
+        # Be more strict for early document positions
+        # Check if line looks like a TOC entry (has page number at end)
+        if re.search(r'\s+\d+\s*$', line):
+            return True
 
     return False
 
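To make the widened checks concrete, here are two strings the expanded heuristic accepts (sample lines invented for illustration):

import re

samples = [
    "Table S12: Lineage of variants ........ 15",  # dots before the page number
    "Figure 3: Directed evolution tree  27",       # title then bare page number
]
for line in samples:
    hit = ('...' in line
           or re.search(r'\.\s*\d+\s*$', line)
           or re.search(r':\s*[^:]+\s+\d+\s*$', line))
    print(bool(hit))  # True for both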
@@ -1185,13 +1215,39 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
                 log.warning("No sequences found in any of %d occurrences of '%s'",
                             len(all_positions), location_str)
             else:
-                # For lineage extraction,
-
-
-
-
-
-
+                # For lineage extraction, find ALL occurrences of the pattern
+                all_positions = []
+                search_pos = 0
+
+                # Find all occurrences of this pattern (not just the first)
+                while search_pos < len(text_lower):
+                    temp_pos = text_lower.find(used_pattern.lower(), search_pos)
+                    if temp_pos == -1:
+                        break
+
+                    # Check if this is a TOC entry
+                    if _is_toc_entry(text, temp_pos, used_pattern):
+                        log.debug("Skipping TOC entry for pattern '%s' at position %d", used_pattern, temp_pos)
+                        search_pos = temp_pos + len(used_pattern)
+                        continue
+
+                    all_positions.append(temp_pos)
+                    search_pos = temp_pos + len(used_pattern)
+
+                    if len(all_positions) >= 10:  # Limit to 10 occurrences
+                        break
+
+                log.info("Found %d non-TOC occurrences of pattern '%s' for location '%s'",
+                         len(all_positions), used_pattern, location_str)
+
+                # Extract context around each occurrence
+                for idx, pos in enumerate(all_positions):
+                    start = max(0, pos - context_chars)
+                    end = min(len(text), pos + len(used_pattern) + context_chars)
+                    section_text = text[start:end]
+                    extracted_sections.append(section_text)
+                    log.info("Occurrence %d/%d: Found '%s' at position %d, extracted %d chars",
+                             idx + 1, len(all_positions), location_str, pos, len(section_text))
         else:
             log.warning("Location '%s' not found in text (tried %d patterns)", location_str, len(page_patterns))
 
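The manual str.find loop above could equally be phrased with re.finditer; a sketch of the same scan (TOC predicate passed in as a callable, same cap of 10 hits):

import re
from typing import Callable, List

def find_non_toc_positions(text: str, pattern: str,
                           is_toc: Callable[[str, int, str], bool]) -> List[int]:
    positions: List[int] = []
    for m in re.finditer(re.escape(pattern), text, re.IGNORECASE):
        if is_toc(text, m.start(), pattern):
            continue  # skip table-of-contents hits, as the loop above does
        positions.append(m.start())
        if len(positions) >= 10:  # same 10-occurrence limit
            break
    return positions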
@@ -1229,41 +1285,113 @@ def get_lineage(
             log.info(f"  - {camp.campaign_name}: {camp.description}")
 
     # Use captions for identification - they're concise and focused
-    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=
+    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
 
     all_variants = []
 
-    if locations:
+    if locations and campaigns:
         # Log location information
         location_summary = []
         for loc in locations[:5]:
             if isinstance(loc, dict):
-
-                location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)}{campaign_info})")
+                location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
             else:
                 location_summary.append(str(loc))
         log.info("Gemini identified %d potential lineage locations: %s",
                  len(locations), ", ".join(location_summary))
 
-        #
-
+        # Extract context around each location for better decision making
+        locations_with_context = []
         for loc in locations:
-
-
-
-
+            location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+            # Extract 1000 chars of context around the location
+            context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
+            locations_with_context.append({
+                'location': loc,
+                'context': context_text[:1000]  # First 1000 chars of extracted context
+            })
 
-        #
-        for
-            log.info(f"Processing campaign: {campaign_id}")
+        # For each campaign, ask Gemini to select the best location
+        for campaign in campaigns:
+            log.info(f"Processing campaign: {campaign.campaign_id}")
 
-            #
-
-
-
+            # Build locations context string
+            locations_str = ""
+            for i, loc_ctx in enumerate(locations_with_context):
+                loc = loc_ctx['location']
+                context = loc_ctx['context']
+                location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
+                confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
+                reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+
+                locations_str += f"\n{i+1}. {location_str} (Type: {location_type}, Confidence: {confidence})\n"
+                locations_str += f"   Reason: {reason}\n"
+                locations_str += f"   Context (first 500 chars):\n   {context[:500]}...\n"
 
-            #
-
+            # Ask Gemini to select best location for this campaign
+            best_location_prompt = _CAMPAIGN_BEST_LOCATION_PROMPT.format(
+                campaign_id=campaign.campaign_id,
+                campaign_name=campaign.campaign_name,
+                description=campaign.description,
+                identifiers=campaign.notes or "No specific identifiers provided",
+                locations_with_context=locations_str
+            )
+
+            primary_location = None
+            try:
+                # Save prompt to debug if provided
+                if debug_dir:
+                    debug_path = Path(debug_dir)
+                    debug_path.mkdir(parents=True, exist_ok=True)
+                    prompt_file = debug_path / f"best_location_{campaign.campaign_id}_{int(time.time())}.txt"
+                    _dump(f"=== BEST LOCATION PROMPT ===\nCampaign: {campaign.campaign_id}\n{'='*80}\n\n{best_location_prompt}", prompt_file)
+
+                response = model.generate_content(best_location_prompt)
+                response_text = _extract_text(response).strip()
+
+                # Parse JSON response
+                if response_text.startswith("```"):
+                    response_text = response_text.split("```")[1].strip()
+                    if response_text.startswith("json"):
+                        response_text = response_text[4:].strip()
+
+                best_loc_data = json.loads(response_text)
+                selected_location = best_loc_data.get('location', '')
+                confidence = best_loc_data.get('confidence', 0)
+                reason = best_loc_data.get('reason', '')
+
+                # Save response to debug if provided
+                if debug_dir:
+                    response_file = debug_path / f"best_location_response_{campaign.campaign_id}_{int(time.time())}.txt"
+                    _dump(f"=== BEST LOCATION RESPONSE ===\nCampaign: {campaign.campaign_id}\nSelected: {selected_location}\nConfidence: {confidence}\nReason: {reason}\n{'='*80}", response_file)
+
+                log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")
+
+                # Find the actual location object
+                for loc in locations:
+                    loc_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                    if loc_str == selected_location:
+                        primary_location = loc
+                        break
+
+                if not primary_location:
+                    log.warning(f"Could not find selected location '{selected_location}' in locations list")
+                    # Fall back to highest confidence location
+                    primary_location = sorted(locations,
+                                              key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                              reverse=True)[0] if locations else None
+
+            except Exception as e:
+                log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
+                # Fall back to highest confidence location
+                primary_location = sorted(locations,
+                                          key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                          reverse=True)[0] if locations else None
+
+            if not primary_location:
+                log.warning(f"No location found for campaign {campaign.campaign_id}")
+                continue
 
             # Track if we successfully extracted from figure
             extracted_from_figure = False
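The fence-stripping done before json.loads above is a recurring pattern in this module; a small reusable sketch (hypothetical helper name, mirroring the lines in the hunk):

import json

def parse_fenced_json(response_text: str) -> dict:
    # Strip a leading ``` or ```json fence, if present, then parse.
    text = response_text.strip()
    if text.startswith("```"):
        text = text.split("```")[1].strip()
        if text.startswith("json"):
            text = text[4:].strip()
    return json.loads(text)

# Example: parse_fenced_json('```json\n{"location": "Table S5", "confidence": 90}\n```')
# -> {'location': 'Table S5', 'confidence': 90}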
@@ -1297,12 +1425,11 @@ def get_lineage(
                     log.info("Saved lineage figure to: %s", figure_file)
 
                 # Extract lineage from the figure
-                campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
                 variants = extract_lineage_from_figure(
                     figure_bytes, model,
                     debug_dir=debug_dir,
-                    campaign_id=campaign_id,
-                    campaign_info=
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign
                 )
                 if variants:
                     all_variants.extend(variants)
@@ -1327,22 +1454,22 @@ def get_lineage(
                 log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                          len(full_text), len(focused_text),
                          primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
-                         campaign_id)
+                         campaign.campaign_id)
 
-            #
-            campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
+            # Extract lineage for this campaign
             campaign_variants = extract_complete_lineage(
                 focused_text, model,
                 debug_dir=debug_dir,
-                campaign_id=campaign_id,
-                campaign_info=
+                campaign_id=campaign.campaign_id,
+                campaign_info=campaign,
+                pdf_paths=pdf_paths
             )
             all_variants.extend(campaign_variants)
 
         return all_variants, campaigns
     else:
         log.info("Gemini did not identify specific lineage locations")
-        variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir)
+        variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir, pdf_paths=pdf_paths)
         return variants, campaigns
 
 # === 7. SEQUENCE EXTRACTION === ----------------------------------------------
@@ -1398,18 +1525,31 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
     return []
 
 # --- 7.2  Page-based extraction helper ---------------------------------------
-def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int]) -> str:
-    """Extract text from a specific page number in the PDFs.
+def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
+    """Extract text from a specific page number in the PDFs.
+
+    Args:
+        pdf_paths: List of PDF paths
+        page_num: Page number (can be "S1", "S2", etc for SI pages)
+        skip_si_toc: If True, skip first 2 pages of SI to avoid TOC
+    """
     # Convert page number to int and handle S-prefix
     page_str = str(page_num).strip().upper()
     if page_str.startswith('S'):
         # Supplementary page - look in the SI PDF (second PDF)
         actual_page = int(page_str[1:]) - 1  # 0-indexed
         pdf_index = 1 if len(pdf_paths) > 1 else 0
+        is_si_page = True
     else:
         # Regular page - look in the main PDF
         actual_page = int(page_str) - 1  # 0-indexed
         pdf_index = 0
+        is_si_page = False
+
+    # Skip first 2 pages of SI to avoid table of contents
+    if skip_si_toc and is_si_page and actual_page < 2:
+        log.info("Skipping SI page %s (first 2 pages are typically TOC)", page_str)
+        return ""
 
     if pdf_index >= len(pdf_paths):
         log.warning("Page %s requested but not enough PDFs provided", page_str)
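Concretely, assuming a main PDF plus an SI PDF are supplied, the page resolution behaves like this:

# "S3" -> SI PDF (index 1), 0-based page 2; "3" -> main PDF (index 0), page 2.
page_str = "S3".strip().upper()
if page_str.startswith('S'):
    actual_page, pdf_index = int(page_str[1:]) - 1, 1
else:
    actual_page, pdf_index = int(page_str) - 1, 0
print(actual_page, pdf_index)  # -> 2 1
# With skip_si_toc=True, "S1" and "S2" now return "" instead of page text.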
@@ -1543,8 +1683,14 @@ IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
 - Only extract dna_seq if NO amino acid sequence is available for that variant
 - This reduces redundancy since protein sequences are usually more relevant
 
+CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
+- Papers often use different naming conventions in different sections
+- DO NOT normalize or simplify variant IDs
+- Extract the variant_id exactly as written where the sequence appears
+- Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
+
 For each variant return:
-  * variant_id  - the label
+  * variant_id  - the EXACT label as it appears with the sequence (preserve all formatting)
   * aa_seq      - amino-acid sequence (uppercase), or null
   * dna_seq     - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
 
@@ -1584,7 +1730,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
     return _parse_sequences(data)
 
 # --- 7.4  JSON -> dataclass helpers -------------------------------------------
-_VALID_AA  = set("ACDEFGHIKLMNPQRSTVWY")
+_VALID_AA  = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codon
 _VALID_DNA = set("ACGT")
 
 def _contains_sequence(text: str, min_length: int = 50) -> bool:
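One consequence of the widened alphabet worth noting: every DNA letter (A/C/G/T) is also a valid amino-acid code, so an alphabet check alone cannot tell the two apart (a sketch, not the module's actual validator):

_VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")

def is_protein_alphabet(seq: str) -> bool:
    return bool(seq) and set(seq.upper()) <= _VALID_AA

print(is_protein_alphabet("MSTNPKPQR*"))  # True: trailing stop marker accepted
print(is_protein_alphabet("ATGGCATGA"))   # also True: DNA letters are a subset,
                                          # so composition/length checks are still needed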
@@ -1974,43 +2120,53 @@ def _merge_lineage_and_sequences(
         }
         for s in seqs
     ])
+
+    # Log sequence data info
+    if len(df_seq) > 0:
+        seq_with_aa = (~df_seq['aa_seq'].isna()).sum()
+        seq_with_dna = (~df_seq['dna_seq'].isna()).sum()
+        log.info(f"Sequence data: {len(df_seq)} entries, {seq_with_aa} with aa_seq, {seq_with_dna} with dna_seq")
 
-    # 2.
+    # 2. First try direct merge
     df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
-
-    #
-
-
-
-
-
-    # Find lineage entries without sequences
+
+    # Log merge results
+    merged_aa = (~df['aa_seq'].isna()).sum()
+    merged_dna = (~df['dna_seq'].isna()).sum()
+    log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
+
+    # 3. If we have unmatched sequences and a model, use Gemini to match
+    if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+        # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
         missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
-
+        unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
 
-        # Find sequences
+        # Find unmatched sequences
         matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
         unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]
 
-        if
-            log.info(f"Found {len(
-            log.info(f"
+        if unmatched_lineage_ids and len(unmatched_seqs) > 0:
+            log.info(f"Found {len(unmatched_lineage_ids)} lineage entries without sequences")
+            log.info(f"Found {len(unmatched_seqs)} unmatched sequences")
+            log.info("Using Gemini to match variants")
 
-            # Build prompt for Gemini
-            prompt = f"""Match enzyme variant IDs between two lists
+            # Build prompt for Gemini
+            prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+
+Papers often use different naming conventions for the same variant:
+- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
+- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
+
+Match variants by analyzing generation numbers, prefixes, and patterns.
 
 Lineage variant IDs (need sequences):
-{json.dumps(
+{json.dumps(unmatched_lineage_ids)}
 
 Sequence variant IDs (have sequences):
 {json.dumps(unmatched_seqs['variant_id'].tolist())}
 
-
-
-
-Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+Return ONLY a JSON object mapping lineage IDs to sequence IDs.
+Format: {{"lineage_id": "sequence_id", ...}}
 """
 
             try:
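The direct merge is a plain left join, so lineage rows without a matching variant_id simply carry NaN sequences into the Gemini-matching step; a toy example (IDs invented):

import pandas as pd

df_lin = pd.DataFrame({"variant_id": ["5295", "5308"], "generation": [1, 0]})
df_seq = pd.DataFrame({"variant_id": ["5295"], "aa_seq": ["MSTNPKPQR"]})

df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
print(df["aa_seq"].isna().tolist())  # [False, True] -> "5308" goes to Gemini matching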
@@ -2024,85 +2180,82 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
                     text = text[4:].strip()
 
                 matches = json.loads(text)
-                log.info(f"Gemini returned matches
+                log.info(f"Gemini returned {len(matches)} matches")
 
-                #
-
+                # Create a mapping of sequence IDs to their data for efficient lookup
+                seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
 
-                # Apply
+                # Apply matches and update variant IDs
                 for lineage_id, seq_id in matches.items():
-                    if lineage_id in
-                        #
-                        seq_data =
+                    if lineage_id in unmatched_lineage_ids and seq_id in seq_data_map:
+                        # Get the sequence data
+                        seq_data = seq_data_map[seq_id]
 
-                        #
-
-                        if
-
-
-
-
-
-
-                        if seq_data is not None:
-                            # Update the dataframe
-                            mask = df['variant_id'] == lineage_id
-                            if mask.any():
-                                # Log before update
-                                log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
-
-                                df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
-                                df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
-                                df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
-                                df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+                        # Update the row with the matched sequence ID and data
+                        mask = df['variant_id'] == lineage_id
+                        if mask.any():
+                            # Update variant_id to use the sequence variant name
+                            df.loc[mask, 'variant_id'] = seq_id
+
+                            # Update parent_id if it matches any of the mapped lineage IDs
+                            parent_mask = df['parent_id'] == lineage_id
+                            if parent_mask.any():
+                                df.loc[parent_mask, 'parent_id'] = seq_id
+
+                            # Update sequence data
+                            # For pandas Series from iterrows(), use proper indexing
+                            aa_seq_val = seq_data['aa_seq'] if 'aa_seq' in seq_data else None
+                            dna_seq_val = seq_data['dna_seq'] if 'dna_seq' in seq_data else None
+
+                            # Always update sequence fields to preserve DNA even when aa_seq is null
+                            df.loc[mask, 'aa_seq'] = aa_seq_val
+                            df.loc[mask, 'dna_seq'] = dna_seq_val
 
-
-
-
-
-                            log.
+                            df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
+                            df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+
+                            # Log sequence info - check both aa_seq and dna_seq
+                            aa_len = len(seq_data['aa_seq']) if pd.notna(seq_data.get('aa_seq')) and seq_data.get('aa_seq') else 0
+                            dna_len = len(seq_data['dna_seq']) if pd.notna(seq_data.get('dna_seq')) and seq_data.get('dna_seq') else 0
+                            log.info(f"Matched {lineage_id} -> {seq_id} (aa_seq: {aa_len} chars, dna_seq: {dna_len} chars)")
+
+                # Update any remaining parent_id references to matched variants
+                for lineage_id, seq_id in matches.items():
+                    parent_mask = df['parent_id'] == lineage_id
+                    if parent_mask.any():
+                        df.loc[parent_mask, 'parent_id'] = seq_id
 
-                # Log
-
-
+                # Log final state - count variants with any sequence (aa or dna)
+                aa_count = (~df['aa_seq'].isna()).sum()
+                dna_count = (~df['dna_seq'].isna()).sum()
+                any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+                log.info(f"After Gemini matching: {any_seq_count}/{len(df)} variants have sequences (aa: {aa_count}, dna: {dna_count})")
 
             except Exception as e:
                 log.warning(f"Failed to match variants using Gemini: {e}")
 
-    #
+    # 4. If generation missing, try inference
     if df["generation"].isna().any():
-        _infer_generations(lineage)
-
-
-
-
-
-
-        #
+        _infer_generations(lineage)
+        # Need to update the generations based on the potentially updated variant IDs
+        gen_map = {v.variant_id: v.generation for v in lineage}
+        # Also create a map for any variant IDs that were replaced
+        for idx, row in df.iterrows():
+            variant_id = row['variant_id']
+            if variant_id in gen_map:
+                df.at[idx, 'generation'] = gen_map[variant_id]
+
+    # 5. Attach DOI column
     df["doi"] = doi
 
-    #
+    # 6. Sort by generation, then variant_id
     df = df.sort_values(["generation", "variant_id"], kind="mergesort")
 
-    #
-
-
-
+    # 7. Log final state
+    aa_count = (~df['aa_seq'].isna()).sum()
+    dna_count = (~df['dna_seq'].isna()).sum()
+    any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+    log.info(f"Final result: {len(df)} variants, {any_seq_count} with sequences (aa: {aa_count}, dna: {dna_count})")
 
     return df
 
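A toy illustration of the ID rewrite above: when Gemini maps a lineage ID to a sequence ID, both variant_id and any parent_id references are renamed so the lineage tree stays consistent (the mapping here is hypothetical):

import pandas as pd

df = pd.DataFrame({"variant_id": ["5308", "5295"], "parent_id": [None, "5308"]})
matches = {"5308": "D-ApPgb-aEsA-G0"}  # hypothetical Gemini output

for lineage_id, seq_id in matches.items():
    df.loc[df["variant_id"] == lineage_id, "variant_id"] = seq_id
    df.loc[df["parent_id"] == lineage_id, "parent_id"] = seq_id

print(df.to_dict("records"))
# [{'variant_id': 'D-ApPgb-aEsA-G0', 'parent_id': None},
#  {'variant_id': '5295', 'parent_id': 'D-ApPgb-aEsA-G0'}]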
@@ -2114,28 +2267,27 @@ def merge_and_score(
     doi: Optional[str] = None,
     model=None,
 ) -> pd.DataFrame:
-    """
-
-
-
-
+    """Merge lineage and sequence data into a single DataFrame.
+
+    Args:
+        lineage: List of Variant objects from lineage extraction
+        seqs: List of SequenceBlock objects from sequence extraction
+        doi: DOI of the paper for provenance
+        model: Gemini model for smart matching (optional)
+
+    Returns:
+        DataFrame with merged lineage and sequence data
     """
-
     if not lineage:
         raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")
 
-    # If no sequences found, still build a DataFrame so caller can decide what to do.
     df = _merge_lineage_and_sequences(lineage, seqs, doi, model)
 
-    #
+    # Warn if many sequences are missing
     missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
     if missing_rate > 0.5:
         log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
 
-    # Debug log before returning
-    seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
-    log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
-
     return df
 
 # -------------------------------------------------------------------- end 8 ---
@@ -2320,18 +2472,17 @@ def run_pipeline(
     # Save final data with sequences using same filename (overwrites lineage-only)
     sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
 
-    #
-    seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
-    log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
-    if seq_count > 0 and 'aa_seq' in df_final:
-        with_seq = df_final[~df_final['aa_seq'].isna()]
-        log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
-
+    # Save the final CSV
     df_final.to_csv(sequence_path, index=False)
+
+    # Log summary statistics
+    seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
     log.info(
-        "
+        "Saved final CSV -> %s (%.1f kB, %d variants, %d with sequences)",
         sequence_path,
         sequence_path.stat().st_size / 1024,
+        len(df_final),
+        seq_count
     )
 
     log.info(