debase 0.1.19__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {debase-0.1.19 → debase-0.4.1}/PKG-INFO +1 -1
- {debase-0.1.19 → debase-0.4.1}/src/debase/_version.py +1 -1
- {debase-0.1.19 → debase-0.4.1}/src/debase/cleanup_sequence.py +40 -8
- {debase-0.1.19 → debase-0.4.1}/src/debase/enzyme_lineage_extractor.py +153 -9
- {debase-0.1.19 → debase-0.4.1}/src/debase/reaction_info_extractor.py +1119 -504
- {debase-0.1.19 → debase-0.4.1}/src/debase/substrate_scope_extractor.py +50 -41
- {debase-0.1.19 → debase-0.4.1}/src/debase/wrapper.py +75 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase.egg-info/PKG-INFO +1 -1
- {debase-0.1.19 → debase-0.4.1}/src/debase.egg-info/SOURCES.txt +0 -7
- debase-0.1.19/.gitignore +0 -177
- debase-0.1.19/CONTRIBUTING.md +0 -61
- debase-0.1.19/docs/README.md +0 -19
- debase-0.1.19/docs/examples/README.md +0 -24
- debase-0.1.19/environment.yml +0 -21
- debase-0.1.19/src/__init__.py +0 -1
- debase-0.1.19/src/debase/PIPELINE_FLOW.md +0 -100
- {debase-0.1.19 → debase-0.4.1}/LICENSE +0 -0
- {debase-0.1.19 → debase-0.4.1}/MANIFEST.in +0 -0
- {debase-0.1.19 → debase-0.4.1}/README.md +0 -0
- {debase-0.1.19 → debase-0.4.1}/pyproject.toml +0 -0
- {debase-0.1.19 → debase-0.4.1}/setup.cfg +0 -0
- {debase-0.1.19 → debase-0.4.1}/setup.py +0 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase/__init__.py +0 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase/__main__.py +0 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase/build_db.py +0 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase/lineage_format.py +0 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.1.19 → debase-0.4.1}/src/debase.egg-info/top_level.txt +0 -0
@@ -827,20 +827,52 @@ class SequenceProcessor:
|
|
827
827
|
log.info(f"Saved results to {self.output_csv}")
|
828
828
|
|
829
829
|
def run(self) -> None:
|
830
|
-
"""Run the complete processing pipeline."""
|
830
|
+
"""Run the complete processing pipeline with campaign-based processing."""
|
831
831
|
log.info("Starting sequence generation pipeline")
|
832
832
|
|
833
833
|
# Load data
|
834
834
|
self.load_data()
|
835
835
|
|
836
|
-
#
|
837
|
-
self.
|
836
|
+
# Process each campaign separately
|
837
|
+
campaigns = self.df['campaign_id'].unique()
|
838
|
+
log.info(f"Processing {len(campaigns)} campaigns: {list(campaigns)}")
|
838
839
|
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
840
|
+
for campaign_id in campaigns:
|
841
|
+
if pd.isna(campaign_id):
|
842
|
+
campaign_id = "unknown"
|
843
|
+
|
844
|
+
log.info(f"Processing campaign: {campaign_id}")
|
845
|
+
|
846
|
+
# Filter data for this campaign
|
847
|
+
campaign_mask = self.df['campaign_id'] == campaign_id
|
848
|
+
if pd.isna(campaign_id):
|
849
|
+
campaign_mask = self.df['campaign_id'].isna()
|
850
|
+
|
851
|
+
# Store original dataframe
|
852
|
+
original_df = self.df
|
853
|
+
|
854
|
+
# Process only this campaign's data
|
855
|
+
self.df = self.df[campaign_mask].copy()
|
856
|
+
|
857
|
+
# Rebuild relationships for this campaign
|
858
|
+
self.generator = SequenceGenerator(self.df)
|
859
|
+
|
860
|
+
# Flag complex mutations
|
861
|
+
self.flag_complex_mutations()
|
862
|
+
|
863
|
+
# Process in order
|
864
|
+
self.process_simple_mutations()
|
865
|
+
self.process_complex_mutations()
|
866
|
+
self.process_remaining()
|
867
|
+
self.backward_pass()
|
868
|
+
|
869
|
+
# Update the original dataframe with results
|
870
|
+
original_df.loc[campaign_mask, :] = self.df
|
871
|
+
|
872
|
+
# Restore original dataframe
|
873
|
+
self.df = original_df
|
874
|
+
|
875
|
+
log.info(f"Completed campaign: {campaign_id}")
|
844
876
|
|
845
877
|
# Save results
|
846
878
|
self.save_results()
|
@@ -377,13 +377,28 @@ def get_model():
|
|
377
377
|
|
378
378
|
# === 5.3 Unified call helper ----------------------------------------------
|
379
379
|
|
380
|
-
def
|
380
|
+
def _extract_text_and_track_tokens(resp) -> str:
|
381
381
|
"""
|
382
382
|
Pull the *first* textual part out of a GenerativeAI response, handling both
|
383
|
-
the old prerelease SDK and the >=1.0 SDK.
|
383
|
+
the old prerelease SDK and the >=1.0 SDK. Also tracks token usage.
|
384
384
|
|
385
385
|
Returns an empty string if no textual content is found.
|
386
386
|
"""
|
387
|
+
# Track token usage if available
|
388
|
+
try:
|
389
|
+
if hasattr(resp, 'usage_metadata'):
|
390
|
+
input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
|
391
|
+
output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
|
392
|
+
if input_tokens or output_tokens:
|
393
|
+
# Import wrapper token tracking
|
394
|
+
try:
|
395
|
+
from .wrapper import add_token_usage
|
396
|
+
add_token_usage('enzyme_lineage_extractor', input_tokens, output_tokens)
|
397
|
+
except ImportError:
|
398
|
+
pass # wrapper not available
|
399
|
+
except Exception:
|
400
|
+
pass # token tracking is best-effort
|
401
|
+
|
387
402
|
# 1) Legacy SDK (<= 0.4) - still has nice `.text`
|
388
403
|
if getattr(resp, "text", None):
|
389
404
|
return resp.text
|
@@ -409,6 +424,10 @@ def _extract_text(resp) -> str:
|
|
409
424
|
# 3) As a last resort fall back to str()
|
410
425
|
return str(resp)
|
411
426
|
|
427
|
+
def _extract_text(resp) -> str:
|
428
|
+
"""Backward compatibility wrapper for _extract_text_and_track_tokens."""
|
429
|
+
return _extract_text_and_track_tokens(resp)
|
430
|
+
|
412
431
|
|
413
432
|
def generate_json_with_retry(
|
414
433
|
model,
|
@@ -572,7 +591,7 @@ Look for:
|
|
572
591
|
Return a JSON array of campaigns:
|
573
592
|
[
|
574
593
|
{{
|
575
|
-
"campaign_id": "
|
594
|
+
"campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
|
576
595
|
"campaign_name": "descriptive name",
|
577
596
|
"description": "what this campaign evolved for",
|
578
597
|
"model_substrate": "substrate name/id",
|
@@ -585,6 +604,9 @@ Return a JSON array of campaigns:
|
|
585
604
|
}}
|
586
605
|
]
|
587
606
|
|
607
|
+
IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
|
608
|
+
Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
|
609
|
+
|
588
610
|
TEXT:
|
589
611
|
{text}
|
590
612
|
""".strip()
|
@@ -1559,6 +1581,82 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
|
|
1559
1581
|
return []
|
1560
1582
|
|
1561
1583
|
# --- 7.2 Page-based extraction helper ---------------------------------------
|
1584
|
+
def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
|
1585
|
+
"""Validate and potentially correct a sequence using Gemini by checking against known mutations."""
|
1586
|
+
|
1587
|
+
# Extract mutations from variants
|
1588
|
+
mutations = []
|
1589
|
+
for variant in variants:
|
1590
|
+
if variant.mutations:
|
1591
|
+
mutations.extend(variant.mutations)
|
1592
|
+
|
1593
|
+
if not mutations:
|
1594
|
+
return None
|
1595
|
+
|
1596
|
+
# Take a sample of mutations for validation
|
1597
|
+
sample_mutations = mutations[:10] # Check first 10 mutations
|
1598
|
+
|
1599
|
+
# First do a quick local check for obvious inconsistencies
|
1600
|
+
local_issues = []
|
1601
|
+
for mutation in sample_mutations:
|
1602
|
+
if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
|
1603
|
+
pos = mutation.position - 1 # Convert to 0-indexed
|
1604
|
+
if 0 <= pos < len(sequence):
|
1605
|
+
actual_aa = sequence[pos]
|
1606
|
+
expected_aa = mutation.original
|
1607
|
+
if actual_aa != expected_aa:
|
1608
|
+
local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
|
1609
|
+
|
1610
|
+
if not local_issues:
|
1611
|
+
return None # No obvious issues found
|
1612
|
+
|
1613
|
+
log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation")
|
1614
|
+
|
1615
|
+
prompt = f"""
|
1616
|
+
You are validating a protein sequence that was extracted from a scientific paper.
|
1617
|
+
The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
|
1618
|
+
|
1619
|
+
Original sequence (length {len(sequence)}):
|
1620
|
+
{sequence}
|
1621
|
+
|
1622
|
+
Known mutations that should be applicable to this sequence:
|
1623
|
+
{', '.join(str(m) for m in sample_mutations)}
|
1624
|
+
|
1625
|
+
Potential issues detected:
|
1626
|
+
{chr(10).join(local_issues)}
|
1627
|
+
|
1628
|
+
Please check if the sequence is consistent with these mutations:
|
1629
|
+
1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
|
1630
|
+
2. If you find inconsistencies, suggest the most likely correction
|
1631
|
+
3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
|
1632
|
+
4. Pay special attention to consecutive identical amino acids that might be OCR errors
|
1633
|
+
|
1634
|
+
Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
|
1635
|
+
If you cannot determine the correct sequence, return "UNCERTAIN".
|
1636
|
+
"""
|
1637
|
+
|
1638
|
+
try:
|
1639
|
+
response = model.generate_content(prompt)
|
1640
|
+
result = _extract_text(response).strip()
|
1641
|
+
|
1642
|
+
if result == "VALID":
|
1643
|
+
return None # No changes needed
|
1644
|
+
elif result == "UNCERTAIN":
|
1645
|
+
log.warning("Gemini could not validate sequence against mutations")
|
1646
|
+
return None
|
1647
|
+
elif result.startswith("M") and len(result) > 50:
|
1648
|
+
# Gemini returned a corrected sequence
|
1649
|
+
log.info(f"Gemini suggested sequence correction (length {len(result)})")
|
1650
|
+
return result
|
1651
|
+
else:
|
1652
|
+
log.warning(f"Unexpected validation response: {result[:100]}...")
|
1653
|
+
return None
|
1654
|
+
|
1655
|
+
except Exception as e:
|
1656
|
+
log.warning(f"Failed to validate sequence: {e}")
|
1657
|
+
return None
|
1658
|
+
|
1659
|
+
|
1562
1660
|
def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
|
1563
1661
|
"""Extract text from a specific page number in the PDFs.
|
1564
1662
|
|
@@ -2040,7 +2138,13 @@ If you cannot determine certain fields, set them to null.
|
|
2040
2138
|
# Clean the sequence
|
2041
2139
|
seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
|
2042
2140
|
# Validate it looks like a protein sequence
|
2043
|
-
if seq and all(c in "ACDEFGHIKLMNPQRSTVWY" for c in seq) and len(seq) > 50:
|
2141
|
+
if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
|
2142
|
+
# Sanity check the sequence against known mutations
|
2143
|
+
validated_seq = _validate_sequence_against_mutations(seq, variants, lineage_text, model)
|
2144
|
+
if validated_seq:
|
2145
|
+
seq = validated_seq
|
2146
|
+
log.info(f"Sequence validated and potentially corrected by Gemini")
|
2147
|
+
|
2044
2148
|
# Map to the first variant or wild-type
|
2045
2149
|
wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
|
2046
2150
|
if wt_variant:
|
@@ -2365,7 +2469,8 @@ Papers often use different naming conventions for the same variant:
|
|
2365
2469
|
- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
|
2366
2470
|
- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
|
2367
2471
|
|
2368
|
-
Match variants by analyzing generation numbers, prefixes, and patterns.
|
2472
|
+
Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
|
2473
|
+
use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
|
2369
2474
|
|
2370
2475
|
Lineage variant IDs (need sequences):
|
2371
2476
|
{json.dumps(unmatched_lineage_ids)}
|
@@ -2378,8 +2483,24 @@ Format: {{"lineage_id": "sequence_id", ...}}
|
|
2378
2483
|
"""
|
2379
2484
|
|
2380
2485
|
try:
|
2486
|
+
log.info("Sending variant matching request to Gemini...")
|
2487
|
+
log.debug(f"Prompt length: {len(prompt)} characters")
|
2488
|
+
|
2381
2489
|
response = model.generate_content(prompt)
|
2490
|
+
log.debug(f"Gemini response object: {response}")
|
2491
|
+
log.debug(f"Response candidates: {getattr(response, 'candidates', 'N/A')}")
|
2492
|
+
|
2382
2493
|
text = _extract_text(response).strip()
|
2494
|
+
log.info(f"Extracted text length: {len(text)}")
|
2495
|
+
|
2496
|
+
if not text:
|
2497
|
+
log.error("Gemini returned empty text - API call may have failed")
|
2498
|
+
log.error(f"Response object: {response}")
|
2499
|
+
if hasattr(response, 'prompt_feedback'):
|
2500
|
+
log.error(f"Prompt feedback: {response.prompt_feedback}")
|
2501
|
+
raise ValueError("Empty response from Gemini")
|
2502
|
+
|
2503
|
+
log.debug(f"Raw response (first 500 chars): {text[:500]}")
|
2383
2504
|
|
2384
2505
|
# Parse JSON response
|
2385
2506
|
if text.startswith("```"):
|
@@ -2387,8 +2508,31 @@ Format: {{"lineage_id": "sequence_id", ...}}
|
|
2387
2508
|
if text.startswith("json"):
|
2388
2509
|
text = text[4:].strip()
|
2389
2510
|
|
2390
|
-
|
2391
|
-
|
2511
|
+
log.debug(f"Cleaned text for JSON parsing (first 500 chars): {text[:500]}")
|
2512
|
+
|
2513
|
+
if not text.strip():
|
2514
|
+
log.error("Text is empty after cleaning")
|
2515
|
+
matches = {}
|
2516
|
+
else:
|
2517
|
+
try:
|
2518
|
+
matches = json.loads(text)
|
2519
|
+
log.info(f"Successfully parsed {len(matches)} matches from Gemini")
|
2520
|
+
except json.JSONDecodeError as e:
|
2521
|
+
log.error(f"JSON parsing failed: {e}")
|
2522
|
+
log.error(f"Full cleaned text: {text}")
|
2523
|
+
# Try to extract JSON from within the response
|
2524
|
+
import re
|
2525
|
+
json_match = re.search(r'\{.*\}', text, re.DOTALL)
|
2526
|
+
if json_match:
|
2527
|
+
try:
|
2528
|
+
matches = json.loads(json_match.group(0))
|
2529
|
+
log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
|
2530
|
+
except json.JSONDecodeError:
|
2531
|
+
log.error("Failed to extract JSON from response")
|
2532
|
+
matches = {}
|
2533
|
+
else:
|
2534
|
+
log.error("No JSON object found in response")
|
2535
|
+
matches = {}
|
2392
2536
|
|
2393
2537
|
# Create a mapping of sequence IDs to their data for efficient lookup
|
2394
2538
|
seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
|
@@ -2456,8 +2600,8 @@ Format: {{"lineage_id": "sequence_id", ...}}
|
|
2456
2600
|
# 5. Attach DOI column
|
2457
2601
|
df["doi"] = doi
|
2458
2602
|
|
2459
|
-
# 6. Sort by
|
2460
|
-
df = df.sort_values(["
|
2603
|
+
# 6. Sort by campaign_id, then generation
|
2604
|
+
df = df.sort_values(["campaign_id", "generation"], kind="mergesort")
|
2461
2605
|
|
2462
2606
|
# 7. Log final state
|
2463
2607
|
aa_count = (~df['aa_seq'].isna()).sum()
|