debase 0.1.18__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {debase-0.1.18 → debase-0.4.0}/PKG-INFO +1 -1
  2. {debase-0.1.18 → debase-0.4.0}/src/debase/_version.py +1 -1
  3. {debase-0.1.18 → debase-0.4.0}/src/debase/cleanup_sequence.py +40 -8
  4. {debase-0.1.18 → debase-0.4.0}/src/debase/enzyme_lineage_extractor.py +153 -9
  5. {debase-0.1.18 → debase-0.4.0}/src/debase/reaction_info_extractor.py +1181 -493
  6. {debase-0.1.18 → debase-0.4.0}/src/debase/substrate_scope_extractor.py +83 -34
  7. {debase-0.1.18 → debase-0.4.0}/src/debase/wrapper.py +75 -0
  8. {debase-0.1.18 → debase-0.4.0}/src/debase.egg-info/PKG-INFO +1 -1
  9. {debase-0.1.18 → debase-0.4.0}/src/debase.egg-info/SOURCES.txt +0 -7
  10. debase-0.1.18/.gitignore +0 -177
  11. debase-0.1.18/CONTRIBUTING.md +0 -61
  12. debase-0.1.18/docs/README.md +0 -19
  13. debase-0.1.18/docs/examples/README.md +0 -24
  14. debase-0.1.18/environment.yml +0 -21
  15. debase-0.1.18/src/__init__.py +0 -1
  16. debase-0.1.18/src/debase/PIPELINE_FLOW.md +0 -100
  17. {debase-0.1.18 → debase-0.4.0}/LICENSE +0 -0
  18. {debase-0.1.18 → debase-0.4.0}/MANIFEST.in +0 -0
  19. {debase-0.1.18 → debase-0.4.0}/README.md +0 -0
  20. {debase-0.1.18 → debase-0.4.0}/pyproject.toml +0 -0
  21. {debase-0.1.18 → debase-0.4.0}/setup.cfg +0 -0
  22. {debase-0.1.18 → debase-0.4.0}/setup.py +0 -0
  23. {debase-0.1.18 → debase-0.4.0}/src/debase/__init__.py +0 -0
  24. {debase-0.1.18 → debase-0.4.0}/src/debase/__main__.py +0 -0
  25. {debase-0.1.18 → debase-0.4.0}/src/debase/build_db.py +0 -0
  26. {debase-0.1.18 → debase-0.4.0}/src/debase/lineage_format.py +0 -0
  27. {debase-0.1.18 → debase-0.4.0}/src/debase.egg-info/dependency_links.txt +0 -0
  28. {debase-0.1.18 → debase-0.4.0}/src/debase.egg-info/entry_points.txt +0 -0
  29. {debase-0.1.18 → debase-0.4.0}/src/debase.egg-info/requires.txt +0 -0
  30. {debase-0.1.18 → debase-0.4.0}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.18
3
+ Version: 0.4.0
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.18"
3
+ __version__ = "0.4.0"
@@ -827,20 +827,52 @@ class SequenceProcessor:
827
827
  log.info(f"Saved results to {self.output_csv}")
828
828
 
829
829
  def run(self) -> None:
830
- """Run the complete processing pipeline."""
830
+ """Run the complete processing pipeline with campaign-based processing."""
831
831
  log.info("Starting sequence generation pipeline")
832
832
 
833
833
  # Load data
834
834
  self.load_data()
835
835
 
836
- # Flag complex mutations
837
- self.flag_complex_mutations()
836
+ # Process each campaign separately
837
+ campaigns = self.df['campaign_id'].unique()
838
+ log.info(f"Processing {len(campaigns)} campaigns: {list(campaigns)}")
838
839
 
839
- # Process in order
840
- self.process_simple_mutations()
841
- self.process_complex_mutations()
842
- self.process_remaining()
843
- self.backward_pass()
840
+ for campaign_id in campaigns:
841
+ if pd.isna(campaign_id):
842
+ campaign_id = "unknown"
843
+
844
+ log.info(f"Processing campaign: {campaign_id}")
845
+
846
+ # Filter data for this campaign
847
+ campaign_mask = self.df['campaign_id'] == campaign_id
848
+ if pd.isna(campaign_id):
849
+ campaign_mask = self.df['campaign_id'].isna()
850
+
851
+ # Store original dataframe
852
+ original_df = self.df
853
+
854
+ # Process only this campaign's data
855
+ self.df = self.df[campaign_mask].copy()
856
+
857
+ # Rebuild relationships for this campaign
858
+ self.generator = SequenceGenerator(self.df)
859
+
860
+ # Flag complex mutations
861
+ self.flag_complex_mutations()
862
+
863
+ # Process in order
864
+ self.process_simple_mutations()
865
+ self.process_complex_mutations()
866
+ self.process_remaining()
867
+ self.backward_pass()
868
+
869
+ # Update the original dataframe with results
870
+ original_df.loc[campaign_mask, :] = self.df
871
+
872
+ # Restore original dataframe
873
+ self.df = original_df
874
+
875
+ log.info(f"Completed campaign: {campaign_id}")
844
876
 
845
877
  # Save results
846
878
  self.save_results()
@@ -377,13 +377,28 @@ def get_model():
377
377
 
378
378
  # === 5.3 Unified call helper ----------------------------------------------
379
379
 
380
- def _extract_text(resp) -> str:
380
+ def _extract_text_and_track_tokens(resp) -> str:
381
381
  """
382
382
  Pull the *first* textual part out of a GenerativeAI response, handling both
383
- the old prerelease SDK and the >=1.0 SDK.
383
+ the old prerelease SDK and the >=1.0 SDK. Also tracks token usage.
384
384
 
385
385
  Returns an empty string if no textual content is found.
386
386
  """
387
+ # Track token usage if available
388
+ try:
389
+ if hasattr(resp, 'usage_metadata'):
390
+ input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
391
+ output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
392
+ if input_tokens or output_tokens:
393
+ # Import wrapper token tracking
394
+ try:
395
+ from .wrapper import add_token_usage
396
+ add_token_usage('enzyme_lineage_extractor', input_tokens, output_tokens)
397
+ except ImportError:
398
+ pass # wrapper not available
399
+ except Exception:
400
+ pass # token tracking is best-effort
401
+
387
402
  # 1) Legacy SDK (<= 0.4) - still has nice `.text`
388
403
  if getattr(resp, "text", None):
389
404
  return resp.text
@@ -409,6 +424,10 @@ def _extract_text(resp) -> str:
409
424
  # 3) As a last resort fall back to str()
410
425
  return str(resp)
411
426
 
427
+ def _extract_text(resp) -> str:
428
+ """Backward compatibility wrapper for _extract_text_and_track_tokens."""
429
+ return _extract_text_and_track_tokens(resp)
430
+
412
431
 
413
432
  def generate_json_with_retry(
414
433
  model,
@@ -572,7 +591,7 @@ Look for:
572
591
  Return a JSON array of campaigns:
573
592
  [
574
593
  {{
575
- "campaign_id": "unique_id",
594
+ "campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
576
595
  "campaign_name": "descriptive name",
577
596
  "description": "what this campaign evolved for",
578
597
  "model_substrate": "substrate name/id",
@@ -585,6 +604,9 @@ Return a JSON array of campaigns:
585
604
  }}
586
605
  ]
587
606
 
607
+ IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
608
+ Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
609
+
588
610
  TEXT:
589
611
  {text}
590
612
  """.strip()
@@ -1559,6 +1581,82 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
1559
1581
  return []
1560
1582
 
1561
1583
  # --- 7.2 Page-based extraction helper ---------------------------------------
1584
+ def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
1585
+ """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
1586
+
1587
+ # Extract mutations from variants
1588
+ mutations = []
1589
+ for variant in variants:
1590
+ if variant.mutations:
1591
+ mutations.extend(variant.mutations)
1592
+
1593
+ if not mutations:
1594
+ return None
1595
+
1596
+ # Take a sample of mutations for validation
1597
+ sample_mutations = mutations[:10] # Check first 10 mutations
1598
+
1599
+ # First do a quick local check for obvious inconsistencies
1600
+ local_issues = []
1601
+ for mutation in sample_mutations:
1602
+ if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
1603
+ pos = mutation.position - 1 # Convert to 0-indexed
1604
+ if 0 <= pos < len(sequence):
1605
+ actual_aa = sequence[pos]
1606
+ expected_aa = mutation.original
1607
+ if actual_aa != expected_aa:
1608
+ local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
1609
+
1610
+ if not local_issues:
1611
+ return None # No obvious issues found
1612
+
1613
+ log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation")
1614
+
1615
+ prompt = f"""
1616
+ You are validating a protein sequence that was extracted from a scientific paper.
1617
+ The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
1618
+
1619
+ Original sequence (length {len(sequence)}):
1620
+ {sequence}
1621
+
1622
+ Known mutations that should be applicable to this sequence:
1623
+ {', '.join(str(m) for m in sample_mutations)}
1624
+
1625
+ Potential issues detected:
1626
+ {chr(10).join(local_issues)}
1627
+
1628
+ Please check if the sequence is consistent with these mutations:
1629
+ 1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
1630
+ 2. If you find inconsistencies, suggest the most likely correction
1631
+ 3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
1632
+ 4. Pay special attention to consecutive identical amino acids that might be OCR errors
1633
+
1634
+ Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
1635
+ If you cannot determine the correct sequence, return "UNCERTAIN".
1636
+ """
1637
+
1638
+ try:
1639
+ response = model.generate_content(prompt)
1640
+ result = _extract_text(response).strip()
1641
+
1642
+ if result == "VALID":
1643
+ return None # No changes needed
1644
+ elif result == "UNCERTAIN":
1645
+ log.warning("Gemini could not validate sequence against mutations")
1646
+ return None
1647
+ elif result.startswith("M") and len(result) > 50:
1648
+ # Gemini returned a corrected sequence
1649
+ log.info(f"Gemini suggested sequence correction (length {len(result)})")
1650
+ return result
1651
+ else:
1652
+ log.warning(f"Unexpected validation response: {result[:100]}...")
1653
+ return None
1654
+
1655
+ except Exception as e:
1656
+ log.warning(f"Failed to validate sequence: {e}")
1657
+ return None
1658
+
1659
+
1562
1660
  def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
1563
1661
  """Extract text from a specific page number in the PDFs.
1564
1662
 
@@ -2040,7 +2138,13 @@ If you cannot determine certain fields, set them to null.
2040
2138
  # Clean the sequence
2041
2139
  seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
2042
2140
  # Validate it looks like a protein sequence
2043
- if seq and all(c in "ACDEFGHIKLMNPQRSTVWY" for c in seq) and len(seq) > 50:
2141
+ if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
2142
+ # Sanity check the sequence against known mutations
2143
+ validated_seq = _validate_sequence_against_mutations(seq, variants, lineage_text, model)
2144
+ if validated_seq:
2145
+ seq = validated_seq
2146
+ log.info(f"Sequence validated and potentially corrected by Gemini")
2147
+
2044
2148
  # Map to the first variant or wild-type
2045
2149
  wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
2046
2150
  if wt_variant:
@@ -2365,7 +2469,8 @@ Papers often use different naming conventions for the same variant:
2365
2469
  - Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
2366
2470
  - Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
2367
2471
 
2368
- Match variants by analyzing generation numbers, prefixes, and patterns.
2472
+ Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
2473
+ use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
2369
2474
 
2370
2475
  Lineage variant IDs (need sequences):
2371
2476
  {json.dumps(unmatched_lineage_ids)}
@@ -2378,8 +2483,24 @@ Format: {{"lineage_id": "sequence_id", ...}}
2378
2483
  """
2379
2484
 
2380
2485
  try:
2486
+ log.info("Sending variant matching request to Gemini...")
2487
+ log.debug(f"Prompt length: {len(prompt)} characters")
2488
+
2381
2489
  response = model.generate_content(prompt)
2490
+ log.debug(f"Gemini response object: {response}")
2491
+ log.debug(f"Response candidates: {getattr(response, 'candidates', 'N/A')}")
2492
+
2382
2493
  text = _extract_text(response).strip()
2494
+ log.info(f"Extracted text length: {len(text)}")
2495
+
2496
+ if not text:
2497
+ log.error("Gemini returned empty text - API call may have failed")
2498
+ log.error(f"Response object: {response}")
2499
+ if hasattr(response, 'prompt_feedback'):
2500
+ log.error(f"Prompt feedback: {response.prompt_feedback}")
2501
+ raise ValueError("Empty response from Gemini")
2502
+
2503
+ log.debug(f"Raw response (first 500 chars): {text[:500]}")
2383
2504
 
2384
2505
  # Parse JSON response
2385
2506
  if text.startswith("```"):
@@ -2387,8 +2508,31 @@ Format: {{"lineage_id": "sequence_id", ...}}
2387
2508
  if text.startswith("json"):
2388
2509
  text = text[4:].strip()
2389
2510
 
2390
- matches = json.loads(text)
2391
- log.info(f"Gemini returned {len(matches)} matches")
2511
+ log.debug(f"Cleaned text for JSON parsing (first 500 chars): {text[:500]}")
2512
+
2513
+ if not text.strip():
2514
+ log.error("Text is empty after cleaning")
2515
+ matches = {}
2516
+ else:
2517
+ try:
2518
+ matches = json.loads(text)
2519
+ log.info(f"Successfully parsed {len(matches)} matches from Gemini")
2520
+ except json.JSONDecodeError as e:
2521
+ log.error(f"JSON parsing failed: {e}")
2522
+ log.error(f"Full cleaned text: {text}")
2523
+ # Try to extract JSON from within the response
2524
+ import re
2525
+ json_match = re.search(r'\{.*\}', text, re.DOTALL)
2526
+ if json_match:
2527
+ try:
2528
+ matches = json.loads(json_match.group(0))
2529
+ log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
2530
+ except json.JSONDecodeError:
2531
+ log.error("Failed to extract JSON from response")
2532
+ matches = {}
2533
+ else:
2534
+ log.error("No JSON object found in response")
2535
+ matches = {}
2392
2536
 
2393
2537
  # Create a mapping of sequence IDs to their data for efficient lookup
2394
2538
  seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
@@ -2456,8 +2600,8 @@ Format: {{"lineage_id": "sequence_id", ...}}
2456
2600
  # 5. Attach DOI column
2457
2601
  df["doi"] = doi
2458
2602
 
2459
- # 6. Sort by generation, then variant_id
2460
- df = df.sort_values(["generation", "variant_id"], kind="mergesort")
2603
+ # 6. Sort by campaign_id, then generation
2604
+ df = df.sort_values(["campaign_id", "generation"], kind="mergesort")
2461
2605
 
2462
2606
  # 7. Log final state
2463
2607
  aa_count = (~df['aa_seq'].isna()).sum()