debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +623 -234
- debase/lineage_format.py +113 -11
- debase/reaction_info_extractor.py +21 -7
- debase/substrate_scope_extractor.py +516 -67
- debase/wrapper.py +301 -67
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/METADATA +1 -1
- debase-0.1.17.dist-info/RECORD +17 -0
- debase-0.1.11.dist-info/RECORD +0 -17
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/WHEEL +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/entry_points.txt +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -188,11 +188,17 @@ class VariantRecord:
     # Reaction-related -------------------------------------------------------------
     def substrate_iupac(self) -> List[str]:
         raw = str(self.row.get("substrate_iupac_list", "")).strip()
-        return _split_list(raw)
+        result = _split_list(raw)
+        if not result and raw and raw.lower() != 'nan':
+            log.debug(f"substrate_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
+        return result
 
     def product_iupac(self) -> List[str]:
         raw = str(self.row.get("product_iupac_list", "")).strip()
-        return _split_list(raw)
+        result = _split_list(raw)
+        if not result and raw and raw.lower() != 'nan':
+            log.debug(f"product_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
+        return result
 
 
     def ttn_or_yield(self) -> Optional[float]:
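The rewritten accessors now leave a debug trace whenever a non-empty raw value parses to an empty list. A minimal standalone sketch of that pattern, with a toy row, an assumed empty `_split_list` result, and `print` standing in for `log.debug`:

row = {"substrate_iupac_list": "; ;"}  # toy value, assumed unparseable
raw = str(row.get("substrate_iupac_list", "")).strip()
result = []  # what _split_list is assumed to return for this input
if not result and raw and raw.lower() != 'nan':
    print(f"substrate_iupac_list: raw='{raw}', parsed={result}")  # stands in for log.debug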
@@ -377,6 +383,53 @@ def _nt_mut(parent_aa: str, child_aa: str, parent_nt: str = "", child_nt: str =
 
 # === 6. SMILES CONVERSION HELPERS ==================================================
 
+def search_smiles_with_gemini(compound_name: str, model=None) -> Optional[str]:
+    """
+    Use Gemini to search for SMILES strings of complex compounds.
+    Returns SMILES string if found, None otherwise.
+    """
+    if not compound_name or compound_name.lower() in ['nan', 'none', '']:
+        return None
+
+    if not model:
+        try:
+            # Import get_model from enzyme_lineage_extractor
+            import sys
+            from pathlib import Path
+            sys.path.append(str(Path(__file__).parent))
+            from enzyme_lineage_extractor import get_model
+            model = get_model()
+        except Exception as e:
+            log.warning(f"Could not load Gemini model: {e}")
+            return None
+
+    prompt = f"""Search for the SMILES string representation of this chemical compound:
+"{compound_name}"
+
+IMPORTANT:
+- Do NOT generate or create a SMILES string
+- Only provide SMILES that you can find in chemical databases or literature
+- For deuterated compounds, search for the specific isotope-labeled SMILES
+- If you cannot find the exact SMILES, say "NOT FOUND"
+
+Return ONLY the SMILES string if found, or "NOT FOUND" if not found.
+No explanation or additional text."""
+
+    try:
+        response = model.generate_content(prompt)
+        result = response.text.strip()
+
+        if result and result != "NOT FOUND" and not result.startswith("I"):
+            # Basic validation that it looks like SMILES
+            if any(c in result for c in ['C', 'c', 'N', 'O', 'S', 'P', '[', ']', '(', ')']):
+                log.info(f"Gemini found SMILES for '{compound_name}': {result}")
+                return result
+        return None
+    except Exception as e:
+        log.debug(f"Gemini SMILES search failed for '{compound_name}': {e}")
+        return None
+
+
 def _split_list(raw: str) -> List[str]:
     if not raw or str(raw).lower() == 'nan':
         return []
@@ -429,7 +482,12 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
     except FileNotFoundError:
         pass  # OPSIN not installed
 
-    # 3. PubChem PUG REST (online) ---------------------------------------------
+    # 3. Gemini search (for complex compounds) ---------------------------------
+    gemini_smiles = search_smiles_with_gemini(name)
+    if gemini_smiles:
+        return gemini_smiles
+
+    # 4. PubChem PUG REST (online) ---------------------------------------------
     try:
         import requests
 
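With this hunk the name-to-SMILES fallback chain gains a Gemini lookup between OPSIN and PubChem. A hedged usage sketch of the new helper, reusing one model across lookups instead of letting each call re-import it (the compound names are invented examples, and `get_model()` assumes Gemini credentials are already configured):

from debase.enzyme_lineage_extractor import get_model
from debase.lineage_format import search_smiles_with_gemini

model = get_model()  # load Gemini once and share it across lookups
for name in ["2,2,2-trifluoroethyl acetate", "toluene-d8"]:  # invented examples
    smiles = search_smiles_with_gemini(name, model=model)
    print(name, "->", smiles or "unresolved")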
@@ -538,13 +596,23 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: D
 
 def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
     """Infer lineage roots using generation numbers and simple sequence similarity."""
-    idmap = {str(r["enzyme_id"]): r for _, r in df.iterrows()}
+    # Create idmap, handling missing enzyme_id gracefully
+    idmap: Dict[str, Dict[str, str]] = {}
+    for _, r in df.iterrows():
+        eid = r.get("enzyme_id")
+        if pd.isna(eid) or str(eid).strip() == "":
+            continue
+        idmap[str(eid)] = r
     roots: Dict[str, str] = {}
     # Look for generation 0 as the root
-    gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "0"}
+    gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+            if str(r.get("generation", "")).strip() == "0"
+            and not pd.isna(r.get("enzyme_id"))}
     # If no gen0 found, fall back to gen1
     if not gen0:
-        gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "1"}
+        gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+                if str(r.get("generation", "")).strip() == "1"
+                and not pd.isna(r.get("enzyme_id"))}
 
     def _seq_sim(a: str, b: str) -> float:
         if not a or not b:
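For reference, `_seq_sim` (its tail is visible in the context lines) scores identity against the longer of the two sequences. A worked sketch, assuming the match count is positionwise over zipped characters:

def _seq_sim(a: str, b: str) -> float:
    if not a or not b:
        return 0.0
    matches = sum(x == y for x, y in zip(a, b))  # assumed positionwise match count
    return matches / max(len(a), len(b))

print(_seq_sim("MKLV", "MKIV"))  # 3 of 4 positions match -> 0.75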
@@ -553,7 +621,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
         return matches / max(len(a), len(b))
 
     for _, row in df.iterrows():
-        eid = row["enzyme_id"]
+        eid = row.get("enzyme_id")
+        if pd.isna(eid) or str(eid).strip() == "":
+            continue
         if eid in gen0:
             roots[eid] = eid
             continue
@@ -593,6 +663,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
 
 def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Main public API: returns a DataFrame in the flat output format."""
+    log.info(f"Starting flatten_dataframe with {len(df)} input rows")
+    log.info(f"Input columns: {list(df.columns)}")
+
     # Apply column aliases to the dataframe
     for alias, canonical in COLUMN_ALIASES.items():
         if alias in df.columns and canonical not in df.columns:
@@ -621,8 +694,29 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)
 
     # 3. Flatten rows ---------------------------------------------------------
-    idmap = {str(r["enzyme_id"]): r.to_dict() for _, r in df.iterrows()}
+    # Create idmap for parent lookups, but note this will only keep last occurrence of duplicates
+    idmap = {}
+    for _, r in df.iterrows():
+        eid = str(r["enzyme_id"])
+        if eid in idmap:
+            log.debug(f"Overwriting duplicate enzyme_id in idmap: {eid}")
+        idmap[eid] = r.to_dict()
+
+    # Check for duplicate enzyme_ids
+    enzyme_ids = [str(r["enzyme_id"]) for _, r in df.iterrows()]
+    unique_ids = set(enzyme_ids)
+    if len(enzyme_ids) != len(unique_ids):
+        log.warning(f"Found duplicate enzyme_ids! Total: {len(enzyme_ids)}, Unique: {len(unique_ids)}")
+        from collections import Counter
+        id_counts = Counter(enzyme_ids)
+        duplicates = {k: v for k, v in id_counts.items() if v > 1}
+        log.warning(f"Duplicate enzyme_ids: {duplicates}")
+        log.info("Note: All rows will still be processed, but parent lookups may use the last occurrence of duplicate IDs")
+
     output_rows: List[Dict[str, str]] = []
+    skipped_count = 0
+    processed_count = 0
+
     for idx, (_, row) in enumerate(df.iterrows()):
         rec = VariantRecord(row.to_dict())
         eid = rec.eid
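A self-contained illustration (invented rows) of the "last occurrence wins" behavior the new logging warns about: keying a dict on enzyme_id silently shadows earlier duplicate rows in parent lookups.

rows = [{"enzyme_id": "VAR-1", "generation": "1"},
        {"enzyme_id": "VAR-1", "generation": "2"}]  # duplicate id
idmap = {}
for r in rows:
    idmap[r["enzyme_id"]] = r  # the second assignment overwrites the first
print(idmap["VAR-1"]["generation"])  # "2" -- only the last row survives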
@@ -632,13 +726,19 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         prods = rec.product_iupac()
         data_type = rec.row.get("data_type", "")
 
-        if not subs or not prods:
-            # Skip entries without IUPAC names
+        if not prods:
+            # Skip entries without product info unless it's marked as lineage only
             if data_type == "lineage":
                 subs, prods = [""], [""]  # placeholders
             else:
-                log.debug(f"Skipping {eid}: missing substrate/product data")
+                log.info(f"Skipping enzyme_id={eid} (row {idx}) due to missing product data. prods={prods}, data_type={data_type}")
+                skipped_count += 1
                 continue
+
+        # If no substrates but we have products, use empty substrate list
+        if not subs:
+            log.debug(f"Empty substrate list for enzyme_id={eid}, using empty placeholder")
+            subs = [""]
 
         sub_smiles = [sub_cache.get(s, "") for s in subs]
         prod_smiles = [prod_cache.get(p, "") for p in prods]
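A self-contained sketch (toy records) of the reworked skip logic: rows with no products are kept with placeholders only when flagged as lineage-only, and a missing substrate list no longer forces a skip.

records = [
    {"eid": "A", "subs": ["styrene"], "prods": ["styrene oxide"], "data_type": ""},
    {"eid": "B", "subs": [], "prods": [], "data_type": "lineage"},  # kept via placeholders
    {"eid": "C", "subs": [], "prods": [], "data_type": ""},         # skipped
]
kept, skipped = [], 0
for r in records:
    subs, prods = r["subs"], r["prods"]
    if not prods:
        if r["data_type"] == "lineage":
            subs, prods = [""], [""]  # placeholders, as in the hunk above
        else:
            skipped += 1
            continue
    if not subs:
        subs = [""]  # empty-substrate placeholder
    kept.append(r["eid"])
print(kept, skipped)  # ['A', 'B'] 1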
@@ -712,7 +812,9 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
             additional_information=additional_information,
         )
         output_rows.append(flat.as_dict())
+        processed_count += 1
 
+    log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
     out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
     return out_df
 
debase/reaction_info_extractor.py
CHANGED
@@ -761,6 +761,15 @@ Ignore locations that contain data for other campaigns.
                 return line
         return page[:800]
 
+    def _ensure_rgb_pixmap(self, pix: fitz.Pixmap) -> fitz.Pixmap:
+        """Ensure pixmap is in RGB colorspace for PIL compatibility."""
+        if pix.alpha:  # RGBA -> RGB
+            pix = fitz.Pixmap(fitz.csRGB, pix)
+        elif pix.colorspace and pix.colorspace.name not in ["DeviceRGB", "DeviceGray"]:
+            # Convert unsupported colorspaces (CMYK, LAB, etc.) to RGB
+            pix = fitz.Pixmap(fitz.csRGB, pix)
+        return pix
+
     # ---- NEW: Page image helper for both figures and tables ----
     def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
         """Export the page containing the reference as PNG.
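A standalone sketch of the same normalization with public PyMuPDF APIs, outside the class ("paper.pdf" is a placeholder path):

import io
import fitz  # PyMuPDF
from PIL import Image

doc = fitz.open("paper.pdf")  # placeholder path
pix = doc.load_page(0).get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
if pix.alpha or (pix.colorspace and pix.colorspace.name not in ["DeviceRGB", "DeviceGray"]):
    pix = fitz.Pixmap(fitz.csRGB, pix)  # normalize to an RGB-based pixmap
img = Image.open(io.BytesIO(pix.tobytes("png")))  # now loads cleanly in PIL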
@@ -802,14 +811,14 @@ Ignore locations that contain data for other campaigns.
             if img_rect.y1 < cap_rect.y0:  # fully above caption
                 # Extract image bytes
                 pix = fitz.Pixmap(doc, xref)
-
-                pix = fitz.Pixmap(fitz.csRGB, pix)
+                pix = self._ensure_rgb_pixmap(pix)
                 img_bytes = pix.tobytes("png")
                 return b64encode(img_bytes).decode()
         else:
             # Extract the entire page as an image
             mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
             pix = page.get_pixmap(matrix=mat)
+            pix = self._ensure_rgb_pixmap(pix)
             img_bytes = pix.tobytes("png")
             return b64encode(img_bytes).decode()
         return None
@@ -842,11 +851,13 @@ Ignore locations that contain data for other campaigns.
             # Add the current page
             mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
             pix = doc.load_page(page_num).get_pixmap(matrix=mat)
+            pix = self._ensure_rgb_pixmap(pix)
            all_images.append(pix)
 
             # If this is the last page with the reference, also add the next page
             if i == len(pages) - 1 and page_num + 1 < doc.page_count:
                 next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
+                next_pix = self._ensure_rgb_pixmap(next_pix)
                 all_images.append(next_pix)
                 LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
 
@@ -855,14 +866,16 @@ Ignore locations that contain data for other campaigns.
 
         # If only one page, return it directly
         if len(all_images) == 1:
-            return b64encode(all_images[0].tobytes("png")).decode()
+            pix = self._ensure_rgb_pixmap(all_images[0])
+            return b64encode(pix.tobytes("png")).decode()
 
         # Combine multiple pages vertically
         if not all_images:
             return None
 
         if len(all_images) == 1:
-            return b64encode(all_images[0].tobytes("png")).decode()
+            pix = self._ensure_rgb_pixmap(all_images[0])
+            return b64encode(pix.tobytes("png")).decode()
 
         # Calculate dimensions for combined image
         total_height = sum(pix.height for pix in all_images)
@@ -903,6 +916,7 @@ Ignore locations that contain data for other campaigns.
             # Convert the page to a pixmap
             mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for quality
             combined_pix = page.get_pixmap(matrix=mat)
+            combined_pix = self._ensure_rgb_pixmap(combined_pix)
 
             # Convert to PNG and return
             img_bytes = combined_pix.tobytes("png")
@@ -2025,9 +2039,9 @@ TEXT FROM MANUSCRIPT:
         filtered = []
         for loc in locations:
             # Check caption and clues for campaign indicators
-            caption = loc.get('caption', '').lower()
-            campaign_clues = loc.get('campaign_clues', '').lower()
-            lineage_hint = loc.get('lineage_hint', '').lower()
+            caption = (loc.get('caption') or '').lower()
+            campaign_clues = (loc.get('campaign_clues') or '').lower()
+            lineage_hint = (loc.get('lineage_hint') or '').lower()
             combined_text = caption + ' ' + campaign_clues + ' ' + lineage_hint
 
             # Check if location is relevant to this campaign
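The guard matters because `dict.get(key, default)` only falls back when the key is absent; a stored None is returned as-is, so the old code's `.lower()` could raise. A minimal demonstration:

loc = {"caption": None}  # key present, value None
print(loc.get("caption", ""))    # None -- the default is ignored
print(loc.get("caption") or "")  # '' -- safe to call .lower() on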
|