debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/lineage_format.py CHANGED
@@ -188,11 +188,17 @@ class VariantRecord:
188
188
  # Reaction-related -------------------------------------------------------------
189
189
  def substrate_iupac(self) -> List[str]:
190
190
  raw = str(self.row.get("substrate_iupac_list", "")).strip()
191
- return _split_list(raw)
191
+ result = _split_list(raw)
192
+ if not result and raw and raw.lower() != 'nan':
193
+ log.debug(f"substrate_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
194
+ return result
192
195
 
193
196
  def product_iupac(self) -> List[str]:
194
197
  raw = str(self.row.get("product_iupac_list", "")).strip()
195
- return _split_list(raw)
198
+ result = _split_list(raw)
199
+ if not result and raw and raw.lower() != 'nan':
200
+ log.debug(f"product_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
201
+ return result
196
202
 
197
203
 
198
204
  def ttn_or_yield(self) -> Optional[float]:
@@ -377,6 +383,53 @@ def _nt_mut(parent_aa: str, child_aa: str, parent_nt: str = "", child_nt: str =
377
383
 
378
384
  # === 6. SMILES CONVERSION HELPERS ==================================================
379
385
 
386
def search_smiles_with_gemini(compound_name: str, model=None) -> Optional[str]:
    """Ask a Gemini model to look up the SMILES string for a compound name.

    Args:
        compound_name: Chemical name to look up. Empty values and the
            placeholders ``nan`` / ``none`` (any case, surrounding whitespace
            ignored) return ``None`` without calling the model.
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``.text`` attribute. When falsy, ``get_model()`` is imported from
            ``enzyme_lineage_extractor`` and used instead.

    Returns:
        The model's answer when it looks like a SMILES string, otherwise
        ``None`` (model unavailable, call failed, "NOT FOUND", or the reply
        was prose / an InChI rather than SMILES).
    """
    # Coerce first: pandas cells may arrive as float NaN rather than str.
    name = "" if compound_name is None else str(compound_name).strip()
    if name.lower() in ("", "nan", "none"):
        return None

    if not model:
        try:
            # Import get_model from enzyme_lineage_extractor
            import sys
            from pathlib import Path

            module_dir = str(Path(__file__).parent)
            # Guarded so repeated calls do not keep growing sys.path.
            if module_dir not in sys.path:
                sys.path.append(module_dir)
            from enzyme_lineage_extractor import get_model

            model = get_model()
        except Exception as e:
            log.warning(f"Could not load Gemini model: {e}")
            return None

    prompt = f"""Search for the SMILES string representation of this chemical compound:
"{name}"

IMPORTANT:
- Do NOT generate or create a SMILES string
- Only provide SMILES that you can find in chemical databases or literature
- For deuterated compounds, search for the specific isotope-labeled SMILES
- If you cannot find the exact SMILES, say "NOT FOUND"

Return ONLY the SMILES string if found, or "NOT FOUND" if not found.
No explanation or additional text."""

    try:
        response = model.generate_content(prompt)
        answer = (response.text or "").strip()
    except Exception as e:
        # Best-effort lookup: any model/transport failure just means "no hit".
        log.debug(f"Gemini SMILES search failed for '{name}': {e}")
        return None

    # Models sometimes wrap the answer in backticks or quotes.
    answer = answer.strip("`\"'").strip()

    if not answer or answer.upper() == "NOT FOUND":
        return None
    # A SMILES string contains no whitespace, so any whitespace means the
    # model answered in prose ("I could not find ..."). This replaces the
    # earlier startswith("I") test, which also threw away valid SMILES that
    # begin with iodine (e.g. "IC1=CC=CC=C1"). InChI identifiers are not
    # SMILES and are still rejected explicitly.
    if answer.startswith("InChI") or any(ch.isspace() for ch in answer):
        return None
    # Basic validation that it looks like SMILES
    if not any(ch in answer for ch in "CcNOSP[]()"):
        return None

    log.info(f"Gemini found SMILES for '{name}': {answer}")
    return answer
431
+
432
+
380
433
  def _split_list(raw: str) -> List[str]:
381
434
  if not raw or str(raw).lower() == 'nan':
382
435
  return []
@@ -429,7 +482,12 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
429
482
  except FileNotFoundError:
430
483
  pass # OPSIN not installed
431
484
 
432
- # 3. PubChem PUG REST (online) ---------------------------------------------
485
+ # 3. Gemini search (for complex compounds) ---------------------------------
486
+ gemini_smiles = search_smiles_with_gemini(name)
487
+ if gemini_smiles:
488
+ return gemini_smiles
489
+
490
+ # 4. PubChem PUG REST (online) ---------------------------------------------
433
491
  try:
434
492
  import requests
435
493
 
@@ -538,13 +596,23 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: D
538
596
 
539
597
  def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
540
598
  """Infer lineage roots using generation numbers and simple sequence similarity."""
541
- idmap: Dict[str, Dict[str, str]] = {str(r["enzyme_id"]): r for _, r in df.iterrows()}
599
+ # Create idmap, handling missing enzyme_id gracefully
600
+ idmap: Dict[str, Dict[str, str]] = {}
601
+ for _, r in df.iterrows():
602
+ eid = r.get("enzyme_id")
603
+ if pd.isna(eid) or str(eid).strip() == "":
604
+ continue
605
+ idmap[str(eid)] = r
542
606
  roots: Dict[str, str] = {}
543
607
  # Look for generation 0 as the root
544
- gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "0"}
608
+ gen0 = {r["enzyme_id"] for _, r in df.iterrows()
609
+ if str(r.get("generation", "")).strip() == "0"
610
+ and not pd.isna(r.get("enzyme_id"))}
545
611
  # If no gen0 found, fall back to gen1
546
612
  if not gen0:
547
- gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "1"}
613
+ gen0 = {r["enzyme_id"] for _, r in df.iterrows()
614
+ if str(r.get("generation", "")).strip() == "1"
615
+ and not pd.isna(r.get("enzyme_id"))}
548
616
 
549
617
  def _seq_sim(a: str, b: str) -> float:
550
618
  if not a or not b:
@@ -553,7 +621,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
553
621
  return matches / max(len(a), len(b))
554
622
 
555
623
  for _, row in df.iterrows():
556
- eid = row["enzyme_id"]
624
+ eid = row.get("enzyme_id")
625
+ if pd.isna(eid) or str(eid).strip() == "":
626
+ continue
557
627
  if eid in gen0:
558
628
  roots[eid] = eid
559
629
  continue
@@ -593,6 +663,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
593
663
 
594
664
  def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
595
665
  """Main public API: returns a DataFrame in the flat output format."""
666
+ log.info(f"Starting flatten_dataframe with {len(df)} input rows")
667
+ log.info(f"Input columns: {list(df.columns)}")
668
+
596
669
  # Apply column aliases to the dataframe
597
670
  for alias, canonical in COLUMN_ALIASES.items():
598
671
  if alias in df.columns and canonical not in df.columns:
@@ -621,8 +694,29 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
621
694
  # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)
622
695
 
623
696
  # 3. Flatten rows ---------------------------------------------------------
624
- idmap = {str(r["enzyme_id"]): r.to_dict() for _, r in df.iterrows()}
697
+ # Create idmap for parent lookups, but note this will only keep last occurrence of duplicates
698
+ idmap = {}
699
+ for _, r in df.iterrows():
700
+ eid = str(r["enzyme_id"])
701
+ if eid in idmap:
702
+ log.debug(f"Overwriting duplicate enzyme_id in idmap: {eid}")
703
+ idmap[eid] = r.to_dict()
704
+
705
+ # Check for duplicate enzyme_ids
706
+ enzyme_ids = [str(r["enzyme_id"]) for _, r in df.iterrows()]
707
+ unique_ids = set(enzyme_ids)
708
+ if len(enzyme_ids) != len(unique_ids):
709
+ log.warning(f"Found duplicate enzyme_ids! Total: {len(enzyme_ids)}, Unique: {len(unique_ids)}")
710
+ from collections import Counter
711
+ id_counts = Counter(enzyme_ids)
712
+ duplicates = {k: v for k, v in id_counts.items() if v > 1}
713
+ log.warning(f"Duplicate enzyme_ids: {duplicates}")
714
+ log.info("Note: All rows will still be processed, but parent lookups may use the last occurrence of duplicate IDs")
715
+
625
716
  output_rows: List[Dict[str, str]] = []
717
+ skipped_count = 0
718
+ processed_count = 0
719
+
626
720
  for idx, (_, row) in enumerate(df.iterrows()):
627
721
  rec = VariantRecord(row.to_dict())
628
722
  eid = rec.eid
@@ -632,13 +726,19 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
632
726
  prods = rec.product_iupac()
633
727
  data_type = rec.row.get("data_type", "")
634
728
 
635
- if not subs or not prods:
636
- # Skip entries without reaction info unless it's marked as lineage only
729
+ if not prods:
730
+ # Skip entries without product info unless it's marked as lineage only
637
731
  if data_type == "lineage":
638
732
  subs, prods = [""], [""] # placeholders
639
733
  else:
640
- log.debug("Skipping %s due to missing reaction data", eid)
734
+ log.info(f"Skipping enzyme_id={eid} (row {idx}) due to missing product data. prods={prods}, data_type={data_type}")
735
+ skipped_count += 1
641
736
  continue
737
+
738
+ # If no substrates but we have products, use empty substrate list
739
+ if not subs:
740
+ log.debug(f"Empty substrate list for enzyme_id={eid}, using empty placeholder")
741
+ subs = [""]
642
742
 
643
743
  sub_smiles = [sub_cache.get(s, "") for s in subs]
644
744
  prod_smiles = [prod_cache.get(p, "") for p in prods]
@@ -712,7 +812,9 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
712
812
  additional_information=additional_information,
713
813
  )
714
814
  output_rows.append(flat.as_dict())
815
+ processed_count += 1
715
816
 
817
+ log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
716
818
  out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
717
819
  return out_df
718
820
 
@@ -761,6 +761,15 @@ Ignore locations that contain data for other campaigns.
761
761
  return line
762
762
  return page[:800]
763
763
 
764
def _ensure_rgb_pixmap(self, pix: fitz.Pixmap) -> fitz.Pixmap:
    """Return a pixmap suitable for PNG/PIL export.

    A pixmap that carries an alpha channel, or whose colorspace is something
    other than DeviceRGB / DeviceGray (CMYK, LAB, etc.), is rebuilt through
    ``fitz.Pixmap(fitz.csRGB, pix)``. Any other pixmap is returned unchanged.
    """
    # Alpha is checked first; the colorspace is only inspected without alpha.
    if pix.alpha:
        return fitz.Pixmap(fitz.csRGB, pix)

    colorspace = pix.colorspace
    if colorspace and colorspace.name not in ("DeviceRGB", "DeviceGray"):
        # Convert unsupported colorspaces (CMYK, LAB, etc.) to RGB
        return fitz.Pixmap(fitz.csRGB, pix)

    return pix
772
+
764
773
  # ---- NEW: Page image helper for both figures and tables ----
765
774
  def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
766
775
  """Export the page containing the reference as PNG.
@@ -802,14 +811,14 @@ Ignore locations that contain data for other campaigns.
802
811
  if img_rect.y1 < cap_rect.y0: # fully above caption
803
812
  # Extract image bytes
804
813
  pix = fitz.Pixmap(doc, xref)
805
- if pix.alpha: # RGBA -> RGB
806
- pix = fitz.Pixmap(fitz.csRGB, pix)
814
+ pix = self._ensure_rgb_pixmap(pix)
807
815
  img_bytes = pix.tobytes("png")
808
816
  return b64encode(img_bytes).decode()
809
817
  else:
810
818
  # Extract the entire page as an image
811
819
  mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality
812
820
  pix = page.get_pixmap(matrix=mat)
821
+ pix = self._ensure_rgb_pixmap(pix)
813
822
  img_bytes = pix.tobytes("png")
814
823
  return b64encode(img_bytes).decode()
815
824
  return None
@@ -842,11 +851,13 @@ Ignore locations that contain data for other campaigns.
842
851
  # Add the current page
843
852
  mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality
844
853
  pix = doc.load_page(page_num).get_pixmap(matrix=mat)
854
+ pix = self._ensure_rgb_pixmap(pix)
845
855
  all_images.append(pix)
846
856
 
847
857
  # If this is the last page with the reference, also add the next page
848
858
  if i == len(pages) - 1 and page_num + 1 < doc.page_count:
849
859
  next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
860
+ next_pix = self._ensure_rgb_pixmap(next_pix)
850
861
  all_images.append(next_pix)
851
862
  LOGGER.info(f"Added next page: page {page_num + 2}") # +2 because page numbers are 1-based for users
852
863
 
@@ -855,14 +866,16 @@ Ignore locations that contain data for other campaigns.
855
866
 
856
867
  # If only one page, return it directly
857
868
  if len(all_images) == 1:
858
- return b64encode(all_images[0].tobytes("png")).decode()
869
+ pix = self._ensure_rgb_pixmap(all_images[0])
870
+ return b64encode(pix.tobytes("png")).decode()
859
871
 
860
872
  # Combine multiple pages vertically
861
873
  if not all_images:
862
874
  return None
863
875
 
864
876
  if len(all_images) == 1:
865
- return b64encode(all_images[0].tobytes("png")).decode()
877
+ pix = self._ensure_rgb_pixmap(all_images[0])
878
+ return b64encode(pix.tobytes("png")).decode()
866
879
 
867
880
  # Calculate dimensions for combined image
868
881
  total_height = sum(pix.height for pix in all_images)
@@ -903,6 +916,7 @@ Ignore locations that contain data for other campaigns.
903
916
  # Convert the page to a pixmap
904
917
  mat = fitz.Matrix(2.0, 2.0) # 2x zoom for quality
905
918
  combined_pix = page.get_pixmap(matrix=mat)
919
+ combined_pix = self._ensure_rgb_pixmap(combined_pix)
906
920
 
907
921
  # Convert to PNG and return
908
922
  img_bytes = combined_pix.tobytes("png")
@@ -2025,9 +2039,9 @@ TEXT FROM MANUSCRIPT:
2025
2039
  filtered = []
2026
2040
  for loc in locations:
2027
2041
  # Check caption and clues for campaign indicators
2028
- caption = loc.get('caption', '').lower()
2029
- campaign_clues = loc.get('campaign_clues', '').lower()
2030
- lineage_hint = loc.get('lineage_hint', '').lower()
2042
+ caption = (loc.get('caption') or '').lower()
2043
+ campaign_clues = (loc.get('campaign_clues') or '').lower()
2044
+ lineage_hint = (loc.get('lineage_hint') or '').lower()
2031
2045
  combined_text = caption + ' ' + campaign_clues + ' ' + lineage_hint
2032
2046
 
2033
2047
  # Check if location is relevant to this campaign