debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/lineage_format.py CHANGED
@@ -553,6 +553,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
     This function:
     1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
     2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
+    3. Uses Gemini API for intelligent matching when exact matches fail
     """
     # Step 1: Clean up 3a data format
     log.info("Cleaning up reaction data (3a) format...")
@@ -564,6 +565,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
 
     # Step 2: Create sequence lookup from cleaned 3a data
     seq_lookup = {}
+    campaign_enzymes = {}  # Track enzymes by campaign for Gemini matching
 
     # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
     reaction_entries = df[df.get("data_type") == "lineage"]
@@ -584,7 +586,9 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             "aa_sequence": aa_seq,
             "nt_sequence": nt_seq if nt_seq != "nan" else "",
             "campaign_id": campaign_id,
-            "enzyme_id": eid
+            "enzyme_id": eid,
+            "generation": str(row.get("generation", "")),
+            "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
         }
 
         # Also keep simple enzyme_id lookup as fallback
@@ -592,16 +596,41 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             "aa_sequence": aa_seq,
             "nt_sequence": nt_seq if nt_seq != "nan" else "",
             "campaign_id": campaign_id,
-            "enzyme_id": eid
+            "enzyme_id": eid,
+            "generation": str(row.get("generation", "")),
+            "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
         }
+
+        # Track enzymes by campaign for Gemini matching
+        if campaign_id not in campaign_enzymes:
+            campaign_enzymes[campaign_id] = []
+        campaign_enzymes[campaign_id].append({
+            "enzyme_id": eid,
+            "has_sequence": True,
+            "generation": str(row.get("generation", "")),
+            "parent_id": str(row.get("parent_enzyme_id", ""))
+        })
 
     log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
 
+    # Setup Gemini if available
+    gemini_model = None
+    if GEMINI_OK and GEMINI_API_KEY:
+        try:
+            genai.configure(api_key=GEMINI_API_KEY)
+            gemini_model = genai.GenerativeModel('gemini-1.5-flash')
+            log.info("Gemini API configured for intelligent enzyme matching")
+        except Exception as e:
+            log.warning(f"Failed to configure Gemini API: {e}")
+
     # Step 3: Fill missing sequences in substrate scope entries (3b)
     substrate_entries = df[df.get("data_type") == "substrate_scope"]
     log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
 
     filled_count = 0
+    gemini_matched_count = 0
+    unmatched_enzymes = []  # Track enzymes that need Gemini matching
+
     for idx, row in df.iterrows():
         if row.get("data_type") != "substrate_scope":
             continue
@@ -620,6 +649,8 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             if seq_lookup[composite_key]["nt_sequence"]:
                 df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
                 df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+            df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+            df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
             filled_count += 1
             log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
 
@@ -630,18 +661,182 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             if seq_lookup[eid]["nt_sequence"]:
                 df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
                 df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
+            df.at[idx, "generation"] = seq_lookup[eid]["generation"]
+            df.at[idx, "parent_enzyme_id"] = seq_lookup[eid]["parent_enzyme_id"]
             filled_count += 1
             log.debug(f"Filled sequence for {eid} (fallback lookup)")
 
         else:
-            log.warning(f"No sequence found for enzyme_id={eid} in campaign {campaign_id}")
+            # Collect for Gemini matching
+            unmatched_enzymes.append({
+                "idx": idx,
+                "enzyme_id": eid,
+                "campaign_id": campaign_id
+            })
+
+    # Step 4: Use Gemini for intelligent matching of unmatched enzymes
+    if unmatched_enzymes and gemini_model:
+        log.info(f"Using Gemini to intelligently match {len(unmatched_enzymes)} unmatched enzymes")
+
+        # Group unmatched enzymes by campaign
+        unmatched_by_campaign = {}
+        for entry in unmatched_enzymes:
+            cid = entry["campaign_id"]
+            if cid not in unmatched_by_campaign:
+                unmatched_by_campaign[cid] = []
+            unmatched_by_campaign[cid].append(entry)
+
+        # Process each campaign
+        for campaign_id, entries in unmatched_by_campaign.items():
+            if campaign_id not in campaign_enzymes or not campaign_enzymes[campaign_id]:
+                log.warning(f"No enzymes with sequences found in campaign {campaign_id}")
+                continue
+
+            # Get enzyme IDs that need matching
+            unmatched_ids = [e["enzyme_id"] for e in entries]
+
+            # Get available enzymes in this campaign
+            available_ids = [e["enzyme_id"] for e in campaign_enzymes[campaign_id] if e["has_sequence"]]
+
+            if not available_ids:
+                log.warning(f"No enzymes with sequences available in campaign {campaign_id}")
+                continue
+
+            # Create prompt for Gemini
+            prompt = f"""Match enzyme variant IDs from substrate scope data to their corresponding sequences in reaction data.
+These are from the same campaign ({campaign_id}) but may use slightly different naming conventions.
+
+Enzymes needing sequences (from substrate scope):
+{json.dumps(unmatched_ids, indent=2)}
+
+Enzymes with sequences available (from reaction data):
+{json.dumps(available_ids, indent=2)}
+
+Match each enzyme from the first list to its corresponding enzyme in the second list.
+Consider variations like:
+- Case differences (p411-hf vs P411-HF)
+- Underscore vs hyphen (p411_hf vs p411-hf)
+- Additional prefixes/suffixes
+- Similar naming patterns within the campaign
+
+Return ONLY a JSON object mapping substrate scope IDs to reaction data IDs:
+{{"substrate_scope_id": "reaction_data_id", ...}}
+
+Only include matches you are confident about. If no match exists, omit that enzyme.
+"""
+
+            try:
+                response = gemini_model.generate_content(prompt)
+                mapping_text = response.text.strip()
+
+                # Extract JSON from response
+                if '```json' in mapping_text:
+                    mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
+                elif '```' in mapping_text:
+                    mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
+
+                mapping = json.loads(mapping_text)
+
+                # Apply the matches
+                for entry in entries:
+                    substrate_id = entry["enzyme_id"]
+                    if substrate_id in mapping:
+                        matched_id = mapping[substrate_id]
+                        composite_key = f"{campaign_id}_{matched_id}"
+
+                        if composite_key in seq_lookup:
+                            idx = entry["idx"]
+                            df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            if seq_lookup[composite_key]["nt_sequence"]:
+                                df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                                df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+
+                            # Also copy generation and parent_enzyme_id
+                            df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+                            df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
+
+                            # Store the match for later mutation copying
+                            df.at[idx, "_matched_enzyme_id"] = matched_id
+                            df.at[idx, "_matched_campaign_id"] = campaign_id
+
+                            gemini_matched_count += 1
+                            log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
+                        else:
+                            log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
+
+            except Exception as e:
+                log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
 
-    if filled_count > 0:
-        log.info(f"Successfully filled sequences for {filled_count} substrate scope entries")
+    # Final logging
+    total_filled = filled_count + gemini_matched_count
+    if total_filled > 0:
+        log.info(f"Successfully filled sequences for {total_filled} substrate scope entries "
+                 f"({filled_count} exact matches, {gemini_matched_count} Gemini matches)")
+
+    # Log any remaining unmatched
+    for entry in unmatched_enzymes:
+        if not any(df.at[entry["idx"], col] for col in ["protein_sequence", "aa_sequence"]
+                   if col in df.columns and df.at[entry["idx"], col]):
+            log.warning(f"No sequence found for enzyme_id={entry['enzyme_id']} in campaign {entry['campaign_id']}")
 
     return df
 
 
+def _copy_mutations_from_matched_enzymes(out_df: pd.DataFrame, orig_df: pd.DataFrame) -> pd.DataFrame:
+    """Copy nucleotide_mutation and amino_acid_substitutions from matched enzymes.
+
+    This function looks for entries that were matched by Gemini and copies their
+    mutation information from the corresponding matched enzyme.
+    """
+    # Look for entries with _matched_enzyme_id (these were matched by Gemini)
+    if "_matched_enzyme_id" not in orig_df.columns:
+        return out_df
+
+    matched_entries = orig_df[orig_df["_matched_enzyme_id"].notna()]
+
+    if len(matched_entries) == 0:
+        return out_df
+
+    log.info(f"Copying mutations for {len(matched_entries)} Gemini-matched entries")
+
+    # Create a lookup of mutations from the output dataframe
+    mutation_lookup = {}
+    for idx, row in out_df.iterrows():
+        key = f"{row['campaign_id']}_{row['id']}"  # 'id' is the enzyme_id in output
+        mutation_lookup[key] = {
+            "nucleotide_mutation": row.get("nucleotide_mutation", ""),
+            "amino_acid_substitutions": row.get("amino_acid_substitutions", "")
+        }
+
+    # Copy mutations for matched entries
+    mutations_copied = 0
+    for idx, row in out_df.iterrows():
+        # Check if this row needs mutation copying
+        # Find the original row in orig_df with the same enzyme_id and campaign_id
+        orig_mask = (orig_df["enzyme_id"] == row["id"]) & (orig_df["campaign_id"] == row["campaign_id"])
+        orig_rows = orig_df[orig_mask]
+
+        if len(orig_rows) > 0 and "_matched_enzyme_id" in orig_rows.columns:
+            orig_row = orig_rows.iloc[0]
+            if pd.notna(orig_row.get("_matched_enzyme_id")):
+                # This was a Gemini-matched entry
+                matched_id = orig_row["_matched_enzyme_id"]
+                matched_campaign = orig_row["_matched_campaign_id"]
+                lookup_key = f"{matched_campaign}_{matched_id}"
+
+                if lookup_key in mutation_lookup:
+                    out_df.at[idx, "nucleotide_mutation"] = mutation_lookup[lookup_key]["nucleotide_mutation"]
+                    out_df.at[idx, "amino_acid_substitutions"] = mutation_lookup[lookup_key]["amino_acid_substitutions"]
+                    mutations_copied += 1
+                    log.debug(f"Copied mutations for {row['id']} from {matched_id}")
+
+    if mutations_copied > 0:
+        log.info(f"Successfully copied mutations for {mutations_copied} entries")
+
+    return out_df
+
+
 def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
     """Use Gemini API to identify parent enzymes for entries with missing parent information."""
     if not GEMINI_OK:
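The matching flow added above reduces to: send both ID lists, ask for a bare JSON object, strip any markdown fences the model wraps around its answer, then json.loads the rest. A minimal standalone sketch of that call pattern, assuming the google-generativeai package and a GEMINI_API_KEY environment variable (the prompt wording and function name here are illustrative, not the package's exact text):

import json
import os

import google.generativeai as genai

def gemini_match_ids(unmatched_ids: list, available_ids: list) -> dict:
    """Ask Gemini for a JSON mapping between two lists of variant IDs."""
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
    model = genai.GenerativeModel('gemini-1.5-flash')
    prompt = (
        "Match each enzyme ID in the first list to its counterpart in the second list.\n"
        f"First list: {json.dumps(unmatched_ids)}\n"
        f"Second list: {json.dumps(available_ids)}\n"
        'Return ONLY a JSON object like {"first_list_id": "second_list_id"}.'
    )
    text = model.generate_content(prompt).text.strip()
    # Strip markdown fences, as the package code above does, before parsing.
    if '```json' in text:
        text = text.split('```json')[1].split('```')[0].strip()
    elif '```' in text:
        text = text.split('```')[1].split('```')[0].strip()
    return json.loads(text)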
@@ -885,11 +1080,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # Fill missing sequences in substrate scope entries from lineage data
     df = _fill_missing_sequences(df)
 
-    # Use Gemini API to identify parent enzymes for entries with missing sequences
-    df = _identify_parents_with_gemini(df)
-
-    # Fill sequences again after parent identification to propagate sequences from identified parents
-    df = _fill_missing_sequences(df)
+    # Note: Removed parent identification - we only want exact variant matching
 
     # 1. Generate lineage roots once -----------------------------------------
     lineage_roots = _generate_lineage_roots(df)
@@ -1095,6 +1286,10 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
     log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
     out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
+
+    # Post-process: Copy mutations from matched enzymes for Gemini-matched substrate scope entries
+    out_df = _copy_mutations_from_matched_enzymes(out_df, df)
+
     return out_df
 
 
@@ -1137,10 +1332,24 @@ def run_pipeline(reaction_csv: str | Path | None = None,
     if not dfs:
         raise ValueError("At least one input CSV must be provided")
 
-    # Combine dataframes
+    # Combine dataframes with deduplication
     if len(dfs) > 1:
         df_in = pd.concat(dfs, ignore_index=True)
-        log.info("Combined data: %d total entries", len(df_in))
+        log.info("Combined data: %d total entries (before deduplication)", len(df_in))
+
+        # Deduplicate based on unique combination of campaign, variant, fitness, and product
+        # Define the key columns that should be unique
+        unique_cols = ['campaign_id', 'enzyme_id', 'product_list']
+
+        # Check if we have these columns
+        available_cols = [col for col in unique_cols if col in df_in.columns]
+
+        if len(available_cols) >= 2:  # Need at least campaign_id and enzyme_id
+            # Keep the first occurrence of each unique combination
+            df_in = df_in.drop_duplicates(subset=available_cols, keep='first')
+            log.info("After deduplication on %s: %d entries", available_cols, len(df_in))
+        else:
+            log.warning("Could not deduplicate - missing required columns: %s", unique_cols)
     else:
         df_in = dfs[0]
 
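The deduplication added above keeps the first occurrence of each (campaign_id, enzyme_id, product_list) combination. Note that the comment mentions fitness but the subset does not include it, so two rows that differ only in fitness collapse to whichever came first. A toy illustration of that behavior:

import pandas as pd

df_in = pd.DataFrame({
    "campaign_id":  ["c1", "c1", "c1"],
    "enzyme_id":    ["P411-HF", "P411-HF", "P411-G8"],
    "product_list": ["prod A", "prod A", "prod B"],
    "fitness":      [1.2, 1.9, 3.4],  # differs on the duplicated row
})
deduped = df_in.drop_duplicates(subset=["campaign_id", "enzyme_id", "product_list"], keep="first")
print(len(df_in), "->", len(deduped))  # 3 -> 2; the fitness=1.9 row is dropped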
@@ -58,7 +58,7 @@ class Config:
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 4096
+    max_tokens: int = 12288  # Increased 3x from 4096
     pdf_cache_size: int = 8
     retries: int = 2
 
@@ -209,7 +209,7 @@ def _cached_gemini_call(
         parts,
         generation_config={
             "temperature": temperature,
-            "max_output_tokens": 8192,
+            "max_output_tokens": 24576,  # Increased 3x from 8192
         }
     )
     # Track token usage if available
@@ -450,7 +450,7 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
 
-Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
+Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
 Do not include too much sources, just return 2 or 3 sources.
 Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
 When returning confidence scores, be more accurate and avoid scores that are too close together.
@@ -703,6 +703,14 @@ CRITICAL - NO HALLUCINATION:
 - If no IUPAC name is found for a compound, return null for iupac_name
 - Include ALL compounds found or referenced
 
+IMPORTANT - ONE NAME PER COMPOUND:
+- Return ONLY ONE IUPAC name per compound identifier
+- If multiple names are found for the same compound, choose the one most likely to be the IUPAC name:
+  1. Names explicitly labeled as "IUPAC name:" in the text
+  2. Names in compound characterization sections
+  3. The most systematic/complete chemical name
+- Do NOT return multiple IUPAC names in a single iupac_name field
+
 Return as JSON:
 {
   "compound_mappings": [
@@ -1907,8 +1915,14 @@ TEXT FROM MANUSCRIPT:
                 f.write(prompt)
             LOGGER.info("Full prompt saved to: %s", prompt_file)
 
-        # Make multimodal API call
-        response = self.model.generate_content(content_parts)
+        # Make multimodal API call with increased token limit
+        response = self.model.generate_content(
+            content_parts,
+            generation_config={
+                "temperature": 0.0,
+                "max_output_tokens": 24576,  # Increased 3x for compound mapping
+            }
+        )
 
         # Track token usage if available
         try:
@@ -1971,6 +1985,7 @@ TEXT FROM MANUSCRIPT:
         compound_ids: List[str],
         initial_sections: List[str] = None,
         campaign_filter: Optional[str] = None,
+        iupac_location_hint: Optional[Dict[str, Any]] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
 
@@ -2002,14 +2017,57 @@ TEXT FROM MANUSCRIPT:
         LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
                     len(uncached_compound_ids), sorted(uncached_compound_ids))
 
-        # Tier 1: Standard sections (manuscript + initial SI sections)
-        initial_sections = initial_sections or [
-            "General procedure", "Compound characterization",
-            "Synthesis", "Experimental", "Materials and methods"
-        ]
-
-        # Include manuscript pages (first 10) for model reaction context
-        manuscript_text = "\n\n".join(self.ms_pages[:10])
+        # Tier 1: Use IUPAC location hint if provided, otherwise standard sections
+        if iupac_location_hint and iupac_location_hint.get('location'):
+            LOGGER.info("Tier 1: Using IUPAC location hint: %s", iupac_location_hint.get('location'))
+            if iupac_location_hint.get('compound_section_hint'):
+                LOGGER.info("Tier 1: Compound section hint: %s", iupac_location_hint.get('compound_section_hint'))
+
+            # Extract text from the specific IUPAC location
+            iupac_text = self._get_extended_text_around_location(
+                iupac_location_hint['location'],
+                before=2000,
+                after=10000
+            )
+
+            # Also check for compound-specific hints
+            compound_hint = iupac_location_hint.get('compound_section_hint', '')
+            if compound_hint and iupac_text:
+                # Search for the specific compound section
+                hint_pattern = re.escape(compound_hint)
+                match = re.search(hint_pattern, iupac_text, re.IGNORECASE)
+                if match:
+                    # Extract more focused text around the compound hint
+                    start = max(0, match.start() - 500)
+                    end = min(len(iupac_text), match.end() + 2000)
+                    iupac_text = iupac_text[start:end]
+                    LOGGER.info("Found compound hint '%s' in IUPAC section", compound_hint)
+
+            extraction_text = iupac_text or ""
+            if extraction_text:
+                LOGGER.info("Tier 1: Extracted %d chars from IUPAC location hint", len(extraction_text))
+            else:
+                LOGGER.warning("Tier 1: No text found at IUPAC location hint")
+            # Add some manuscript context
+            manuscript_text = "\n\n".join(self.ms_pages[:5])
+        else:
+            # Fallback to standard sections
+            initial_sections = initial_sections or [
+                "General procedure", "Compound characterization",
+                "Synthesis", "Experimental", "Materials and methods"
+            ]
+
+            # Extract from initial sections - search in all pages (manuscript + SI)
+            extraction_text = self._extract_sections_by_title(initial_sections)
+
+            # If no sections found by title, include first few SI pages which often have compound data
+            if not extraction_text and self.si_pages:
+                # SI often starts with compound characterization after TOC
+                si_compound_pages = "\n\n".join(self.si_pages[2:10])  # Skip first 2 pages (usually TOC)
+                extraction_text = si_compound_pages
+
+            # Include manuscript pages (first 10) for model reaction context
+            manuscript_text = "\n\n".join(self.ms_pages[:10])
 
         # Add campaign context if provided
         campaign_context = ""
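The `_get_extended_text_around_location` helper is called above with before=2000/after=10000, but its body is not part of this diff. A hypothetical stand-in (name and signature assumed) showing the character-window idea the call implies:

def get_text_window(full_text: str, anchor: str, before: int = 2000, after: int = 10000) -> str:
    """Slice a character window around the first occurrence of anchor.

    Hypothetical illustration only; the real helper presumably resolves a
    structured location descriptor (page/section) rather than a raw substring.
    """
    pos = full_text.find(anchor)
    if pos == -1:
        return ""
    start = max(0, pos - before)
    end = min(len(full_text), pos + len(anchor) + after)
    return full_text[start:end]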
@@ -2033,8 +2091,7 @@ Do NOT include compound information from other campaigns.
 
 """
 
-        # Extract from initial sections
-        extraction_text = self._extract_sections_by_title(initial_sections)
+        # Combine manuscript text, campaign context, and extraction text
         if extraction_text:
             extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
@@ -2083,11 +2140,11 @@ Do NOT include compound information from other campaigns.
                 figure_images[ref] = img_b64
                 LOGGER.info("Extracted %s for compound mapping", ref)
 
-        # Full text search including all pages
-        full_text = "\n\n".join(self.all_pages[:40])  # First 40 pages (more comprehensive)
+        # Full text search including ALL pages (manuscript + SI)
+        full_text = "\n\n".join(self.all_pages)  # Send everything
 
         final_mappings = self._extract_compound_mappings_with_figures(
-            full_text[:60000], missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
         )
 
         # Merge final mappings with better compound ID matching
@@ -2261,7 +2318,13 @@ Do NOT include compound information from other campaigns.
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids, campaign_filter=self.campaign_filter)
+            # Pass the IUPAC location hint if we have it
+            iupac_hint = locations.get("iupac_location") if locations else None
+            compound_mappings = self._extract_compound_mappings_adaptive(
+                compound_ids,
+                campaign_filter=self.campaign_filter,
+                iupac_location_hint=iupac_hint
+            )
 
         # Add the mapped IUPAC names to the context for better extraction
         if compound_mappings:
2404
2467
  LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
2405
2468
  list(compound_mappings.keys()))
2406
2469
 
2470
+ # First, populate IUPAC lists directly from compound mappings based on compound_type
2471
+ substrate_iupacs_from_mappings = []
2472
+ product_iupacs_from_mappings = []
2473
+
2474
+ for mapping in compound_mappings.values():
2475
+ if mapping.iupac_name and mapping.compound_type:
2476
+ if mapping.compound_type.lower() == "substrate":
2477
+ substrate_iupacs_from_mappings.append(mapping.iupac_name)
2478
+ LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
2479
+ elif mapping.compound_type.lower() == "product":
2480
+ product_iupacs_from_mappings.append(mapping.iupac_name)
2481
+ LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
2482
+
2483
+ # Initialize or update the IUPAC lists with mapped compounds
2484
+ if substrate_iupacs_from_mappings:
2485
+ existing_substrates = data.get("substrate_iupac_list", []) or []
2486
+ if isinstance(existing_substrates, list):
2487
+ data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
2488
+ else:
2489
+ data["substrate_iupac_list"] = substrate_iupacs_from_mappings
2490
+
2491
+ if product_iupacs_from_mappings:
2492
+ existing_products = data.get("product_iupac_list", []) or []
2493
+ if isinstance(existing_products, list):
2494
+ data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
2495
+ else:
2496
+ data["product_iupac_list"] = product_iupacs_from_mappings
2497
+
2407
2498
  # Try to map substrate/product lists through compound IDs
2408
2499
  substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
2409
2500
  if isinstance(substrate_list, list):
@@ -3021,7 +3112,14 @@ def main() -> None:
3021
3112
  campaign_filter = all_campaigns[0]
3022
3113
  LOGGER.info("Detected single campaign: %s", campaign_filter)
3023
3114
 
3024
- extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
3115
+ # Create campaign-specific debug directory even for single campaign
3116
+ campaign_debug_dir = None
3117
+ if args.debug_dir:
3118
+ campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign_filter}"
3119
+ campaign_debug_dir.mkdir(parents=True, exist_ok=True)
3120
+ LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
3121
+
3122
+ extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
3025
3123
  campaign_filter=campaign_filter, all_campaigns=all_campaigns)
3026
3124
  df_metrics = extractor.run(enzyme_df)
3027
3125
 
@@ -3041,8 +3139,14 @@ def main() -> None:
                 LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
                 continue
 
-            # Create extractor for this campaign
-            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+            # Create extractor for this campaign with campaign-specific debug directory
+            campaign_debug_dir = None
+            if args.debug_dir:
+                campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign}"
+                campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                           campaign_filter=campaign, all_campaigns=all_campaigns)
 
             # Run extraction for this campaign
@@ -3088,7 +3192,13 @@ def main() -> None:
         df_metrics = pd.DataFrame()
     else:
         # No campaign information, process all enzymes together
-        extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+        campaign_debug_dir = None
+        if args.debug_dir:
+            campaign_debug_dir = Path(args.debug_dir) / "no_campaign"
+            campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+            LOGGER.info("Debug directory (no campaign): %s", campaign_debug_dir)
+
+        extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                       campaign_filter=campaign_filter, all_campaigns=all_campaigns)
         df_metrics = extractor.run(enzyme_df)
 
@@ -28,6 +28,7 @@ import re
 import json
 import time
 import logging
+import subprocess
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict, Any, Union
@@ -103,6 +104,52 @@ class CompoundMapping:
     compound_type: str = "unknown"
     source_location: Optional[str] = None
 
+def is_valid_iupac_name_with_opsin(name: str) -> bool:
+    """Check if a name is a valid IUPAC name using the local OPSIN command."""
+    if not name or len(name.strip()) < 3:
+        return False
+
+    try:
+        # Use local OPSIN command to check if name can be converted to SMILES
+        process = subprocess.run(
+            ['opsin', '-o', 'smi'],
+            input=name.strip(),
+            text=True,
+            capture_output=True,
+            timeout=30
+        )
+
+        # If OPSIN successfully converts to SMILES, the name is valid IUPAC
+        if process.returncode == 0 and process.stdout.strip():
+            output = process.stdout.strip()
+            # Check if output looks like a valid SMILES (contains common SMILES characters)
+            if any(char in output for char in 'CNOS()=[]#+-'):
+                return True
+
+        return False
+
+    except Exception as e:
+        log.debug(f"OPSIN check failed for '{name}': {e}")
+        return False
+
+def _get_iupac_name(compound) -> str:
+    """Get IUPAC name for a compound, checking if the common name is already IUPAC."""
+    if not compound:
+        return ''
+
+    # If we already have an IUPAC name, use it
+    if compound.iupac_name:
+        return compound.iupac_name
+
+    # If no IUPAC name but we have a common name, check if it's already IUPAC
+    if compound.name:
+        # Check with OPSIN if the name is a valid IUPAC name
+        if is_valid_iupac_name_with_opsin(compound.name):
+            log.info(f"'{compound.name}' is already a valid IUPAC name, using it directly")
+            return compound.name
+
+    return ''
+
 # === 3. LOGGING HELPERS ===
 
 # --- Debug dump helper ----------------------------------------------------
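Since is_valid_iupac_name_with_opsin shells out to a local `opsin` executable, OPSIN must be on PATH (e.g. the OPSIN jar behind a wrapper script); otherwise subprocess.run raises and the except branch returns False. The names below are illustrative:

# Requires an `opsin` command on PATH; returns False on timeout or a missing binary.
print(is_valid_iupac_name_with_opsin("propan-2-ol"))   # True - OPSIN emits a SMILES
print(is_valid_iupac_name_with_opsin("crude lysate"))  # False - not a parseable chemical name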
@@ -2532,9 +2579,9 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
         'flag': '',
 
         'substrate_list': '; '.join(s.name for s in entry.substrates if s.name),
-        'substrate_iupac_list': '; '.join(s.iupac_name or '' for s in entry.substrates),
+        'substrate_iupac_list': '; '.join(_get_iupac_name(s) for s in entry.substrates),
         'product_list': '; '.join(p.name for p in entry.products if p.name),
-        'product_iupac_list': '; '.join(p.iupac_name or '' for p in entry.products),
+        'product_iupac_list': '; '.join(_get_iupac_name(p) for p in entry.products),
 
         'cofactor_list': '; '.join(c.name for c in entry.cofactors if c.name),
         'cofactor_iupac_list': '; '.join(c.iupac_name or '' for c in entry.cofactors),