debase-0.4.3-py3-none-any.whl → debase-0.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -58,7 +58,7 @@ class Config:
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 4096
+    max_tokens: int = 12288  # Increased 3x from 4096
     pdf_cache_size: int = 8
     retries: int = 2
 
@@ -209,7 +209,7 @@ def _cached_gemini_call(
         parts,
         generation_config={
             "temperature": temperature,
-            "max_output_tokens": 8192,
+            "max_output_tokens": 24576,  # Increased 3x from 8192
         }
     )
     # Track token usage if available
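
The raised limits are consumed through the generation_config dict handed to the Gemini client. A minimal sketch of the call shape, assuming the google-generativeai package this module builds on and that Config() can be constructed with its defaults (the model name is illustrative):

    import google.generativeai as genai

    cfg = Config()  # max_tokens is 12288 after this release
    model = genai.GenerativeModel("gemini-1.5-flash")  # illustrative model name
    response = model.generate_content(
        "Summarize the enzyme lineage table.",
        generation_config={
            "temperature": cfg.extract_temperature,
            "top_p": cfg.top_p,
            "max_output_tokens": cfg.max_tokens,  # _cached_gemini_call hard-codes 24576 instead
        },
    )
    print(response.text)
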
@@ -450,7 +450,7 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
 
-Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
+Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
 Do not include too much sources, just return 2 or 3 sources.
 Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
 When returning confidence scores, be more accurate and avoid scores that are too close together.
@@ -703,6 +703,14 @@ CRITICAL - NO HALLUCINATION:
 - If no IUPAC name is found for a compound, return null for iupac_name
 - Include ALL compounds found or referenced
 
+IMPORTANT - ONE NAME PER COMPOUND:
+- Return ONLY ONE IUPAC name per compound identifier
+- If multiple names are found for the same compound, choose the one most likely to be the IUPAC name:
+  1. Names explicitly labeled as "IUPAC name:" in the text
+  2. Names in compound characterization sections
+  3. The most systematic/complete chemical name
+- Do NOT return multiple IUPAC names in a single iupac_name field
+
 Return as JSON:
 {
   "compound_mappings": [
@@ -722,8 +730,8 @@ Return as JSON:
 ###############################################################################
 
 class ReactionExtractor:
-    _FIG_RE = re.compile(r"fig(?:ure)?\s+s?\d+[a-z]?", re.I)
-    _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)
+    _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
+    _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
 
     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
                  campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
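
The widened patterns now capture an optional "supplementary" prefix as part of the matched reference text, so downstream lookups see the full phrase rather than just the bare figure number. A quick check of the new behavior (the example strings are hypothetical):

    import re

    _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)

    print(_FIG_RE.search("see Supplementary Figure S3 for the full matrix").group(0))
    # -> "Supplementary Figure S3"
    print(_FIG_RE.search("as shown in Fig 2b").group(0))
    # -> "Fig 2b"
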
@@ -792,14 +800,24 @@ class ReactionExtractor:
             context_start = context_start + last_period + 1
 
         # For tables, include much more content after the caption to show actual table data
-        # For figures, keep the original limit
-        is_table = match.group(1).lower() == 'table'
-        max_chars = 5000 if is_table else 3000
-
-        # Get up to max_chars or until double newline
-        caption_end = all_text.find("\n\n", caption_start)
-        if caption_end == -1 or caption_end - caption_start > max_chars:
+        # For figures, include more content to ensure complete captions
+        is_table = 'table' in match.group(1).lower()
+        max_chars = 8000 if is_table else 5000
+
+        # Get up to max_chars or until double newline (but ensure we get complete caption)
+        # First, try to find the end of the caption sentence
+        caption_end = caption_start
+        period_pos = all_text.find('. ', caption_start)
+        if period_pos != -1 and period_pos < caption_start + 1000:
+            # Include at least to the end of the caption sentence
+            caption_end = period_pos + 1
+
+        # Then extend to include more context or until double newline
+        double_newline_pos = all_text.find("\n\n", caption_end)
+        if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
             caption_end = caption_start + max_chars
+        else:
+            caption_end = double_newline_pos
 
         # Include the context and full caption with table content
         full_caption = all_text[context_start:caption_end].strip()
@@ -1082,6 +1100,7 @@ class ReactionExtractor:
         If extract_figure_only=True, extracts just the figure above the caption.
         If False, extracts the entire page (useful for tables).
         Returns a base64-encoded PNG or None."""
+        LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
 
         # Check cache first
         cache_key = f"{ref}_{extract_figure_only}"
@@ -1099,10 +1118,18 @@ class ReactionExtractor:
             return None
 
         # For figure extraction, search both documents for actual figure captions
-        for doc in filter(None, [self.ms_doc, self.si_doc]):
+        docs = list(filter(None, [self.ms_doc, self.si_doc]))
+        LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
+
+        for doc_idx, doc in enumerate(docs):
+            doc_name = "MS" if doc_idx == 0 else "SI"
+            LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
+
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
+                LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
+                             page_number + 1, doc_name, len(page_text))
 
                 # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
                 # For subfigures like "Figure 1C", extract the main figure "Figure 1"
@@ -1150,6 +1177,14 @@ class ReactionExtractor:
                     if figure_mentions:
                         LOGGER.debug("Page %d has figure mentions but no caption match: %s",
                                      page_number, figure_mentions[:3])
+
+                    # For supplementary figures, also check for "supplementary" mentions
+                    if 'supplementary' in ref.lower():
+                        supp_mentions = [line.strip() for line in page_text.split('\n')
+                                         if 'supplementary' in line.lower() and 'figure' in line.lower()]
+                        if supp_mentions:
+                            LOGGER.warning("Found supplementary figure mentions on page %d but no caption match. First 3: %s",
+                                           page_number + 1, supp_mentions[:3])
                     continue
 
                 if extract_figure_only:
@@ -1207,6 +1242,8 @@ class ReactionExtractor:
                     # Cache the result
                     self._figure_cache.put(cache_key, result)
                     return result
+
+        LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
         return None
 
     def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
@@ -1437,6 +1474,7 @@ class ReactionExtractor:
 
     def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
         """Extract performance metrics for multiple enzymes from the identified location in batch."""
+        LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
         ref_lc = ref.lower()
         image_b64: Optional[str] = None
 
@@ -1458,11 +1496,15 @@ class ReactionExtractor:
             snippet = self._extract_table_context(ref)
         elif self._FIG_RE.search(ref_lc):
             # For figures, extract just the figure image (same logic as compound mapping)
+            LOGGER.debug("Attempting to extract figure image for '%s'", ref)
             image_b64 = self._extract_page_png(ref, extract_figure_only=True)
             if not image_b64:
-                LOGGER.debug("No figure image found for %s - using caption text", ref)
+                LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
                 snippet = self._extract_figure_caption(ref)
+                LOGGER.debug("Caption extraction result: %s",
+                             f"'{snippet[:100]}...'" if snippet else "empty")
             else:
+                LOGGER.info("Successfully extracted figure image for '%s'", ref)
                 # If figure is found, ignore text information - use image only
                 snippet = ""
         else:
@@ -1907,8 +1949,14 @@ TEXT FROM MANUSCRIPT:
                 f.write(prompt)
             LOGGER.info("Full prompt saved to: %s", prompt_file)
 
-        # Make multimodal API call
-        response = self.model.generate_content(content_parts)
+        # Make multimodal API call with increased token limit
+        response = self.model.generate_content(
+            content_parts,
+            generation_config={
+                "temperature": 0.0,
+                "max_output_tokens": 24576,  # Increased 3x for compound mapping
+            }
+        )
 
         # Track token usage if available
         try:
@@ -1971,6 +2019,7 @@ TEXT FROM MANUSCRIPT:
         compound_ids: List[str],
         initial_sections: List[str] = None,
         campaign_filter: Optional[str] = None,
+        iupac_location_hint: Optional[Dict[str, Any]] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
 
@@ -2002,14 +2051,57 @@ TEXT FROM MANUSCRIPT:
         LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
                     len(uncached_compound_ids), sorted(uncached_compound_ids))
 
-        # Tier 1: Standard sections (manuscript + initial SI sections)
-        initial_sections = initial_sections or [
-            "General procedure", "Compound characterization",
-            "Synthesis", "Experimental", "Materials and methods"
-        ]
-
-        # Include manuscript pages (first 10) for model reaction context
-        manuscript_text = "\n\n".join(self.ms_pages[:10])
+        # Tier 1: Use IUPAC location hint if provided, otherwise standard sections
+        if iupac_location_hint and iupac_location_hint.get('location'):
+            LOGGER.info("Tier 1: Using IUPAC location hint: %s", iupac_location_hint.get('location'))
+            if iupac_location_hint.get('compound_section_hint'):
+                LOGGER.info("Tier 1: Compound section hint: %s", iupac_location_hint.get('compound_section_hint'))
+
+            # Extract text from the specific IUPAC location
+            iupac_text = self._get_extended_text_around_location(
+                iupac_location_hint['location'],
+                before=2000,
+                after=10000
+            )
+
+            # Also check for compound-specific hints
+            compound_hint = iupac_location_hint.get('compound_section_hint', '')
+            if compound_hint and iupac_text:
+                # Search for the specific compound section
+                hint_pattern = re.escape(compound_hint)
+                match = re.search(hint_pattern, iupac_text, re.IGNORECASE)
+                if match:
+                    # Extract more focused text around the compound hint
+                    start = max(0, match.start() - 500)
+                    end = min(len(iupac_text), match.end() + 2000)
+                    iupac_text = iupac_text[start:end]
+                    LOGGER.info("Found compound hint '%s' in IUPAC section", compound_hint)
+
+            extraction_text = iupac_text or ""
+            if extraction_text:
+                LOGGER.info("Tier 1: Extracted %d chars from IUPAC location hint", len(extraction_text))
+            else:
+                LOGGER.warning("Tier 1: No text found at IUPAC location hint")
+            # Add some manuscript context
+            manuscript_text = "\n\n".join(self.ms_pages[:5])
+        else:
+            # Fallback to standard sections
+            initial_sections = initial_sections or [
+                "General procedure", "Compound characterization",
+                "Synthesis", "Experimental", "Materials and methods"
+            ]
+
+            # Extract from initial sections - search in all pages (manuscript + SI)
+            extraction_text = self._extract_sections_by_title(initial_sections)
+
+            # If no sections found by title, include first few SI pages which often have compound data
+            if not extraction_text and self.si_pages:
+                # SI often starts with compound characterization after TOC
+                si_compound_pages = "\n\n".join(self.si_pages[2:10])  # Skip first 2 pages (usually TOC)
+                extraction_text = si_compound_pages
+
+            # Include manuscript pages (first 10) for model reaction context
+            manuscript_text = "\n\n".join(self.ms_pages[:10])
 
         # Add campaign context if provided
         campaign_context = ""
@@ -2033,8 +2125,7 @@ Do NOT include compound information from other campaigns.
 
 """
 
-        # Extract from initial sections
-        extraction_text = self._extract_sections_by_title(initial_sections)
+        # Combine manuscript text, campaign context, and extraction text
         if extraction_text:
             extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
@@ -2083,11 +2174,11 @@ Do NOT include compound information from other campaigns.
                     figure_images[ref] = img_b64
                     LOGGER.info("Extracted %s for compound mapping", ref)
 
-        # Full text search including all pages
-        full_text = "\n\n".join(self.all_pages[:40])  # First 40 pages (more comprehensive)
+        # Full text search including ALL pages (manuscript + SI)
+        full_text = "\n\n".join(self.all_pages)  # Send everything
 
         final_mappings = self._extract_compound_mappings_with_figures(
-            full_text[:60000], missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
         )
 
         # Merge final mappings with better compound ID matching
@@ -2261,7 +2352,13 @@ Do NOT include compound information from other campaigns.
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids, campaign_filter=self.campaign_filter)
+            # Pass the IUPAC location hint if we have it
+            iupac_hint = locations.get("iupac_location") if locations else None
+            compound_mappings = self._extract_compound_mappings_adaptive(
+                compound_ids,
+                campaign_filter=self.campaign_filter,
+                iupac_location_hint=iupac_hint
+            )
 
         # Add the mapped IUPAC names to the context for better extraction
         if compound_mappings:
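
The hint forwarded here is a plain dict; only its 'location' and 'compound_section_hint' keys are read by the Tier 1 path shown earlier. A hypothetical call, assuming an already-constructed ReactionExtractor named extractor and illustrative values:

    iupac_hint = {
        "location": "SI Section S4, compound characterization",  # hypothetical location string
        "compound_section_hint": "Compound 3a",                  # hypothetical section label
    }
    mappings = extractor._extract_compound_mappings_adaptive(
        ["1", "3a"],                       # compound IDs to resolve
        campaign_filter="campaign_A",      # hypothetical campaign id
        iupac_location_hint=iupac_hint,
    )
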
@@ -2404,6 +2501,34 @@ Different campaigns may use different model reactions and substrates.
         LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
                     list(compound_mappings.keys()))
 
+        # First, populate IUPAC lists directly from compound mappings based on compound_type
+        substrate_iupacs_from_mappings = []
+        product_iupacs_from_mappings = []
+
+        for mapping in compound_mappings.values():
+            if mapping.iupac_name and mapping.compound_type:
+                if mapping.compound_type.lower() == "substrate":
+                    substrate_iupacs_from_mappings.append(mapping.iupac_name)
+                    LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
+                elif mapping.compound_type.lower() == "product":
+                    product_iupacs_from_mappings.append(mapping.iupac_name)
+                    LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
+
+        # Initialize or update the IUPAC lists with mapped compounds
+        if substrate_iupacs_from_mappings:
+            existing_substrates = data.get("substrate_iupac_list", []) or []
+            if isinstance(existing_substrates, list):
+                data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
+            else:
+                data["substrate_iupac_list"] = substrate_iupacs_from_mappings
+
+        if product_iupacs_from_mappings:
+            existing_products = data.get("product_iupac_list", []) or []
+            if isinstance(existing_products, list):
+                data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
+            else:
+                data["product_iupac_list"] = product_iupacs_from_mappings
+
         # Try to map substrate/product lists through compound IDs
         substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
         if isinstance(substrate_list, list):
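
The pre-population step above is plain list concatenation keyed on compound_type. A toy illustration with stand-in mapping objects (field names follow the CompoundMapping class; the chemical names are made up):

    from dataclasses import dataclass

    @dataclass
    class _Mapping:  # stand-in for CompoundMapping, illustration only
        iupac_name: str
        compound_type: str

    compound_mappings = {
        "1": _Mapping("1-(4-nitrophenyl)ethan-1-one", "substrate"),
        "2": _Mapping("(1R)-1-(4-nitrophenyl)ethan-1-ol", "product"),
    }
    data = {"substrate_iupac_list": None, "product_iupac_list": []}

    for key in ("substrate", "product"):
        names = [m.iupac_name for m in compound_mappings.values()
                 if m.compound_type.lower() == key]
        if names:
            existing = data.get(f"{key}_iupac_list") or []
            data[f"{key}_iupac_list"] = list(existing) + names

    print(data)  # both lists now carry the mapped IUPAC names
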
@@ -3021,7 +3146,14 @@ def main() -> None:
         campaign_filter = all_campaigns[0]
         LOGGER.info("Detected single campaign: %s", campaign_filter)
 
-        extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+        # Create campaign-specific debug directory even for single campaign
+        campaign_debug_dir = None
+        if args.debug_dir:
+            campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign_filter}"
+            campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+            LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+
+        extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                       campaign_filter=campaign_filter, all_campaigns=all_campaigns)
         df_metrics = extractor.run(enzyme_df)
 
@@ -3041,8 +3173,14 @@ def main() -> None:
                 LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
                 continue
 
-            # Create extractor for this campaign
-            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+            # Create extractor for this campaign with campaign-specific debug directory
+            campaign_debug_dir = None
+            if args.debug_dir:
+                campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign}"
+                campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                           campaign_filter=campaign, all_campaigns=all_campaigns)
 
             # Run extraction for this campaign
@@ -3088,7 +3226,13 @@ def main() -> None:
             df_metrics = pd.DataFrame()
     else:
         # No campaign information, process all enzymes together
-        extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+        campaign_debug_dir = None
+        if args.debug_dir:
+            campaign_debug_dir = Path(args.debug_dir) / "no_campaign"
+            campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+            LOGGER.info("Debug directory (no campaign): %s", campaign_debug_dir)
+
+        extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                       campaign_filter=campaign_filter, all_campaigns=all_campaigns)
         df_metrics = extractor.run(enzyme_df)
 
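
The net effect of the three hunks above is that the debug directory (args.debug_dir) is subdivided per campaign instead of being shared. A sketch of the path construction, with a hypothetical campaign name:

    from pathlib import Path

    debug_dir = Path("debug")              # value passed on the command line (args.debug_dir)
    campaign = "lactamase_evolution"       # hypothetical campaign id
    campaign_debug_dir = debug_dir / f"campaign_{campaign}"
    campaign_debug_dir.mkdir(parents=True, exist_ok=True)
    # -> debug/campaign_lactamase_evolution/
    # Enzymes with no campaign information end up under debug/no_campaign/ instead.
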
@@ -28,6 +28,7 @@ import re
 import json
 import time
 import logging
+import subprocess
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict, Any, Union
@@ -103,6 +104,52 @@ class CompoundMapping:
     compound_type: str = "unknown"
     source_location: Optional[str] = None
 
+def is_valid_iupac_name_with_opsin(name: str) -> bool:
+    """Check if a name is a valid IUPAC name using the local OPSIN command."""
+    if not name or len(name.strip()) < 3:
+        return False
+
+    try:
+        # Use local OPSIN command to check if name can be converted to SMILES
+        process = subprocess.run(
+            ['opsin', '-o', 'smi'],
+            input=name.strip(),
+            text=True,
+            capture_output=True,
+            timeout=30
+        )
+
+        # If OPSIN successfully converts to SMILES, the name is valid IUPAC
+        if process.returncode == 0 and process.stdout.strip():
+            output = process.stdout.strip()
+            # Check if output looks like a valid SMILES (contains common SMILES characters)
+            if any(char in output for char in 'CNOS()=[]#+-'):
+                return True
+
+        return False
+
+    except Exception as e:
+        log.debug(f"OPSIN check failed for '{name}': {e}")
+        return False
+
+def _get_iupac_name(compound) -> str:
+    """Get IUPAC name for a compound, checking if the common name is already IUPAC."""
+    if not compound:
+        return ''
+
+    # If we already have an IUPAC name, use it
+    if compound.iupac_name:
+        return compound.iupac_name
+
+    # If no IUPAC name but we have a common name, check if it's already IUPAC
+    if compound.name:
+        # Check with OPSIN if the name is a valid IUPAC name
+        if is_valid_iupac_name_with_opsin(compound.name):
+            log.info(f"'{compound.name}' is already a valid IUPAC name, using it directly")
+            return compound.name
+
+    return ''
+
 # === 3. LOGGING HELPERS ===
 
 # --- Debug dump helper ----------------------------------------------------
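
A usage sketch for the new OPSIN-backed validation, assuming the opsin command-line tool is installed and on PATH (the test strings are illustrative):

    for name in ["4-nitrobenzaldehyde", "compound 3a"]:
        ok = is_valid_iupac_name_with_opsin(name)
        print(f"{name!r}: {'parseable by OPSIN' if ok else 'rejected'}")
    # A systematic name converts to SMILES and passes; an internal label like "compound 3a" does not,
    # so _get_iupac_name() falls back to returning '' for such compounds.
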
@@ -2496,7 +2543,8 @@ def merge_with_lineage(
             data = lineage_map[matched_name]
             entry.parent_id = data['parent_id']
             entry.mutations = data['mutations']
-            entry.generation = data['generation']
+            # Skip generation - to be filled by lineage_format
+            # entry.generation = data['generation']
             entry.aa_seq = data['aa_seq']
             entry.dna_seq = data['dna_seq']
             entry.confidence = data['confidence']
@@ -2524,7 +2572,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
             'enzyme_id': entry.enzyme_id,
             'parent_enzyme_id': entry.parent_id or '',
             'mutations': entry.mutations or '',
-            'generation': entry.generation if entry.generation is not None else '',
+            'generation': '',  # Empty generation - to be filled by lineage_format
             'campaign_id': entry.campaign_id or '',
             'protein_sequence': entry.aa_seq or '',
             'nucleotide_sequence': entry.dna_seq or '',
@@ -2532,9 +2580,9 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
             'flag': '',
 
             'substrate_list': '; '.join(s.name for s in entry.substrates if s.name),
-            'substrate_iupac_list': '; '.join(s.iupac_name or '' for s in entry.substrates),
+            'substrate_iupac_list': '; '.join(_get_iupac_name(s) for s in entry.substrates),
             'product_list': '; '.join(p.name for p in entry.products if p.name),
-            'product_iupac_list': '; '.join(p.iupac_name or '' for p in entry.products),
+            'product_iupac_list': '; '.join(_get_iupac_name(p) for p in entry.products),
 
             'cofactor_list': '; '.join(c.name for c in entry.cofactors if c.name),
             'cofactor_iupac_list': '; '.join(c.iupac_name or '' for c in entry.cofactors),
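
With this change, a compound whose common name already parses as IUPAC contributes that name to the joined column even when iupac_name was never filled in. A small sketch with a stand-in compound record (field names mirror the entries used above; it again assumes the opsin CLI is available):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class _Compound:  # stand-in for the substrate/product entries, illustration only
        name: str
        iupac_name: Optional[str] = None

    substrates = [_Compound("4-nitrobenzaldehyde"), _Compound("substrate 1a")]
    column = '; '.join(_get_iupac_name(s) for s in substrates)
    # -> "4-nitrobenzaldehyde; " (the unparseable label contributes an empty string)
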