debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +512 -33
- debase/enzyme_lineage_extractor.py +985 -100
- debase/lineage_format.py +226 -13
- debase/reaction_info_extractor.py +178 -34
- debase/substrate_scope_extractor.py +52 -4
- debase/wrapper.py +155 -151
- debase-0.4.5.dist-info/METADATA +121 -0
- debase-0.4.5.dist-info/RECORD +16 -0
- debase-0.4.3.dist-info/METADATA +0 -296
- debase-0.4.3.dist-info/RECORD +0 -16
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/WHEEL +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/entry_points.txt +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/top_level.txt +0 -0
@@ -58,7 +58,7 @@ class Config:
|
|
58
58
|
extract_temperature: float = 0.0
|
59
59
|
model_reaction_temperature: float = 0.0
|
60
60
|
top_p: float = 1.0
|
61
|
-
max_tokens: int = 4096
|
61
|
+
max_tokens: int = 12288 # Increased 3x from 4096
|
62
62
|
pdf_cache_size: int = 8
|
63
63
|
retries: int = 2
|
64
64
|
|
@@ -209,7 +209,7 @@ def _cached_gemini_call(
|
|
209
209
|
parts,
|
210
210
|
generation_config={
|
211
211
|
"temperature": temperature,
|
212
|
-
"max_output_tokens": 8192
|
212
|
+
"max_output_tokens": 24576, # Increased 3x from 8192
|
213
213
|
}
|
214
214
|
)
|
215
215
|
# Track token usage if available
|
@@ -450,7 +450,7 @@ Respond with a JSON array where each element contains:
|
|
450
450
|
- "lineage_hint": any indication of which enzyme group this data is for (or null)
|
451
451
|
- "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
|
452
452
|
|
453
|
-
Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
|
453
|
+
Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
|
454
454
|
Do not include too much sources, just return 2 or 3 sources.
|
455
455
|
Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
|
456
456
|
When returning confidence scores, be more accurate and avoid scores that are too close together.
|
@@ -703,6 +703,14 @@ CRITICAL - NO HALLUCINATION:
|
|
703
703
|
- If no IUPAC name is found for a compound, return null for iupac_name
|
704
704
|
- Include ALL compounds found or referenced
|
705
705
|
|
706
|
+
IMPORTANT - ONE NAME PER COMPOUND:
|
707
|
+
- Return ONLY ONE IUPAC name per compound identifier
|
708
|
+
- If multiple names are found for the same compound, choose the one most likely to be the IUPAC name:
|
709
|
+
1. Names explicitly labeled as "IUPAC name:" in the text
|
710
|
+
2. Names in compound characterization sections
|
711
|
+
3. The most systematic/complete chemical name
|
712
|
+
- Do NOT return multiple IUPAC names in a single iupac_name field
|
713
|
+
|
706
714
|
Return as JSON:
|
707
715
|
{
|
708
716
|
"compound_mappings": [
|
@@ -722,8 +730,8 @@ Return as JSON:
|
|
722
730
|
###############################################################################
|
723
731
|
|
724
732
|
class ReactionExtractor:
|
725
|
-
_FIG_RE = re.compile(r"fig(?:ure)?\s+s?\d+[a-z]?", re.I)
|
726
|
-
_TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)
|
733
|
+
_FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
|
734
|
+
_TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
|
727
735
|
|
728
736
|
def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
|
729
737
|
campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
|
@@ -792,14 +800,24 @@ class ReactionExtractor:
|
|
792
800
|
context_start = context_start + last_period + 1
|
793
801
|
|
794
802
|
# For tables, include much more content after the caption to show actual table data
|
795
|
-
# For figures,
|
796
|
-
is_table = match.group(1).lower()
|
797
|
-
max_chars =
|
798
|
-
|
799
|
-
# Get up to max_chars or until double newline
|
800
|
-
|
801
|
-
|
803
|
+
# For figures, include more content to ensure complete captions
|
804
|
+
is_table = 'table' in match.group(1).lower()
|
805
|
+
max_chars = 8000 if is_table else 5000
|
806
|
+
|
807
|
+
# Get up to max_chars or until double newline (but ensure we get complete caption)
|
808
|
+
# First, try to find the end of the caption sentence
|
809
|
+
caption_end = caption_start
|
810
|
+
period_pos = all_text.find('. ', caption_start)
|
811
|
+
if period_pos != -1 and period_pos < caption_start + 1000:
|
812
|
+
# Include at least to the end of the caption sentence
|
813
|
+
caption_end = period_pos + 1
|
814
|
+
|
815
|
+
# Then extend to include more context or until double newline
|
816
|
+
double_newline_pos = all_text.find("\n\n", caption_end)
|
817
|
+
if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
|
802
818
|
caption_end = caption_start + max_chars
|
819
|
+
else:
|
820
|
+
caption_end = double_newline_pos
|
803
821
|
|
804
822
|
# Include the context and full caption with table content
|
805
823
|
full_caption = all_text[context_start:caption_end].strip()
|
@@ -1082,6 +1100,7 @@ class ReactionExtractor:
|
|
1082
1100
|
If extract_figure_only=True, extracts just the figure above the caption.
|
1083
1101
|
If False, extracts the entire page (useful for tables).
|
1084
1102
|
Returns a base64-encoded PNG or None."""
|
1103
|
+
LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
|
1085
1104
|
|
1086
1105
|
# Check cache first
|
1087
1106
|
cache_key = f"{ref}_{extract_figure_only}"
|
@@ -1099,10 +1118,18 @@ class ReactionExtractor:
|
|
1099
1118
|
return None
|
1100
1119
|
|
1101
1120
|
# For figure extraction, search both documents for actual figure captions
|
1102
|
-
|
1121
|
+
docs = list(filter(None, [self.ms_doc, self.si_doc]))
|
1122
|
+
LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
|
1123
|
+
|
1124
|
+
for doc_idx, doc in enumerate(docs):
|
1125
|
+
doc_name = "MS" if doc_idx == 0 else "SI"
|
1126
|
+
LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
|
1127
|
+
|
1103
1128
|
for page_number in range(doc.page_count):
|
1104
1129
|
page = doc.load_page(page_number)
|
1105
1130
|
page_text = page.get_text()
|
1131
|
+
LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
|
1132
|
+
page_number + 1, doc_name, len(page_text))
|
1106
1133
|
|
1107
1134
|
# Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
|
1108
1135
|
# For subfigures like "Figure 1C", extract the main figure "Figure 1"
|
@@ -1150,6 +1177,14 @@ class ReactionExtractor:
|
|
1150
1177
|
if figure_mentions:
|
1151
1178
|
LOGGER.debug("Page %d has figure mentions but no caption match: %s",
|
1152
1179
|
page_number, figure_mentions[:3])
|
1180
|
+
|
1181
|
+
# For supplementary figures, also check for "supplementary" mentions
|
1182
|
+
if 'supplementary' in ref.lower():
|
1183
|
+
supp_mentions = [line.strip() for line in page_text.split('\n')
|
1184
|
+
if 'supplementary' in line.lower() and 'figure' in line.lower()]
|
1185
|
+
if supp_mentions:
|
1186
|
+
LOGGER.warning("Found supplementary figure mentions on page %d but no caption match. First 3: %s",
|
1187
|
+
page_number + 1, supp_mentions[:3])
|
1153
1188
|
continue
|
1154
1189
|
|
1155
1190
|
if extract_figure_only:
|
@@ -1207,6 +1242,8 @@ class ReactionExtractor:
|
|
1207
1242
|
# Cache the result
|
1208
1243
|
self._figure_cache.put(cache_key, result)
|
1209
1244
|
return result
|
1245
|
+
|
1246
|
+
LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
|
1210
1247
|
return None
|
1211
1248
|
|
1212
1249
|
def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
|
@@ -1437,6 +1474,7 @@ class ReactionExtractor:
|
|
1437
1474
|
|
1438
1475
|
def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
|
1439
1476
|
"""Extract performance metrics for multiple enzymes from the identified location in batch."""
|
1477
|
+
LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
|
1440
1478
|
ref_lc = ref.lower()
|
1441
1479
|
image_b64: Optional[str] = None
|
1442
1480
|
|
@@ -1458,11 +1496,15 @@ class ReactionExtractor:
|
|
1458
1496
|
snippet = self._extract_table_context(ref)
|
1459
1497
|
elif self._FIG_RE.search(ref_lc):
|
1460
1498
|
# For figures, extract just the figure image (same logic as compound mapping)
|
1499
|
+
LOGGER.debug("Attempting to extract figure image for '%s'", ref)
|
1461
1500
|
image_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
1462
1501
|
if not image_b64:
|
1463
|
-
LOGGER.
|
1502
|
+
LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
|
1464
1503
|
snippet = self._extract_figure_caption(ref)
|
1504
|
+
LOGGER.debug("Caption extraction result: %s",
|
1505
|
+
f"'{snippet[:100]}...'" if snippet else "empty")
|
1465
1506
|
else:
|
1507
|
+
LOGGER.info("Successfully extracted figure image for '%s'", ref)
|
1466
1508
|
# If figure is found, ignore text information - use image only
|
1467
1509
|
snippet = ""
|
1468
1510
|
else:
|
@@ -1907,8 +1949,14 @@ TEXT FROM MANUSCRIPT:
|
|
1907
1949
|
f.write(prompt)
|
1908
1950
|
LOGGER.info("Full prompt saved to: %s", prompt_file)
|
1909
1951
|
|
1910
|
-
# Make multimodal API call
|
1911
|
-
response = self.model.generate_content(
|
1952
|
+
# Make multimodal API call with increased token limit
|
1953
|
+
response = self.model.generate_content(
|
1954
|
+
content_parts,
|
1955
|
+
generation_config={
|
1956
|
+
"temperature": 0.0,
|
1957
|
+
"max_output_tokens": 24576, # Increased 3x for compound mapping
|
1958
|
+
}
|
1959
|
+
)
|
1912
1960
|
|
1913
1961
|
# Track token usage if available
|
1914
1962
|
try:
|
@@ -1971,6 +2019,7 @@ TEXT FROM MANUSCRIPT:
|
|
1971
2019
|
compound_ids: List[str],
|
1972
2020
|
initial_sections: List[str] = None,
|
1973
2021
|
campaign_filter: Optional[str] = None,
|
2022
|
+
iupac_location_hint: Optional[Dict[str, Any]] = None,
|
1974
2023
|
) -> Dict[str, CompoundMapping]:
|
1975
2024
|
"""Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
|
1976
2025
|
|
@@ -2002,14 +2051,57 @@ TEXT FROM MANUSCRIPT:
|
|
2002
2051
|
LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
|
2003
2052
|
len(uncached_compound_ids), sorted(uncached_compound_ids))
|
2004
2053
|
|
2005
|
-
# Tier 1:
|
2006
|
-
|
2007
|
-
"
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
|
2012
|
-
|
2054
|
+
# Tier 1: Use IUPAC location hint if provided, otherwise standard sections
|
2055
|
+
if iupac_location_hint and iupac_location_hint.get('location'):
|
2056
|
+
LOGGER.info("Tier 1: Using IUPAC location hint: %s", iupac_location_hint.get('location'))
|
2057
|
+
if iupac_location_hint.get('compound_section_hint'):
|
2058
|
+
LOGGER.info("Tier 1: Compound section hint: %s", iupac_location_hint.get('compound_section_hint'))
|
2059
|
+
|
2060
|
+
# Extract text from the specific IUPAC location
|
2061
|
+
iupac_text = self._get_extended_text_around_location(
|
2062
|
+
iupac_location_hint['location'],
|
2063
|
+
before=2000,
|
2064
|
+
after=10000
|
2065
|
+
)
|
2066
|
+
|
2067
|
+
# Also check for compound-specific hints
|
2068
|
+
compound_hint = iupac_location_hint.get('compound_section_hint', '')
|
2069
|
+
if compound_hint and iupac_text:
|
2070
|
+
# Search for the specific compound section
|
2071
|
+
hint_pattern = re.escape(compound_hint)
|
2072
|
+
match = re.search(hint_pattern, iupac_text, re.IGNORECASE)
|
2073
|
+
if match:
|
2074
|
+
# Extract more focused text around the compound hint
|
2075
|
+
start = max(0, match.start() - 500)
|
2076
|
+
end = min(len(iupac_text), match.end() + 2000)
|
2077
|
+
iupac_text = iupac_text[start:end]
|
2078
|
+
LOGGER.info("Found compound hint '%s' in IUPAC section", compound_hint)
|
2079
|
+
|
2080
|
+
extraction_text = iupac_text or ""
|
2081
|
+
if extraction_text:
|
2082
|
+
LOGGER.info("Tier 1: Extracted %d chars from IUPAC location hint", len(extraction_text))
|
2083
|
+
else:
|
2084
|
+
LOGGER.warning("Tier 1: No text found at IUPAC location hint")
|
2085
|
+
# Add some manuscript context
|
2086
|
+
manuscript_text = "\n\n".join(self.ms_pages[:5])
|
2087
|
+
else:
|
2088
|
+
# Fallback to standard sections
|
2089
|
+
initial_sections = initial_sections or [
|
2090
|
+
"General procedure", "Compound characterization",
|
2091
|
+
"Synthesis", "Experimental", "Materials and methods"
|
2092
|
+
]
|
2093
|
+
|
2094
|
+
# Extract from initial sections - search in all pages (manuscript + SI)
|
2095
|
+
extraction_text = self._extract_sections_by_title(initial_sections)
|
2096
|
+
|
2097
|
+
# If no sections found by title, include first few SI pages which often have compound data
|
2098
|
+
if not extraction_text and self.si_pages:
|
2099
|
+
# SI often starts with compound characterization after TOC
|
2100
|
+
si_compound_pages = "\n\n".join(self.si_pages[2:10]) # Skip first 2 pages (usually TOC)
|
2101
|
+
extraction_text = si_compound_pages
|
2102
|
+
|
2103
|
+
# Include manuscript pages (first 10) for model reaction context
|
2104
|
+
manuscript_text = "\n\n".join(self.ms_pages[:10])
|
2013
2105
|
|
2014
2106
|
# Add campaign context if provided
|
2015
2107
|
campaign_context = ""
|
@@ -2033,8 +2125,7 @@ Do NOT include compound information from other campaigns.
|
|
2033
2125
|
|
2034
2126
|
"""
|
2035
2127
|
|
2036
|
-
#
|
2037
|
-
extraction_text = self._extract_sections_by_title(initial_sections)
|
2128
|
+
# Combine manuscript text, campaign context, and extraction text
|
2038
2129
|
if extraction_text:
|
2039
2130
|
extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
|
2040
2131
|
else:
|
@@ -2083,11 +2174,11 @@ Do NOT include compound information from other campaigns.
|
|
2083
2174
|
figure_images[ref] = img_b64
|
2084
2175
|
LOGGER.info("Extracted %s for compound mapping", ref)
|
2085
2176
|
|
2086
|
-
# Full text search including
|
2087
|
-
full_text = "\n\n".join(self.all_pages
|
2177
|
+
# Full text search including ALL pages (manuscript + SI)
|
2178
|
+
full_text = "\n\n".join(self.all_pages) # Send everything
|
2088
2179
|
|
2089
2180
|
final_mappings = self._extract_compound_mappings_with_figures(
|
2090
|
-
full_text
|
2181
|
+
full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
|
2091
2182
|
)
|
2092
2183
|
|
2093
2184
|
# Merge final mappings with better compound ID matching
|
@@ -2261,7 +2352,13 @@ Do NOT include compound information from other campaigns.
|
|
2261
2352
|
compound_mappings = {}
|
2262
2353
|
if compound_ids:
|
2263
2354
|
LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
|
2264
|
-
|
2355
|
+
# Pass the IUPAC location hint if we have it
|
2356
|
+
iupac_hint = locations.get("iupac_location") if locations else None
|
2357
|
+
compound_mappings = self._extract_compound_mappings_adaptive(
|
2358
|
+
compound_ids,
|
2359
|
+
campaign_filter=self.campaign_filter,
|
2360
|
+
iupac_location_hint=iupac_hint
|
2361
|
+
)
|
2265
2362
|
|
2266
2363
|
# Add the mapped IUPAC names to the context for better extraction
|
2267
2364
|
if compound_mappings:
|
@@ -2404,6 +2501,34 @@ Different campaigns may use different model reactions and substrates.
|
|
2404
2501
|
LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
|
2405
2502
|
list(compound_mappings.keys()))
|
2406
2503
|
|
2504
|
+
# First, populate IUPAC lists directly from compound mappings based on compound_type
|
2505
|
+
substrate_iupacs_from_mappings = []
|
2506
|
+
product_iupacs_from_mappings = []
|
2507
|
+
|
2508
|
+
for mapping in compound_mappings.values():
|
2509
|
+
if mapping.iupac_name and mapping.compound_type:
|
2510
|
+
if mapping.compound_type.lower() == "substrate":
|
2511
|
+
substrate_iupacs_from_mappings.append(mapping.iupac_name)
|
2512
|
+
LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
|
2513
|
+
elif mapping.compound_type.lower() == "product":
|
2514
|
+
product_iupacs_from_mappings.append(mapping.iupac_name)
|
2515
|
+
LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
|
2516
|
+
|
2517
|
+
# Initialize or update the IUPAC lists with mapped compounds
|
2518
|
+
if substrate_iupacs_from_mappings:
|
2519
|
+
existing_substrates = data.get("substrate_iupac_list", []) or []
|
2520
|
+
if isinstance(existing_substrates, list):
|
2521
|
+
data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
|
2522
|
+
else:
|
2523
|
+
data["substrate_iupac_list"] = substrate_iupacs_from_mappings
|
2524
|
+
|
2525
|
+
if product_iupacs_from_mappings:
|
2526
|
+
existing_products = data.get("product_iupac_list", []) or []
|
2527
|
+
if isinstance(existing_products, list):
|
2528
|
+
data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
|
2529
|
+
else:
|
2530
|
+
data["product_iupac_list"] = product_iupacs_from_mappings
|
2531
|
+
|
2407
2532
|
# Try to map substrate/product lists through compound IDs
|
2408
2533
|
substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
|
2409
2534
|
if isinstance(substrate_list, list):
|
@@ -3021,7 +3146,14 @@ def main() -> None:
|
|
3021
3146
|
campaign_filter = all_campaigns[0]
|
3022
3147
|
LOGGER.info("Detected single campaign: %s", campaign_filter)
|
3023
3148
|
|
3024
|
-
|
3149
|
+
# Create campaign-specific debug directory even for single campaign
|
3150
|
+
campaign_debug_dir = None
|
3151
|
+
if args.debug_dir:
|
3152
|
+
campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign_filter}"
|
3153
|
+
campaign_debug_dir.mkdir(parents=True, exist_ok=True)
|
3154
|
+
LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
|
3155
|
+
|
3156
|
+
extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
|
3025
3157
|
campaign_filter=campaign_filter, all_campaigns=all_campaigns)
|
3026
3158
|
df_metrics = extractor.run(enzyme_df)
|
3027
3159
|
|
@@ -3041,8 +3173,14 @@ def main() -> None:
|
|
3041
3173
|
LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
|
3042
3174
|
continue
|
3043
3175
|
|
3044
|
-
# Create extractor for this campaign
|
3045
|
-
|
3176
|
+
# Create extractor for this campaign with campaign-specific debug directory
|
3177
|
+
campaign_debug_dir = None
|
3178
|
+
if args.debug_dir:
|
3179
|
+
campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign}"
|
3180
|
+
campaign_debug_dir.mkdir(parents=True, exist_ok=True)
|
3181
|
+
LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
|
3182
|
+
|
3183
|
+
extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
|
3046
3184
|
campaign_filter=campaign, all_campaigns=all_campaigns)
|
3047
3185
|
|
3048
3186
|
# Run extraction for this campaign
|
@@ -3088,7 +3226,13 @@ def main() -> None:
|
|
3088
3226
|
df_metrics = pd.DataFrame()
|
3089
3227
|
else:
|
3090
3228
|
# No campaign information, process all enzymes together
|
3091
|
-
|
3229
|
+
campaign_debug_dir = None
|
3230
|
+
if args.debug_dir:
|
3231
|
+
campaign_debug_dir = Path(args.debug_dir) / "no_campaign"
|
3232
|
+
campaign_debug_dir.mkdir(parents=True, exist_ok=True)
|
3233
|
+
LOGGER.info("Debug directory (no campaign): %s", campaign_debug_dir)
|
3234
|
+
|
3235
|
+
extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
|
3092
3236
|
campaign_filter=campaign_filter, all_campaigns=all_campaigns)
|
3093
3237
|
df_metrics = extractor.run(enzyme_df)
|
3094
3238
|
|
@@ -28,6 +28,7 @@ import re
|
|
28
28
|
import json
|
29
29
|
import time
|
30
30
|
import logging
|
31
|
+
import subprocess
|
31
32
|
from pathlib import Path
|
32
33
|
from dataclasses import dataclass, field
|
33
34
|
from typing import List, Optional, Dict, Any, Union
|
@@ -103,6 +104,52 @@ class CompoundMapping:
|
|
103
104
|
compound_type: str = "unknown"
|
104
105
|
source_location: Optional[str] = None
|
105
106
|
|
107
|
+
def is_valid_iupac_name_with_opsin(name: str) -> bool:
|
108
|
+
"""Check if a name is a valid IUPAC name using the local OPSIN command."""
|
109
|
+
if not name or len(name.strip()) < 3:
|
110
|
+
return False
|
111
|
+
|
112
|
+
try:
|
113
|
+
# Use local OPSIN command to check if name can be converted to SMILES
|
114
|
+
process = subprocess.run(
|
115
|
+
['opsin', '-o', 'smi'],
|
116
|
+
input=name.strip(),
|
117
|
+
text=True,
|
118
|
+
capture_output=True,
|
119
|
+
timeout=30
|
120
|
+
)
|
121
|
+
|
122
|
+
# If OPSIN successfully converts to SMILES, the name is valid IUPAC
|
123
|
+
if process.returncode == 0 and process.stdout.strip():
|
124
|
+
output = process.stdout.strip()
|
125
|
+
# Check if output looks like a valid SMILES (contains common SMILES characters)
|
126
|
+
if any(char in output for char in 'CNOS()=[]#+-'):
|
127
|
+
return True
|
128
|
+
|
129
|
+
return False
|
130
|
+
|
131
|
+
except Exception as e:
|
132
|
+
log.debug(f"OPSIN check failed for '{name}': {e}")
|
133
|
+
return False
|
134
|
+
|
135
|
+
def _get_iupac_name(compound) -> str:
|
136
|
+
"""Get IUPAC name for a compound, checking if the common name is already IUPAC."""
|
137
|
+
if not compound:
|
138
|
+
return ''
|
139
|
+
|
140
|
+
# If we already have an IUPAC name, use it
|
141
|
+
if compound.iupac_name:
|
142
|
+
return compound.iupac_name
|
143
|
+
|
144
|
+
# If no IUPAC name but we have a common name, check if it's already IUPAC
|
145
|
+
if compound.name:
|
146
|
+
# Check with OPSIN if the name is a valid IUPAC name
|
147
|
+
if is_valid_iupac_name_with_opsin(compound.name):
|
148
|
+
log.info(f"'{compound.name}' is already a valid IUPAC name, using it directly")
|
149
|
+
return compound.name
|
150
|
+
|
151
|
+
return ''
|
152
|
+
|
106
153
|
# === 3. LOGGING HELPERS ===
|
107
154
|
|
108
155
|
# --- Debug dump helper ----------------------------------------------------
|
@@ -2496,7 +2543,8 @@ def merge_with_lineage(
|
|
2496
2543
|
data = lineage_map[matched_name]
|
2497
2544
|
entry.parent_id = data['parent_id']
|
2498
2545
|
entry.mutations = data['mutations']
|
2499
|
-
|
2546
|
+
# Skip generation - to be filled by lineage_format
|
2547
|
+
# entry.generation = data['generation']
|
2500
2548
|
entry.aa_seq = data['aa_seq']
|
2501
2549
|
entry.dna_seq = data['dna_seq']
|
2502
2550
|
entry.confidence = data['confidence']
|
@@ -2524,7 +2572,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
|
|
2524
2572
|
'enzyme_id': entry.enzyme_id,
|
2525
2573
|
'parent_enzyme_id': entry.parent_id or '',
|
2526
2574
|
'mutations': entry.mutations or '',
|
2527
|
-
'generation':
|
2575
|
+
'generation': '', # Empty generation - to be filled by lineage_format
|
2528
2576
|
'campaign_id': entry.campaign_id or '',
|
2529
2577
|
'protein_sequence': entry.aa_seq or '',
|
2530
2578
|
'nucleotide_sequence': entry.dna_seq or '',
|
@@ -2532,9 +2580,9 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
|
|
2532
2580
|
'flag': '',
|
2533
2581
|
|
2534
2582
|
'substrate_list': '; '.join(s.name for s in entry.substrates if s.name),
|
2535
|
-
'substrate_iupac_list': '; '.join(s
|
2583
|
+
'substrate_iupac_list': '; '.join(_get_iupac_name(s) for s in entry.substrates),
|
2536
2584
|
'product_list': '; '.join(p.name for p in entry.products if p.name),
|
2537
|
-
'product_iupac_list': '; '.join(p
|
2585
|
+
'product_iupac_list': '; '.join(_get_iupac_name(p) for p in entry.products),
|
2538
2586
|
|
2539
2587
|
'cofactor_list': '; '.join(c.name for c in entry.cofactors if c.name),
|
2540
2588
|
'cofactor_iupac_list': '; '.join(c.iupac_name or '' for c in entry.cofactors),
|