debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +656 -27
- debase/enzyme_lineage_extractor.py +1077 -109
- debase/lineage_format.py +221 -12
- debase/reaction_info_extractor.py +133 -23
- debase/substrate_scope_extractor.py +49 -2
- debase/wrapper.py +155 -151
- debase-0.4.4.dist-info/METADATA +121 -0
- debase-0.4.4.dist-info/RECORD +16 -0
- debase-0.4.2.dist-info/METADATA +0 -296
- debase-0.4.2.dist-info/RECORD +0 -16
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/WHEEL +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/entry_points.txt +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -553,6 +553,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
     This function:
     1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
     2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
+    3. Uses Gemini API for intelligent matching when exact matches fail
     """
     # Step 1: Clean up 3a data format
     log.info("Cleaning up reaction data (3a) format...")
@@ -564,6 +565,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
 
     # Step 2: Create sequence lookup from cleaned 3a data
     seq_lookup = {}
+    campaign_enzymes = {}  # Track enzymes by campaign for Gemini matching
 
     # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
     reaction_entries = df[df.get("data_type") == "lineage"]
@@ -584,7 +586,9 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 "aa_sequence": aa_seq,
                 "nt_sequence": nt_seq if nt_seq != "nan" else "",
                 "campaign_id": campaign_id,
-                "enzyme_id": eid
+                "enzyme_id": eid,
+                "generation": str(row.get("generation", "")),
+                "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
             }
 
             # Also keep simple enzyme_id lookup as fallback
@@ -592,16 +596,41 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 "aa_sequence": aa_seq,
                 "nt_sequence": nt_seq if nt_seq != "nan" else "",
                 "campaign_id": campaign_id,
-                "enzyme_id": eid
+                "enzyme_id": eid,
+                "generation": str(row.get("generation", "")),
+                "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
             }
+
+            # Track enzymes by campaign for Gemini matching
+            if campaign_id not in campaign_enzymes:
+                campaign_enzymes[campaign_id] = []
+            campaign_enzymes[campaign_id].append({
+                "enzyme_id": eid,
+                "has_sequence": True,
+                "generation": str(row.get("generation", "")),
+                "parent_id": str(row.get("parent_enzyme_id", ""))
+            })
 
     log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
 
+    # Setup Gemini if available
+    gemini_model = None
+    if GEMINI_OK and GEMINI_API_KEY:
+        try:
+            genai.configure(api_key=GEMINI_API_KEY)
+            gemini_model = genai.GenerativeModel('gemini-1.5-flash')
+            log.info("Gemini API configured for intelligent enzyme matching")
+        except Exception as e:
+            log.warning(f"Failed to configure Gemini API: {e}")
+
     # Step 3: Fill missing sequences in substrate scope entries (3b)
     substrate_entries = df[df.get("data_type") == "substrate_scope"]
     log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
 
     filled_count = 0
+    gemini_matched_count = 0
+    unmatched_enzymes = []  # Track enzymes that need Gemini matching
+
     for idx, row in df.iterrows():
         if row.get("data_type") != "substrate_scope":
             continue
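The Gemini setup above degrades gracefully: when no key is configured, `gemini_model` stays `None` and only exact matching runs. For readers unfamiliar with the `google-generativeai` package, here is a minimal, self-contained sketch of the same configure-then-generate pattern; the environment-variable name and the prompt are illustrative, not taken from the package:

```python
# Minimal sketch of the Gemini client pattern used in the hunk above.
# Assumes the google-generativeai package and an API key in GEMINI_API_KEY.
import os
import google.generativeai as genai

gemini_model = None
try:
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
    gemini_model = genai.GenerativeModel("gemini-1.5-flash")
except Exception as exc:
    print(f"Gemini unavailable, exact matching only: {exc}")

if gemini_model is not None:
    response = gemini_model.generate_content("Reply with the single word: ok")
    print(response.text.strip())
```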
@@ -620,6 +649,8 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 if seq_lookup[composite_key]["nt_sequence"]:
                     df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
                     df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+                df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
                 filled_count += 1
                 log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
 
@@ -630,18 +661,182 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             if seq_lookup[eid]["nt_sequence"]:
                 df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
                 df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
+            df.at[idx, "generation"] = seq_lookup[eid]["generation"]
+            df.at[idx, "parent_enzyme_id"] = seq_lookup[eid]["parent_enzyme_id"]
             filled_count += 1
             log.debug(f"Filled sequence for {eid} (fallback lookup)")
 
         else:
-
+            # Collect for Gemini matching
+            unmatched_enzymes.append({
+                "idx": idx,
+                "enzyme_id": eid,
+                "campaign_id": campaign_id
+            })
+
+    # Step 4: Use Gemini for intelligent matching of unmatched enzymes
+    if unmatched_enzymes and gemini_model:
+        log.info(f"Using Gemini to intelligently match {len(unmatched_enzymes)} unmatched enzymes")
+
+        # Group unmatched enzymes by campaign
+        unmatched_by_campaign = {}
+        for entry in unmatched_enzymes:
+            cid = entry["campaign_id"]
+            if cid not in unmatched_by_campaign:
+                unmatched_by_campaign[cid] = []
+            unmatched_by_campaign[cid].append(entry)
+
+        # Process each campaign
+        for campaign_id, entries in unmatched_by_campaign.items():
+            if campaign_id not in campaign_enzymes or not campaign_enzymes[campaign_id]:
+                log.warning(f"No enzymes with sequences found in campaign {campaign_id}")
+                continue
+
+            # Get enzyme IDs that need matching
+            unmatched_ids = [e["enzyme_id"] for e in entries]
+
+            # Get available enzymes in this campaign
+            available_ids = [e["enzyme_id"] for e in campaign_enzymes[campaign_id] if e["has_sequence"]]
+
+            if not available_ids:
+                log.warning(f"No enzymes with sequences available in campaign {campaign_id}")
+                continue
+
+            # Create prompt for Gemini
+            prompt = f"""Match enzyme variant IDs from substrate scope data to their corresponding sequences in reaction data.
+These are from the same campaign ({campaign_id}) but may use slightly different naming conventions.
+
+Enzymes needing sequences (from substrate scope):
+{json.dumps(unmatched_ids, indent=2)}
+
+Enzymes with sequences available (from reaction data):
+{json.dumps(available_ids, indent=2)}
+
+Match each enzyme from the first list to its corresponding enzyme in the second list.
+Consider variations like:
+- Case differences (p411-hf vs P411-HF)
+- Underscore vs hyphen (p411_hf vs p411-hf)
+- Additional prefixes/suffixes
+- Similar naming patterns within the campaign
+
+Return ONLY a JSON object mapping substrate scope IDs to reaction data IDs:
+{{"substrate_scope_id": "reaction_data_id", ...}}
+
+Only include matches you are confident about. If no match exists, omit that enzyme.
+"""
+
+            try:
+                response = gemini_model.generate_content(prompt)
+                mapping_text = response.text.strip()
+
+                # Extract JSON from response
+                if '```json' in mapping_text:
+                    mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
+                elif '```' in mapping_text:
+                    mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
+
+                mapping = json.loads(mapping_text)
+
+                # Apply the matches
+                for entry in entries:
+                    substrate_id = entry["enzyme_id"]
+                    if substrate_id in mapping:
+                        matched_id = mapping[substrate_id]
+                        composite_key = f"{campaign_id}_{matched_id}"
+
+                        if composite_key in seq_lookup:
+                            idx = entry["idx"]
+                            df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            if seq_lookup[composite_key]["nt_sequence"]:
+                                df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                                df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+
+                            # Also copy generation and parent_enzyme_id
+                            df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+                            df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
+
+                            # Store the match for later mutation copying
+                            df.at[idx, "_matched_enzyme_id"] = matched_id
+                            df.at[idx, "_matched_campaign_id"] = campaign_id
+
+                            gemini_matched_count += 1
+                            log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
+                        else:
+                            log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
+
+            except Exception as e:
+                log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
 
-
-
+    # Final logging
+    total_filled = filled_count + gemini_matched_count
+    if total_filled > 0:
+        log.info(f"Successfully filled sequences for {total_filled} substrate scope entries "
+                 f"({filled_count} exact matches, {gemini_matched_count} Gemini matches)")
+
+    # Log any remaining unmatched
+    for entry in unmatched_enzymes:
+        if not any(df.at[entry["idx"], col] for col in ["protein_sequence", "aa_sequence"]
+                   if col in df.columns and df.at[entry["idx"], col]):
+            log.warning(f"No sequence found for enzyme_id={entry['enzyme_id']} in campaign {entry['campaign_id']}")
 
     return df
 
 
+def _copy_mutations_from_matched_enzymes(out_df: pd.DataFrame, orig_df: pd.DataFrame) -> pd.DataFrame:
+    """Copy nucleotide_mutation and amino_acid_substitutions from matched enzymes.
+
+    This function looks for entries that were matched by Gemini and copies their
+    mutation information from the corresponding matched enzyme.
+    """
+    # Look for entries with _matched_enzyme_id (these were matched by Gemini)
+    if "_matched_enzyme_id" not in orig_df.columns:
+        return out_df
+
+    matched_entries = orig_df[orig_df["_matched_enzyme_id"].notna()]
+
+    if len(matched_entries) == 0:
+        return out_df
+
+    log.info(f"Copying mutations for {len(matched_entries)} Gemini-matched entries")
+
+    # Create a lookup of mutations from the output dataframe
+    mutation_lookup = {}
+    for idx, row in out_df.iterrows():
+        key = f"{row['campaign_id']}_{row['id']}"  # 'id' is the enzyme_id in output
+        mutation_lookup[key] = {
+            "nucleotide_mutation": row.get("nucleotide_mutation", ""),
+            "amino_acid_substitutions": row.get("amino_acid_substitutions", "")
+        }
+
+    # Copy mutations for matched entries
+    mutations_copied = 0
+    for idx, row in out_df.iterrows():
+        # Check if this row needs mutation copying
+        # Find the original row in orig_df with the same enzyme_id and campaign_id
+        orig_mask = (orig_df["enzyme_id"] == row["id"]) & (orig_df["campaign_id"] == row["campaign_id"])
+        orig_rows = orig_df[orig_mask]
+
+        if len(orig_rows) > 0 and "_matched_enzyme_id" in orig_rows.columns:
+            orig_row = orig_rows.iloc[0]
+            if pd.notna(orig_row.get("_matched_enzyme_id")):
+                # This was a Gemini-matched entry
+                matched_id = orig_row["_matched_enzyme_id"]
+                matched_campaign = orig_row["_matched_campaign_id"]
+                lookup_key = f"{matched_campaign}_{matched_id}"
+
+                if lookup_key in mutation_lookup:
+                    out_df.at[idx, "nucleotide_mutation"] = mutation_lookup[lookup_key]["nucleotide_mutation"]
+                    out_df.at[idx, "amino_acid_substitutions"] = mutation_lookup[lookup_key]["amino_acid_substitutions"]
+                    mutations_copied += 1
+                    log.debug(f"Copied mutations for {row['id']} from {matched_id}")
+
+    if mutations_copied > 0:
+        log.info(f"Successfully copied mutations for {mutations_copied} entries")
+
+    return out_df
+
+
 def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
     """Use Gemini API to identify parent enzymes for entries with missing parent information."""
     if not GEMINI_OK:
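The response handling in this hunk assumes Gemini may wrap its JSON answer in a markdown code fence. A standalone sketch of that fence-stripping parse, under the same assumption (`FENCE` is built character-by-character here only to avoid literal backtick runs inside this note):

```python
import json

FENCE = "`" * 3  # a markdown code-fence delimiter

def parse_fenced_json(text: str) -> dict:
    # Strip an optional json code fence, then parse -- the same logic as the hunk above.
    text = text.strip()
    if FENCE + "json" in text:
        text = text.split(FENCE + "json")[1].split(FENCE)[0].strip()
    elif FENCE in text:
        text = text.split(FENCE)[1].split(FENCE)[0].strip()
    return json.loads(text)

raw = FENCE + "json\n" + '{"p411-hf": "P411-HF"}' + "\n" + FENCE
print(parse_fenced_json(raw))  # {'p411-hf': 'P411-HF'}
```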
@@ -885,11 +1080,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # Fill missing sequences in substrate scope entries from lineage data
     df = _fill_missing_sequences(df)
 
-    #
-    df = _identify_parents_with_gemini(df)
-
-    # Fill sequences again after parent identification to propagate sequences from identified parents
-    df = _fill_missing_sequences(df)
+    # Note: Removed parent identification - we only want exact variant matching
 
     # 1. Generate lineage roots once -----------------------------------------
     lineage_roots = _generate_lineage_roots(df)
@@ -1095,6 +1286,10 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
     log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
     out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
+
+    # Post-process: Copy mutations from matched enzymes for Gemini-matched substrate scope entries
+    out_df = _copy_mutations_from_matched_enzymes(out_df, df)
+
     return out_df
 
 
@@ -1137,10 +1332,24 @@ def run_pipeline(reaction_csv: str | Path | None = None,
     if not dfs:
         raise ValueError("At least one input CSV must be provided")
 
-    # Combine dataframes
+    # Combine dataframes with deduplication
     if len(dfs) > 1:
         df_in = pd.concat(dfs, ignore_index=True)
-        log.info("Combined data: %d total entries", len(df_in))
+        log.info("Combined data: %d total entries (before deduplication)", len(df_in))
+
+        # Deduplicate based on unique combination of campaign, variant, fitness, and product
+        # Define the key columns that should be unique
+        unique_cols = ['campaign_id', 'enzyme_id', 'product_list']
+
+        # Check if we have these columns
+        available_cols = [col for col in unique_cols if col in df_in.columns]
+
+        if len(available_cols) >= 2:  # Need at least campaign_id and enzyme_id
+            # Keep the first occurrence of each unique combination
+            df_in = df_in.drop_duplicates(subset=available_cols, keep='first')
+            log.info("After deduplication on %s: %d entries", available_cols, len(df_in))
+        else:
+            log.warning("Could not deduplicate - missing required columns: %s", unique_cols)
     else:
         df_in = dfs[0]
 
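For readers less familiar with pandas, a toy illustration of the `drop_duplicates(subset=..., keep='first')` call this hunk introduces; the column values below are invented:

```python
import pandas as pd

# Toy frame with a duplicated (campaign_id, enzyme_id, product_list) row.
df_in = pd.DataFrame({
    "campaign_id":  ["c1", "c1", "c1"],
    "enzyme_id":    ["P411-HF", "P411-HF", "P411-G8"],
    "product_list": ["2a", "2a", "2a"],
    "fitness":      [1.0, 1.0, 3.2],
})

unique_cols = ["campaign_id", "enzyme_id", "product_list"]
available_cols = [c for c in unique_cols if c in df_in.columns]
df_in = df_in.drop_duplicates(subset=available_cols, keep="first")
print(len(df_in))  # 2 -- the repeated P411-HF row is dropped, first occurrence kept
```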
debase/reaction_info_extractor.py
CHANGED
@@ -58,7 +58,7 @@ class Config:
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 4096
+    max_tokens: int = 12288  # Increased 3x from 4096
     pdf_cache_size: int = 8
     retries: int = 2
 
@@ -209,7 +209,7 @@ def _cached_gemini_call(
         parts,
         generation_config={
             "temperature": temperature,
-            "max_output_tokens": 8192
+            "max_output_tokens": 24576,  # Increased 3x from 8192
         }
     )
     # Track token usage if available
@@ -450,7 +450,7 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
 
-Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
+Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
 Do not include too much sources, just return 2 or 3 sources.
 Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
 When returning confidence scores, be more accurate and avoid scores that are too close together.
@@ -703,6 +703,14 @@ CRITICAL - NO HALLUCINATION:
 - If no IUPAC name is found for a compound, return null for iupac_name
 - Include ALL compounds found or referenced
 
+IMPORTANT - ONE NAME PER COMPOUND:
+- Return ONLY ONE IUPAC name per compound identifier
+- If multiple names are found for the same compound, choose the one most likely to be the IUPAC name:
+  1. Names explicitly labeled as "IUPAC name:" in the text
+  2. Names in compound characterization sections
+  3. The most systematic/complete chemical name
+- Do NOT return multiple IUPAC names in a single iupac_name field
+
 Return as JSON:
 {
   "compound_mappings": [
@@ -1907,8 +1915,14 @@ TEXT FROM MANUSCRIPT:
             f.write(prompt)
         LOGGER.info("Full prompt saved to: %s", prompt_file)
 
-        # Make multimodal API call
-        response = self.model.generate_content(
+        # Make multimodal API call with increased token limit
+        response = self.model.generate_content(
+            content_parts,
+            generation_config={
+                "temperature": 0.0,
+                "max_output_tokens": 24576,  # Increased 3x for compound mapping
+            }
+        )
 
         # Track token usage if available
         try:
@@ -1971,6 +1985,7 @@ TEXT FROM MANUSCRIPT:
         compound_ids: List[str],
         initial_sections: List[str] = None,
         campaign_filter: Optional[str] = None,
+        iupac_location_hint: Optional[Dict[str, Any]] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
 
@@ -2002,14 +2017,57 @@ TEXT FROM MANUSCRIPT:
         LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
                     len(uncached_compound_ids), sorted(uncached_compound_ids))
 
-        # Tier 1:
-
-        "
-
-
-
-
-
+        # Tier 1: Use IUPAC location hint if provided, otherwise standard sections
+        if iupac_location_hint and iupac_location_hint.get('location'):
+            LOGGER.info("Tier 1: Using IUPAC location hint: %s", iupac_location_hint.get('location'))
+            if iupac_location_hint.get('compound_section_hint'):
+                LOGGER.info("Tier 1: Compound section hint: %s", iupac_location_hint.get('compound_section_hint'))
+
+            # Extract text from the specific IUPAC location
+            iupac_text = self._get_extended_text_around_location(
+                iupac_location_hint['location'],
+                before=2000,
+                after=10000
+            )
+
+            # Also check for compound-specific hints
+            compound_hint = iupac_location_hint.get('compound_section_hint', '')
+            if compound_hint and iupac_text:
+                # Search for the specific compound section
+                hint_pattern = re.escape(compound_hint)
+                match = re.search(hint_pattern, iupac_text, re.IGNORECASE)
+                if match:
+                    # Extract more focused text around the compound hint
+                    start = max(0, match.start() - 500)
+                    end = min(len(iupac_text), match.end() + 2000)
+                    iupac_text = iupac_text[start:end]
+                    LOGGER.info("Found compound hint '%s' in IUPAC section", compound_hint)
+
+            extraction_text = iupac_text or ""
+            if extraction_text:
+                LOGGER.info("Tier 1: Extracted %d chars from IUPAC location hint", len(extraction_text))
+            else:
+                LOGGER.warning("Tier 1: No text found at IUPAC location hint")
+            # Add some manuscript context
+            manuscript_text = "\n\n".join(self.ms_pages[:5])
+        else:
+            # Fallback to standard sections
+            initial_sections = initial_sections or [
+                "General procedure", "Compound characterization",
+                "Synthesis", "Experimental", "Materials and methods"
+            ]
+
+            # Extract from initial sections - search in all pages (manuscript + SI)
+            extraction_text = self._extract_sections_by_title(initial_sections)
+
+            # If no sections found by title, include first few SI pages which often have compound data
+            if not extraction_text and self.si_pages:
+                # SI often starts with compound characterization after TOC
+                si_compound_pages = "\n\n".join(self.si_pages[2:10])  # Skip first 2 pages (usually TOC)
+                extraction_text = si_compound_pages
+
+            # Include manuscript pages (first 10) for model reaction context
+            manuscript_text = "\n\n".join(self.ms_pages[:10])
 
         # Add campaign context if provided
         campaign_context = ""
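The Tier 1 branch narrows the hinted region with simple max/min clamping so the slice never runs past either end of the text. A sketch of that windowing logic in isolation; `focus_window` is a hypothetical helper written for this note, not part of the package:

```python
import re

def focus_window(text: str, hint: str, before: int = 500, after: int = 2000) -> str:
    # Clamped slicing around the first case-insensitive hit, as in the Tier 1 code above;
    # returns the text unchanged when the hint is absent.
    m = re.search(re.escape(hint), text, re.IGNORECASE)
    if not m:
        return text
    start = max(0, m.start() - before)
    end = min(len(text), m.end() + after)
    return text[start:end]

si_text = "x" * 5000 + " Compound 2a: 2-phenylethan-1-ol " + "x" * 5000
snippet = focus_window(si_text, "Compound 2a")
print(len(snippet))  # 500 + len("Compound 2a") + 2000 = 2511
```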
@@ -2033,8 +2091,7 @@ Do NOT include compound information from other campaigns.
 
 """
 
-        #
-        extraction_text = self._extract_sections_by_title(initial_sections)
+        # Combine manuscript text, campaign context, and extraction text
         if extraction_text:
             extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
@@ -2083,11 +2140,11 @@ Do NOT include compound information from other campaigns.
                     figure_images[ref] = img_b64
                     LOGGER.info("Extracted %s for compound mapping", ref)
 
-            # Full text search including
-            full_text = "\n\n".join(self.all_pages
+            # Full text search including ALL pages (manuscript + SI)
+            full_text = "\n\n".join(self.all_pages)  # Send everything
 
             final_mappings = self._extract_compound_mappings_with_figures(
-                full_text
+                full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
             )
 
             # Merge final mappings with better compound ID matching
@@ -2261,7 +2318,13 @@ Do NOT include compound information from other campaigns.
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-
+            # Pass the IUPAC location hint if we have it
+            iupac_hint = locations.get("iupac_location") if locations else None
+            compound_mappings = self._extract_compound_mappings_adaptive(
+                compound_ids,
+                campaign_filter=self.campaign_filter,
+                iupac_location_hint=iupac_hint
+            )
 
         # Add the mapped IUPAC names to the context for better extraction
         if compound_mappings:
@@ -2404,6 +2467,34 @@ Different campaigns may use different model reactions and substrates.
         LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
                     list(compound_mappings.keys()))
 
+        # First, populate IUPAC lists directly from compound mappings based on compound_type
+        substrate_iupacs_from_mappings = []
+        product_iupacs_from_mappings = []
+
+        for mapping in compound_mappings.values():
+            if mapping.iupac_name and mapping.compound_type:
+                if mapping.compound_type.lower() == "substrate":
+                    substrate_iupacs_from_mappings.append(mapping.iupac_name)
+                    LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
+                elif mapping.compound_type.lower() == "product":
+                    product_iupacs_from_mappings.append(mapping.iupac_name)
+                    LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
+
+        # Initialize or update the IUPAC lists with mapped compounds
+        if substrate_iupacs_from_mappings:
+            existing_substrates = data.get("substrate_iupac_list", []) or []
+            if isinstance(existing_substrates, list):
+                data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
+            else:
+                data["substrate_iupac_list"] = substrate_iupacs_from_mappings
+
+        if product_iupacs_from_mappings:
+            existing_products = data.get("product_iupac_list", []) or []
+            if isinstance(existing_products, list):
+                data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
+            else:
+                data["product_iupac_list"] = product_iupacs_from_mappings
+
         # Try to map substrate/product lists through compound IDs
         substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
         if isinstance(substrate_list, list):
@@ -3021,7 +3112,14 @@ def main() -> None:
             campaign_filter = all_campaigns[0]
             LOGGER.info("Detected single campaign: %s", campaign_filter)
 
-            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+            # Create campaign-specific debug directory even for single campaign
+            campaign_debug_dir = None
+            if args.debug_dir:
+                campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign_filter}"
+                campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                           campaign_filter=campaign_filter, all_campaigns=all_campaigns)
             df_metrics = extractor.run(enzyme_df)
 
@@ -3041,8 +3139,14 @@ def main() -> None:
                     LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
                     continue
 
-                # Create extractor for this campaign
-                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                # Create extractor for this campaign with campaign-specific debug directory
+                campaign_debug_dir = None
+                if args.debug_dir:
+                    campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign}"
+                    campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                    LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+
+                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                               campaign_filter=campaign, all_campaigns=all_campaigns)
 
                 # Run extraction for this campaign
|
|
3088
3192
|
df_metrics = pd.DataFrame()
|
3089
3193
|
else:
|
3090
3194
|
# No campaign information, process all enzymes together
|
3091
|
-
|
3195
|
+
campaign_debug_dir = None
|
3196
|
+
if args.debug_dir:
|
3197
|
+
campaign_debug_dir = Path(args.debug_dir) / "no_campaign"
|
3198
|
+
campaign_debug_dir.mkdir(parents=True, exist_ok=True)
|
3199
|
+
LOGGER.info("Debug directory (no campaign): %s", campaign_debug_dir)
|
3200
|
+
|
3201
|
+
extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
|
3092
3202
|
campaign_filter=campaign_filter, all_campaigns=all_campaigns)
|
3093
3203
|
df_metrics = extractor.run(enzyme_df)
|
3094
3204
|
|
debase/substrate_scope_extractor.py
CHANGED
@@ -28,6 +28,7 @@ import re
 import json
 import time
 import logging
+import subprocess
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict, Any, Union
@@ -103,6 +104,52 @@ class CompoundMapping:
     compound_type: str = "unknown"
     source_location: Optional[str] = None
 
+def is_valid_iupac_name_with_opsin(name: str) -> bool:
+    """Check if a name is a valid IUPAC name using the local OPSIN command."""
+    if not name or len(name.strip()) < 3:
+        return False
+
+    try:
+        # Use local OPSIN command to check if name can be converted to SMILES
+        process = subprocess.run(
+            ['opsin', '-o', 'smi'],
+            input=name.strip(),
+            text=True,
+            capture_output=True,
+            timeout=30
+        )
+
+        # If OPSIN successfully converts to SMILES, the name is valid IUPAC
+        if process.returncode == 0 and process.stdout.strip():
+            output = process.stdout.strip()
+            # Check if output looks like a valid SMILES (contains common SMILES characters)
+            if any(char in output for char in 'CNOS()=[]#+-'):
+                return True
+
+        return False
+
+    except Exception as e:
+        log.debug(f"OPSIN check failed for '{name}': {e}")
+        return False
+
+def _get_iupac_name(compound) -> str:
+    """Get IUPAC name for a compound, checking if the common name is already IUPAC."""
+    if not compound:
+        return ''
+
+    # If we already have an IUPAC name, use it
+    if compound.iupac_name:
+        return compound.iupac_name
+
+    # If no IUPAC name but we have a common name, check if it's already IUPAC
+    if compound.name:
+        # Check with OPSIN if the name is a valid IUPAC name
+        if is_valid_iupac_name_with_opsin(compound.name):
+            log.info(f"'{compound.name}' is already a valid IUPAC name, using it directly")
+            return compound.name
+
+    return ''
+
 # === 3. LOGGING HELPERS ===
 
 # --- Debug dump helper ----------------------------------------------------
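The new helper shells out to a local `opsin` executable, so it assumes the OPSIN command-line tool is on PATH. A small usage sketch of the same invocation, returning the SMILES instead of a boolean; `name_to_smiles` and the printed SMILES are illustrative only (OPSIN may emit an equivalent but differently written string):

```python
import subprocess
from typing import Optional

def name_to_smiles(name: str) -> Optional[str]:
    # Same CLI call as is_valid_iupac_name_with_opsin above, but surfacing the SMILES.
    proc = subprocess.run(
        ["opsin", "-o", "smi"],
        input=name.strip(),
        text=True,
        capture_output=True,
        timeout=30,
    )
    out = proc.stdout.strip()
    return out if proc.returncode == 0 and out else None

print(name_to_smiles("2-phenylethan-1-ol"))         # a SMILES such as OCCc1ccccc1
print(name_to_smiles("definitely not a molecule"))  # None
```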
@@ -2532,9 +2579,9 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
         'flag': '',
 
         'substrate_list': '; '.join(s.name for s in entry.substrates if s.name),
-        'substrate_iupac_list': '; '.join(s.iupac_name or '' for s in entry.substrates),
+        'substrate_iupac_list': '; '.join(_get_iupac_name(s) for s in entry.substrates),
         'product_list': '; '.join(p.name for p in entry.products if p.name),
-        'product_iupac_list': '; '.join(p.iupac_name or '' for p in entry.products),
+        'product_iupac_list': '; '.join(_get_iupac_name(p) for p in entry.products),
 
         'cofactor_list': '; '.join(c.name for c in entry.cofactors if c.name),
         'cofactor_iupac_list': '; '.join(c.iupac_name or '' for c in entry.cofactors),