debase 0.6.1__tar.gz → 0.7.0__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- {debase-0.6.1/src/debase.egg-info → debase-0.7.0}/PKG-INFO +1 -1
- {debase-0.6.1 → debase-0.7.0}/src/debase/_version.py +1 -1
- {debase-0.6.1 → debase-0.7.0}/src/debase/caption_pattern.py +7 -2
- {debase-0.6.1 → debase-0.7.0}/src/debase/cleanup_sequence.py +34 -6
- {debase-0.6.1 → debase-0.7.0}/src/debase/enzyme_lineage_extractor.py +673 -221
- {debase-0.6.1 → debase-0.7.0}/src/debase/lineage_format.py +55 -6
- {debase-0.6.1 → debase-0.7.0}/src/debase/reaction_info_extractor.py +282 -97
- {debase-0.6.1 → debase-0.7.0}/src/debase/substrate_scope_extractor.py +218 -65
- {debase-0.6.1 → debase-0.7.0/src/debase.egg-info}/PKG-INFO +1 -1
- {debase-0.6.1 → debase-0.7.0}/.gitignore +0 -0
- {debase-0.6.1 → debase-0.7.0}/LICENSE +0 -0
- {debase-0.6.1 → debase-0.7.0}/MANIFEST.in +0 -0
- {debase-0.6.1 → debase-0.7.0}/README.md +0 -0
- {debase-0.6.1 → debase-0.7.0}/environment.yml +0 -0
- {debase-0.6.1 → debase-0.7.0}/manuscript/DEBase_LLM_Validater.ipynb +0 -0
- {debase-0.6.1 → debase-0.7.0}/pyproject.toml +0 -0
- {debase-0.6.1 → debase-0.7.0}/setup.cfg +0 -0
- {debase-0.6.1 → debase-0.7.0}/setup.py +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/__init__.py +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase/__init__.py +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase/__main__.py +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase/build_db.py +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase/campaign_utils.py +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase/wrapper.py +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/SOURCES.txt +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/top_level.txt +0 -0
@@ -8,15 +8,20 @@ formats found in scientific papers, including:
|
|
8
8
|
- Other types: Scheme 1, Chart 1
|
9
9
|
- Page headers: S14 Table 5
|
10
10
|
- Various punctuation: Figure 1. Figure 1: Figure 1 |
|
11
|
+
- Inline captions: ...text Table 1. Caption text...
|
11
12
|
"""
|
12
13
|
|
13
14
|
import re
|
14
15
|
|
15
16
|
# Universal caption pattern that handles all common formats
|
17
|
+
# Now includes both start-of-line and inline caption patterns
|
16
18
|
UNIVERSAL_CAPTION_PATTERN = re.compile(
|
17
19
|
r"""
|
18
|
-
|
19
|
-
|
20
|
+
(?: # Non-capturing group for position
|
21
|
+
^[^\n]{0,20}? # Start of line with up to 20 chars before
|
22
|
+
| # OR
|
23
|
+
(?<=[a-zA-Z0-9\s]) # Look-behind for alphanumeric or space (for inline)
|
24
|
+
)
|
20
25
|
( # Start capture group
|
21
26
|
(?:Extended\s+Data\s+)? # Optional "Extended Data" prefix
|
22
27
|
(?:ED\s+)? # Optional "ED" prefix
|
@@ -1016,13 +1016,38 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
|
1016
1016
|
|
1017
1017
|
if source_enzyme_id:
|
1018
1018
|
# Find the source enzyme's sequence in the dataframe
|
1019
|
+
# Prefer sequences from OTHER campaigns (not the current empty campaign)
|
1019
1020
|
source_rows = df[df['enzyme_id'] == source_enzyme_id]
|
1020
1021
|
if source_rows.empty:
|
1021
1022
|
log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
|
1022
1023
|
else:
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1024
|
+
# Look for a row with a sequence, preferring other campaigns
|
1025
|
+
source_sequence = None
|
1026
|
+
source_row_idx = None
|
1027
|
+
|
1028
|
+
# First, try to find a row with sequence from a different campaign
|
1029
|
+
for idx, row in source_rows.iterrows():
|
1030
|
+
seq = str(row['protein_sequence']).strip()
|
1031
|
+
if seq and seq.lower() not in ["nan", "none", ""]:
|
1032
|
+
# Check if this is from a different campaign
|
1033
|
+
if row['campaign_id'] != campaign_id:
|
1034
|
+
source_sequence = seq
|
1035
|
+
source_row_idx = idx
|
1036
|
+
log.info(f"Found source sequence for {source_enzyme_id} from campaign {row['campaign_id']}")
|
1037
|
+
break
|
1038
|
+
|
1039
|
+
# If not found in other campaigns, try any row with sequence
|
1040
|
+
if not source_sequence:
|
1041
|
+
for idx, row in source_rows.iterrows():
|
1042
|
+
seq = str(row['protein_sequence']).strip()
|
1043
|
+
if seq and seq.lower() not in ["nan", "none", ""]:
|
1044
|
+
source_sequence = seq
|
1045
|
+
source_row_idx = idx
|
1046
|
+
log.info(f"Found source sequence for {source_enzyme_id} from same campaign {row['campaign_id']}")
|
1047
|
+
break
|
1048
|
+
|
1049
|
+
if not source_sequence:
|
1050
|
+
log.warning(f"Source enzyme {source_enzyme_id} has no sequence in any campaign")
|
1026
1051
|
else:
|
1027
1052
|
# Find the target enzyme in our empty list
|
1028
1053
|
seed_found = False
|
@@ -1031,7 +1056,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
|
1031
1056
|
if relationship_type == "EXACT_MATCH":
|
1032
1057
|
# Exact match - copy sequence directly
|
1033
1058
|
df.at[entry['idx'], 'protein_sequence'] = source_sequence
|
1034
|
-
df.at[entry['idx'], 'flag']
|
1059
|
+
current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
|
1060
|
+
df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_exact"
|
1035
1061
|
log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
|
1036
1062
|
seed_found = True
|
1037
1063
|
|
@@ -1045,7 +1071,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
|
1045
1071
|
|
1046
1072
|
if success:
|
1047
1073
|
df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
|
1048
|
-
df.at[entry['idx'], 'flag']
|
1074
|
+
current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
|
1075
|
+
df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent"
|
1049
1076
|
log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
|
1050
1077
|
seed_found = True
|
1051
1078
|
else:
|
@@ -1053,7 +1080,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
|
1053
1080
|
else:
|
1054
1081
|
# No mutations - use parent sequence directly
|
1055
1082
|
df.at[entry['idx'], 'protein_sequence'] = source_sequence
|
1056
|
-
df.at[entry['idx'], 'flag']
|
1083
|
+
current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
|
1084
|
+
df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent_no_mutations"
|
1057
1085
|
log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
|
1058
1086
|
seed_found = True
|
1059
1087
|
break
|