debase 0.6.1__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {debase-0.6.1/src/debase.egg-info → debase-0.7.0}/PKG-INFO +1 -1
  2. {debase-0.6.1 → debase-0.7.0}/src/debase/_version.py +1 -1
  3. {debase-0.6.1 → debase-0.7.0}/src/debase/caption_pattern.py +7 -2
  4. {debase-0.6.1 → debase-0.7.0}/src/debase/cleanup_sequence.py +34 -6
  5. {debase-0.6.1 → debase-0.7.0}/src/debase/enzyme_lineage_extractor.py +673 -221
  6. {debase-0.6.1 → debase-0.7.0}/src/debase/lineage_format.py +55 -6
  7. {debase-0.6.1 → debase-0.7.0}/src/debase/reaction_info_extractor.py +282 -97
  8. {debase-0.6.1 → debase-0.7.0}/src/debase/substrate_scope_extractor.py +218 -65
  9. {debase-0.6.1 → debase-0.7.0/src/debase.egg-info}/PKG-INFO +1 -1
  10. {debase-0.6.1 → debase-0.7.0}/.gitignore +0 -0
  11. {debase-0.6.1 → debase-0.7.0}/LICENSE +0 -0
  12. {debase-0.6.1 → debase-0.7.0}/MANIFEST.in +0 -0
  13. {debase-0.6.1 → debase-0.7.0}/README.md +0 -0
  14. {debase-0.6.1 → debase-0.7.0}/environment.yml +0 -0
  15. {debase-0.6.1 → debase-0.7.0}/manuscript/DEBase_LLM_Validater.ipynb +0 -0
  16. {debase-0.6.1 → debase-0.7.0}/pyproject.toml +0 -0
  17. {debase-0.6.1 → debase-0.7.0}/setup.cfg +0 -0
  18. {debase-0.6.1 → debase-0.7.0}/setup.py +0 -0
  19. {debase-0.6.1 → debase-0.7.0}/src/__init__.py +0 -0
  20. {debase-0.6.1 → debase-0.7.0}/src/debase/__init__.py +0 -0
  21. {debase-0.6.1 → debase-0.7.0}/src/debase/__main__.py +0 -0
  22. {debase-0.6.1 → debase-0.7.0}/src/debase/build_db.py +0 -0
  23. {debase-0.6.1 → debase-0.7.0}/src/debase/campaign_utils.py +0 -0
  24. {debase-0.6.1 → debase-0.7.0}/src/debase/wrapper.py +0 -0
  25. {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/SOURCES.txt +0 -0
  26. {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/dependency_links.txt +0 -0
  27. {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/entry_points.txt +0 -0
  28. {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/requires.txt +0 -0
  29. {debase-0.6.1 → debase-0.7.0}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.6.1
3
+ Version: 0.7.0
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.6.1"
3
+ __version__ = "0.7.0"
@@ -8,15 +8,20 @@ formats found in scientific papers, including:
8
8
  - Other types: Scheme 1, Chart 1
9
9
  - Page headers: S14 Table 5
10
10
  - Various punctuation: Figure 1. Figure 1: Figure 1 |
11
+ - Inline captions: ...text Table 1. Caption text...
11
12
  """
12
13
 
13
14
  import re
14
15
 
15
16
  # Universal caption pattern that handles all common formats
17
+ # Now includes both start-of-line and inline caption patterns
16
18
  UNIVERSAL_CAPTION_PATTERN = re.compile(
17
19
  r"""
18
- ^ # Start of line
19
- [^\n]{0,20}? # Up to 20 chars of any content (page headers, etc.)
20
+ (?: # Non-capturing group for position
21
+ ^[^\n]{0,20}? # Start of line with up to 20 chars before
22
+ | # OR
23
+ (?<=[a-zA-Z0-9\s]) # Look-behind for alphanumeric or space (for inline)
24
+ )
20
25
  ( # Start capture group
21
26
  (?:Extended\s+Data\s+)? # Optional "Extended Data" prefix
22
27
  (?:ED\s+)? # Optional "ED" prefix
@@ -1016,13 +1016,38 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
1016
1016
 
1017
1017
  if source_enzyme_id:
1018
1018
  # Find the source enzyme's sequence in the dataframe
1019
+ # Prefer sequences from OTHER campaigns (not the current empty campaign)
1019
1020
  source_rows = df[df['enzyme_id'] == source_enzyme_id]
1020
1021
  if source_rows.empty:
1021
1022
  log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
1022
1023
  else:
1023
- source_sequence = str(source_rows.iloc[0]['protein_sequence']).strip()
1024
- if not source_sequence or source_sequence.lower() in ["nan", "none", ""]:
1025
- log.warning(f"Source enzyme {source_enzyme_id} has no sequence")
1024
+ # Look for a row with a sequence, preferring other campaigns
1025
+ source_sequence = None
1026
+ source_row_idx = None
1027
+
1028
+ # First, try to find a row with sequence from a different campaign
1029
+ for idx, row in source_rows.iterrows():
1030
+ seq = str(row['protein_sequence']).strip()
1031
+ if seq and seq.lower() not in ["nan", "none", ""]:
1032
+ # Check if this is from a different campaign
1033
+ if row['campaign_id'] != campaign_id:
1034
+ source_sequence = seq
1035
+ source_row_idx = idx
1036
+ log.info(f"Found source sequence for {source_enzyme_id} from campaign {row['campaign_id']}")
1037
+ break
1038
+
1039
+ # If not found in other campaigns, try any row with sequence
1040
+ if not source_sequence:
1041
+ for idx, row in source_rows.iterrows():
1042
+ seq = str(row['protein_sequence']).strip()
1043
+ if seq and seq.lower() not in ["nan", "none", ""]:
1044
+ source_sequence = seq
1045
+ source_row_idx = idx
1046
+ log.info(f"Found source sequence for {source_enzyme_id} from same campaign {row['campaign_id']}")
1047
+ break
1048
+
1049
+ if not source_sequence:
1050
+ log.warning(f"Source enzyme {source_enzyme_id} has no sequence in any campaign")
1026
1051
  else:
1027
1052
  # Find the target enzyme in our empty list
1028
1053
  seed_found = False
@@ -1031,7 +1056,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
1031
1056
  if relationship_type == "EXACT_MATCH":
1032
1057
  # Exact match - copy sequence directly
1033
1058
  df.at[entry['idx'], 'protein_sequence'] = source_sequence
1034
- df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_exact"
1059
+ current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
1060
+ df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_exact"
1035
1061
  log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
1036
1062
  seed_found = True
1037
1063
 
@@ -1045,7 +1071,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
1045
1071
 
1046
1072
  if success:
1047
1073
  df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
1048
- df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent"
1074
+ current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
1075
+ df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent"
1049
1076
  log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
1050
1077
  seed_found = True
1051
1078
  else:
@@ -1053,7 +1080,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
1053
1080
  else:
1054
1081
  # No mutations - use parent sequence directly
1055
1082
  df.at[entry['idx'], 'protein_sequence'] = source_sequence
1056
- df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent_no_mutations"
1083
+ current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
1084
+ df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent_no_mutations"
1057
1085
  log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
1058
1086
  seed_found = True
1059
1087
  break