PyPI - debase - Versions diffs - 0.6.1__tar.gz → 0.7.0__tar.gz - Mend

debase 0.6.1__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

{debase-0.6.1/src/debase.egg-info → debase-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.6.1
+Version: 0.7.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.6.1 → debase-0.7.0}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.6.1"
+__version__ = "0.7.0"

{debase-0.6.1 → debase-0.7.0}/src/debase/caption_pattern.py RENAMED Viewed

@@ -8,15 +8,20 @@ formats found in scientific papers, including:
 - Other types: Scheme 1, Chart 1
 - Page headers: S14 Table 5
 - Various punctuation: Figure 1. Figure 1: Figure 1 |
+- Inline captions: ...text Table 1. Caption text...
 """
 import re
 # Universal caption pattern that handles all common formats
+# Now includes both start-of-line and inline caption patterns
 UNIVERSAL_CAPTION_PATTERN = re.compile(
     r"""
-    ^                                      # Start of line
-    [^\n]{0,20}?                          # Up to 20 chars of any content (page headers, etc.)
+    (?:                                    # Non-capturing group for position
+        ^[^\n]{0,20}?                      # Start of line with up to 20 chars before
+    |                                      # OR
+        (?<=[a-zA-Z0-9\s])                # Look-behind for alphanumeric or space (for inline)
+    )
     (                                      # Start capture group
         (?:Extended\s+Data\s+)?           # Optional "Extended Data" prefix
         (?:ED\s+)?                        # Optional "ED" prefix

{debase-0.6.1 → debase-0.7.0}/src/debase/cleanup_sequence.py RENAMED Viewed

@@ -1016,13 +1016,38 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
                         if source_enzyme_id:
                             # Find the source enzyme's sequence in the dataframe
+                            # Prefer sequences from OTHER campaigns (not the current empty campaign)
                             source_rows = df[df['enzyme_id'] == source_enzyme_id]
                             if source_rows.empty:
                                 log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
                             else:
-                                source_sequence = str(source_rows.iloc[0]['protein_sequence']).strip()
-                                if not source_sequence or source_sequence.lower() in ["nan", "none", ""]:
-                                    log.warning(f"Source enzyme {source_enzyme_id} has no sequence")
+                                # Look for a row with a sequence, preferring other campaigns
+                                source_sequence = None
+                                source_row_idx = None
+                                # First, try to find a row with sequence from a different campaign
+                                for idx, row in source_rows.iterrows():
+                                    seq = str(row['protein_sequence']).strip()
+                                    if seq and seq.lower() not in ["nan", "none", ""]:
+                                        # Check if this is from a different campaign
+                                        if row['campaign_id'] != campaign_id:
+                                            source_sequence = seq
+                                            source_row_idx = idx
+                                            log.info(f"Found source sequence for {source_enzyme_id} from campaign {row['campaign_id']}")
+                                            break
+                                # If not found in other campaigns, try any row with sequence
+                                if not source_sequence:
+                                    for idx, row in source_rows.iterrows():
+                                        seq = str(row['protein_sequence']).strip()
+                                        if seq and seq.lower() not in ["nan", "none", ""]:
+                                            source_sequence = seq
+                                            source_row_idx = idx
+                                            log.info(f"Found source sequence for {source_enzyme_id} from same campaign {row['campaign_id']}")
+                                            break
+                                if not source_sequence:
+                                    log.warning(f"Source enzyme {source_enzyme_id} has no sequence in any campaign")
                                 else:
                                     # Find the target enzyme in our empty list
                                     seed_found = False
@@ -1031,7 +1056,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
                                             if relationship_type == "EXACT_MATCH":
                                                 # Exact match - copy sequence directly
                                                 df.at[entry['idx'], 'protein_sequence'] = source_sequence
-                                                df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_exact"
+                                                current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
+                                                df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_exact"
                                                 log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
                                                 seed_found = True
@@ -1045,7 +1071,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
                                                     if success:
                                                         df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
-                                                        df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent"
+                                                        current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
+                                                        df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent"
                                                         log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
                                                         seed_found = True
                                                     else:
@@ -1053,7 +1080,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
                                                 else:
                                                     # No mutations - use parent sequence directly
                                                     df.at[entry['idx'], 'protein_sequence'] = source_sequence
-                                                    df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent_no_mutations"
+                                                    current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
+                                                    df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent_no_mutations"
                                                     log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
                                                     seed_found = True
                                             break

debase 0.6.1__tar.gz → 0.7.0__tar.gz

debase 0.6.1__tar.gz → 0.7.0__tar.gz