debase 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.9"
3
+ __version__ = "0.1.11"
@@ -2024,21 +2024,55 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
2024
2024
  text = text[4:].strip()
2025
2025
 
2026
2026
  matches = json.loads(text)
2027
+ log.info(f"Gemini returned matches: {matches}")
2028
+
2029
+ # Debug: Log what sequences we actually have
2030
+ log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
2027
2031
 
2028
2032
  # Apply the matches
2029
2033
  for lineage_id, seq_id in matches.items():
2030
- if lineage_id in unmatched_lineage and seq_id in unmatched_seqs['variant_id'].values:
2031
- # Get the sequence data
2032
- seq_data = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id].iloc[0]
2034
+ if lineage_id in unmatched_lineage:
2035
+ # Find the sequence data - be flexible with matching
2036
+ seq_data = None
2033
2037
 
2034
- # Update the dataframe
2035
- mask = df['variant_id'] == lineage_id
2036
- df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
2037
- df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
2038
- df.loc[mask, 'seq_confidence'] = seq_data['seq_confidence']
2039
- df.loc[mask, 'truncated'] = seq_data['truncated']
2038
+ # First try exact match
2039
+ seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
2040
+ if len(seq_matches) > 0:
2041
+ seq_data = seq_matches.iloc[0]
2042
+ else:
2043
+ # Try to find by checking various matching strategies
2044
+ for idx, row in unmatched_seqs.iterrows():
2045
+ variant_id = row['variant_id']
2046
+ # Check if one is contained in the other
2047
+ if seq_id in variant_id or variant_id in seq_id:
2048
+ seq_data = row
2049
+ break
2050
+ # Check if they share the same core identifier (e.g., G0, G1, etc.)
2051
+ seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
2052
+ variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
2053
+ if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
2054
+ seq_data = row
2055
+ break
2040
2056
 
2041
- log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
2057
+ if seq_data is not None:
2058
+ # Update the dataframe
2059
+ mask = df['variant_id'] == lineage_id
2060
+ if mask.any():
2061
+ # Log before update
2062
+ log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
2063
+
2064
+ df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
2065
+ df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
2066
+ df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
2067
+ df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
2068
+
2069
+ # Log after update
2070
+ log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
2071
+ log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
2072
+ else:
2073
+ log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
2074
+ else:
2075
+ log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
2042
2076
 
2043
2077
  # Log the final state after all matches
2044
2078
  matched_count = (~df['aa_seq'].isna()).sum()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.9
3
+ Version: 0.1.11
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,17 +1,17 @@
1
1
  debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
2
  debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
3
  debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=j8HFA4JhFiintNU67gau8Re8N3rsxPqodcW8xAgdwqY,49
4
+ debase/_version.py,sha256=L4sqaU-oAJRWrcboH-vA95jHfUiXr5-fAsrF7lqZSyQ,50
5
5
  debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
6
  debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
- debase/enzyme_lineage_extractor.py,sha256=mHKo6cxQdcAFuthQTpxc4fsGH73JO3VuLXSsixA7mOA,97421
7
+ debase/enzyme_lineage_extractor.py,sha256=at4OYHdXtgMku1FR_6AsHWk64UKInWkGQL9m3H6cKIQ,99809
8
8
  debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
9
9
  debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
10
10
  debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
11
11
  debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
12
- debase-0.1.9.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.9.dist-info/METADATA,sha256=QVjGEYd1VWDmQszko8IQ5jJ9xMQoT45SCY_oG9XvbMs,10789
14
- debase-0.1.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.9.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.9.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.9.dist-info/RECORD,,
12
+ debase-0.1.11.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
+ debase-0.1.11.dist-info/METADATA,sha256=ZSR0Yl36Al_rQm9Ht9jut7om3xQT8yqyobIjEUH_Xfo,10790
14
+ debase-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ debase-0.1.11.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
+ debase-0.1.11.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
+ debase-0.1.11.dist-info/RECORD,,