debase 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +44 -10
- {debase-0.1.9.dist-info → debase-0.1.11.dist-info}/METADATA +1 -1
- {debase-0.1.9.dist-info → debase-0.1.11.dist-info}/RECORD +8 -8
- {debase-0.1.9.dist-info → debase-0.1.11.dist-info}/WHEEL +0 -0
- {debase-0.1.9.dist-info → debase-0.1.11.dist-info}/entry_points.txt +0 -0
- {debase-0.1.9.dist-info → debase-0.1.11.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.9.dist-info → debase-0.1.11.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED
debase/enzyme_lineage_extractor.py
CHANGED
@@ -2024,21 +2024,55 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
|
|
2024
2024
|
text = text[4:].strip()
|
2025
2025
|
|
2026
2026
|
matches = json.loads(text)
|
2027
|
+
log.info(f"Gemini returned matches: {matches}")
|
2028
|
+
|
2029
|
+
# Debug: Log what sequences we actually have
|
2030
|
+
log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
|
2027
2031
|
|
2028
2032
|
# Apply the matches
|
2029
2033
|
for lineage_id, seq_id in matches.items():
|
2030
|
-
if lineage_id in unmatched_lineage
|
2031
|
-
#
|
2032
|
-
seq_data =
|
2034
|
+
if lineage_id in unmatched_lineage:
|
2035
|
+
# Find the sequence data - be flexible with matching
|
2036
|
+
seq_data = None
|
2033
2037
|
|
2034
|
-
#
|
2035
|
-
|
2036
|
-
|
2037
|
-
|
2038
|
-
|
2039
|
-
|
2038
|
+
# First try exact match
|
2039
|
+
seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
|
2040
|
+
if len(seq_matches) > 0:
|
2041
|
+
seq_data = seq_matches.iloc[0]
|
2042
|
+
else:
|
2043
|
+
# Try to find by checking various matching strategies
|
2044
|
+
for idx, row in unmatched_seqs.iterrows():
|
2045
|
+
variant_id = row['variant_id']
|
2046
|
+
# Check if one is contained in the other
|
2047
|
+
if seq_id in variant_id or variant_id in seq_id:
|
2048
|
+
seq_data = row
|
2049
|
+
break
|
2050
|
+
# Check if they share the same core identifier (e.g., G0, G1, etc.)
|
2051
|
+
seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
|
2052
|
+
variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
|
2053
|
+
if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
|
2054
|
+
seq_data = row
|
2055
|
+
break
|
2040
2056
|
|
2041
|
-
|
2057
|
+
if seq_data is not None:
|
2058
|
+
# Update the dataframe
|
2059
|
+
mask = df['variant_id'] == lineage_id
|
2060
|
+
if mask.any():
|
2061
|
+
# Log before update
|
2062
|
+
log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
|
2063
|
+
|
2064
|
+
df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
|
2065
|
+
df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
|
2066
|
+
df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
|
2067
|
+
df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
|
2068
|
+
|
2069
|
+
# Log after update
|
2070
|
+
log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
|
2071
|
+
log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
|
2072
|
+
else:
|
2073
|
+
log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
|
2074
|
+
else:
|
2075
|
+
log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
|
2042
2076
|
|
2043
2077
|
# Log the final state after all matches
|
2044
2078
|
matched_count = (~df['aa_seq'].isna()).sum()
|
{debase-0.1.9.dist-info → debase-0.1.11.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
|
2
2
|
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
3
3
|
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
4
|
-
debase/_version.py,sha256=
|
4
|
+
debase/_version.py,sha256=L4sqaU-oAJRWrcboH-vA95jHfUiXr5-fAsrF7lqZSyQ,50
|
5
5
|
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
6
6
|
debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
|
7
|
-
debase/enzyme_lineage_extractor.py,sha256=
|
7
|
+
debase/enzyme_lineage_extractor.py,sha256=at4OYHdXtgMku1FR_6AsHWk64UKInWkGQL9m3H6cKIQ,99809
|
8
8
|
debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
|
9
9
|
debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
|
10
10
|
debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
|
11
11
|
debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
|
12
|
-
debase-0.1.
|
13
|
-
debase-0.1.
|
14
|
-
debase-0.1.
|
15
|
-
debase-0.1.
|
16
|
-
debase-0.1.
|
17
|
-
debase-0.1.
|
12
|
+
debase-0.1.11.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
13
|
+
debase-0.1.11.dist-info/METADATA,sha256=ZSR0Yl36Al_rQm9Ht9jut7om3xQT8yqyobIjEUH_Xfo,10790
|
14
|
+
debase-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
debase-0.1.11.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
16
|
+
debase-0.1.11.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
17
|
+
debase-0.1.11.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|