debase 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +28 -0
- {debase-0.1.6.dist-info → debase-0.1.8.dist-info}/METADATA +1 -1
- {debase-0.1.6.dist-info → debase-0.1.8.dist-info}/RECORD +8 -8
- {debase-0.1.6.dist-info → debase-0.1.8.dist-info}/WHEEL +0 -0
- {debase-0.1.6.dist-info → debase-0.1.8.dist-info}/entry_points.txt +0 -0
- {debase-0.1.6.dist-info → debase-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.6.dist-info → debase-0.1.8.dist-info}/top_level.txt +0 -0
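debase/_version.py
CHANGED
(+1 -1; the hunk for this file is missing from this extract)
debase/enzyme_lineage_extractor.py
CHANGED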
@@ -1944,7 +1944,13 @@ def _merge_lineage_and_sequences(
|
|
1944
1944
|
df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
|
1945
1945
|
|
1946
1946
|
# 2a. If we have unmatched sequences and a model, use Gemini to match them
|
1947
|
+
log.info(f"Model available: {model is not None}, Sequences found: {len(df_seq)}")
|
1947
1948
|
if model and len(df_seq) > 0:
|
1949
|
+
# Log initial state
|
1950
|
+
log.info(f"Merge attempt: {len(df_lin)} lineage entries, {len(df_seq)} sequences")
|
1951
|
+
log.info(f"Lineage IDs: {df_lin['variant_id'].tolist()[:5]}...")
|
1952
|
+
log.info(f"Sequence IDs: {df_seq['variant_id'].tolist()[:5]}...")
|
1953
|
+
|
1948
1954
|
# Find lineage entries without sequences
|
1949
1955
|
missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
|
1950
1956
|
unmatched_lineage = df[missing_seq]['variant_id'].tolist()
|
@@ -1999,6 +2005,10 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
|
|
1999
2005
|
|
2000
2006
|
log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
|
2001
2007
|
|
2008
|
+
# Log the final state after all matches
|
2009
|
+
matched_count = (~df['aa_seq'].isna()).sum()
|
2010
|
+
log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
|
2011
|
+
|
2002
2012
|
except Exception as e:
|
2003
2013
|
log.warning(f"Failed to match variants using Gemini: {e}")
|
2004
2014
|
|
@@ -2019,6 +2029,12 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
|
|
2019
2029
|
# 5. Sort rows: primary by generation, then by variant_id
|
2020
2030
|
df = df.sort_values(["generation", "variant_id"], kind="mergesort")
|
2021
2031
|
|
2032
|
+
# Debug: Log final merge state
|
2033
|
+
seq_count = (~df['aa_seq'].isna()).sum()
|
2034
|
+
log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
|
2035
|
+
if seq_count > 0:
|
2036
|
+
log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
|
2037
|
+
|
2022
2038
|
return df
|
2023
2039
|
|
2024
2040
|
# --- 8.3 Public API -----------------------------------------------------------
|
@@ -2047,6 +2063,10 @@ def merge_and_score(
|
|
2047
2063
|
if missing_rate > 0.5:
|
2048
2064
|
log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
|
2049
2065
|
|
2066
|
+
# Debug log before returning
|
2067
|
+
seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
|
2068
|
+
log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
|
2069
|
+
|
2050
2070
|
return df
|
2051
2071
|
|
2052
2072
|
# -------------------------------------------------------------------- end 8 ---
|
@@ -2230,6 +2250,14 @@ def run_pipeline(
|
|
2230
2250
|
output_csv_path = Path(output_csv)
|
2231
2251
|
# Save final data with sequences using same filename (overwrites lineage-only)
|
2232
2252
|
sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
|
2253
|
+
|
2254
|
+
# Debug: Log what we're about to save
|
2255
|
+
seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
|
2256
|
+
log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
|
2257
|
+
if seq_count > 0 and 'aa_seq' in df_final:
|
2258
|
+
with_seq = df_final[~df_final['aa_seq'].isna()]
|
2259
|
+
log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
|
2260
|
+
|
2233
2261
|
df_final.to_csv(sequence_path, index=False)
|
2234
2262
|
log.info(
|
2235
2263
|
"Overwrote with final results -> %s (%.1f kB)",
|
{debase-0.1.6.dist-info → debase-0.1.8.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
|
2
2
|
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
3
3
|
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
4
|
-
debase/_version.py,sha256=
|
4
|
+
debase/_version.py,sha256=pU1LNmYqDA6wql-sQ9H8cktGdOqOUTonX-sx1fgYV2Y,49
|
5
5
|
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
6
6
|
debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
|
7
|
-
debase/enzyme_lineage_extractor.py,sha256=
|
7
|
+
debase/enzyme_lineage_extractor.py,sha256=k62qIp37ONYJBWT8D7ROh7ooYhz871BlCXJmAduq8js,95764
|
8
8
|
debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
|
9
9
|
debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
|
10
10
|
debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
|
11
11
|
debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
|
12
|
-
debase-0.1.
|
13
|
-
debase-0.1.
|
14
|
-
debase-0.1.
|
15
|
-
debase-0.1.
|
16
|
-
debase-0.1.
|
17
|
-
debase-0.1.
|
12
|
+
debase-0.1.8.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
13
|
+
debase-0.1.8.dist-info/METADATA,sha256=uGoC1Ebi663KiWpLbzkbcpKHYFxKjTJXoU1lV4cbVJ8,10789
|
14
|
+
debase-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
debase-0.1.8.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
16
|
+
debase-0.1.8.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
17
|
+
debase-0.1.8.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|