debase 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.6"
3
+ __version__ = "0.1.8"
@@ -1944,7 +1944,13 @@ def _merge_lineage_and_sequences(
1944
1944
  df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
1945
1945
 
1946
1946
  # 2a. If we have unmatched sequences and a model, use Gemini to match them
1947
+ log.info(f"Model available: {model is not None}, Sequences found: {len(df_seq)}")
1947
1948
  if model and len(df_seq) > 0:
1949
+ # Log initial state
1950
+ log.info(f"Merge attempt: {len(df_lin)} lineage entries, {len(df_seq)} sequences")
1951
+ log.info(f"Lineage IDs: {df_lin['variant_id'].tolist()[:5]}...")
1952
+ log.info(f"Sequence IDs: {df_seq['variant_id'].tolist()[:5]}...")
1953
+
1948
1954
  # Find lineage entries without sequences
1949
1955
  missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
1950
1956
  unmatched_lineage = df[missing_seq]['variant_id'].tolist()
@@ -1999,6 +2005,10 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
1999
2005
 
2000
2006
  log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
2001
2007
 
2008
+ # Log the final state after all matches
2009
+ matched_count = (~df['aa_seq'].isna()).sum()
2010
+ log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
2011
+
2002
2012
  except Exception as e:
2003
2013
  log.warning(f"Failed to match variants using Gemini: {e}")
2004
2014
 
@@ -2019,6 +2029,12 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
2019
2029
  # 5. Sort rows: primary by generation, then by variant_id
2020
2030
  df = df.sort_values(["generation", "variant_id"], kind="mergesort")
2021
2031
 
2032
+ # Debug: Log final merge state
2033
+ seq_count = (~df['aa_seq'].isna()).sum()
2034
+ log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
2035
+ if seq_count > 0:
2036
+ log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
2037
+
2022
2038
  return df
2023
2039
 
2024
2040
  # --- 8.3 Public API -----------------------------------------------------------
@@ -2047,6 +2063,10 @@ def merge_and_score(
2047
2063
  if missing_rate > 0.5:
2048
2064
  log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
2049
2065
 
2066
+ # Debug log before returning
2067
+ seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
2068
+ log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
2069
+
2050
2070
  return df
2051
2071
 
2052
2072
  # -------------------------------------------------------------------- end 8 ---
@@ -2230,6 +2250,14 @@ def run_pipeline(
2230
2250
  output_csv_path = Path(output_csv)
2231
2251
  # Save final data with sequences using same filename (overwrites lineage-only)
2232
2252
  sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
2253
+
2254
+ # Debug: Log what we're about to save
2255
+ seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
2256
+ log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
2257
+ if seq_count > 0 and 'aa_seq' in df_final:
2258
+ with_seq = df_final[~df_final['aa_seq'].isna()]
2259
+ log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
2260
+
2233
2261
  df_final.to_csv(sequence_path, index=False)
2234
2262
  log.info(
2235
2263
  "Overwrote with final results -> %s (%.1f kB)",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.6
3
+ Version: 0.1.8
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,17 +1,17 @@
1
1
  debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
2
  debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
3
  debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=0-ypEiBe2kUuZ71tMon-TVN7tQNbXA_-yM2NzlyMWuk,49
4
+ debase/_version.py,sha256=pU1LNmYqDA6wql-sQ9H8cktGdOqOUTonX-sx1fgYV2Y,49
5
5
  debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
6
  debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
- debase/enzyme_lineage_extractor.py,sha256=_kVsuOXOR8qhrOIy3mKNJuac3joK6goke628nAJoj88,94183
7
+ debase/enzyme_lineage_extractor.py,sha256=k62qIp37ONYJBWT8D7ROh7ooYhz871BlCXJmAduq8js,95764
8
8
  debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
9
9
  debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
10
10
  debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
11
11
  debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
12
- debase-0.1.6.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.6.dist-info/METADATA,sha256=mLzMY4LQx3SoOsirAiKhcqMofiMZ_D7jjB68ohUplFI,10789
14
- debase-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.6.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.6.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.6.dist-info/RECORD,,
12
+ debase-0.1.8.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
+ debase-0.1.8.dist-info/METADATA,sha256=uGoC1Ebi663KiWpLbzkbcpKHYFxKjTJXoU1lV4cbVJ8,10789
14
+ debase-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ debase-0.1.8.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
+ debase-0.1.8.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
+ debase-0.1.8.dist-info/RECORD,,
File without changes