debase 0.1.7__tar.gz → 0.1.8__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {debase-0.1.7 → debase-0.1.8}/PKG-INFO +1 -1
- {debase-0.1.7 → debase-0.1.8}/src/debase/_version.py +1 -1
- {debase-0.1.7 → debase-0.1.8}/src/debase/enzyme_lineage_extractor.py +22 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase.egg-info/PKG-INFO +1 -1
- {debase-0.1.7 → debase-0.1.8}/.gitignore +0 -0
- {debase-0.1.7 → debase-0.1.8}/CONTRIBUTING.md +0 -0
- {debase-0.1.7 → debase-0.1.8}/LICENSE +0 -0
- {debase-0.1.7 → debase-0.1.8}/MANIFEST.in +0 -0
- {debase-0.1.7 → debase-0.1.8}/README.md +0 -0
- {debase-0.1.7 → debase-0.1.8}/docs/README.md +0 -0
- {debase-0.1.7 → debase-0.1.8}/docs/examples/README.md +0 -0
- {debase-0.1.7 → debase-0.1.8}/environment.yml +0 -0
- {debase-0.1.7 → debase-0.1.8}/pyproject.toml +0 -0
- {debase-0.1.7 → debase-0.1.8}/setup.cfg +0 -0
- {debase-0.1.7 → debase-0.1.8}/setup.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/__init__.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/PIPELINE_FLOW.md +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/__init__.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/__main__.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/build_db.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/lineage_format.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/reaction_info_extractor.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/substrate_scope_extractor.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase/wrapper.py +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase.egg-info/SOURCES.txt +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.1.7 → debase-0.1.8}/src/debase.egg-info/top_level.txt +0 -0
@@ -2005,6 +2005,10 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
|
|
2005
2005
|
|
2006
2006
|
log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
|
2007
2007
|
|
2008
|
+
# Log the final state after all matches
|
2009
|
+
matched_count = (~df['aa_seq'].isna()).sum()
|
2010
|
+
log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
|
2011
|
+
|
2008
2012
|
except Exception as e:
|
2009
2013
|
log.warning(f"Failed to match variants using Gemini: {e}")
|
2010
2014
|
|
@@ -2025,6 +2029,12 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
|
|
2025
2029
|
# 5. Sort rows: primary by generation, then by variant_id
|
2026
2030
|
df = df.sort_values(["generation", "variant_id"], kind="mergesort")
|
2027
2031
|
|
2032
|
+
# Debug: Log final merge state
|
2033
|
+
seq_count = (~df['aa_seq'].isna()).sum()
|
2034
|
+
log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
|
2035
|
+
if seq_count > 0:
|
2036
|
+
log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
|
2037
|
+
|
2028
2038
|
return df
|
2029
2039
|
|
2030
2040
|
# --- 8.3 Public API -----------------------------------------------------------
|
@@ -2053,6 +2063,10 @@ def merge_and_score(
|
|
2053
2063
|
if missing_rate > 0.5:
|
2054
2064
|
log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))
|
2055
2065
|
|
2066
|
+
# Debug log before returning
|
2067
|
+
seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
|
2068
|
+
log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
|
2069
|
+
|
2056
2070
|
return df
|
2057
2071
|
|
2058
2072
|
# -------------------------------------------------------------------- end 8 ---
|
@@ -2236,6 +2250,14 @@ def run_pipeline(
|
|
2236
2250
|
output_csv_path = Path(output_csv)
|
2237
2251
|
# Save final data with sequences using same filename (overwrites lineage-only)
|
2238
2252
|
sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"
|
2253
|
+
|
2254
|
+
# Debug: Log what we're about to save
|
2255
|
+
seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
|
2256
|
+
log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
|
2257
|
+
if seq_count > 0 and 'aa_seq' in df_final:
|
2258
|
+
with_seq = df_final[~df_final['aa_seq'].isna()]
|
2259
|
+
log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
|
2260
|
+
|
2239
2261
|
df_final.to_csv(sequence_path, index=False)
|
2240
2262
|
log.info(
|
2241
2263
|
"Overwrote with final results -> %s (%.1f kB)",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|