debase 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +69 -3
- {debase-0.1.5.dist-info → debase-0.1.7.dist-info}/METADATA +1 -1
- {debase-0.1.5.dist-info → debase-0.1.7.dist-info}/RECORD +8 -8
- {debase-0.1.5.dist-info → debase-0.1.7.dist-info}/WHEEL +0 -0
- {debase-0.1.5.dist-info → debase-0.1.7.dist-info}/entry_points.txt +0 -0
- {debase-0.1.5.dist-info → debase-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.5.dist-info → debase-0.1.7.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED

debase/enzyme_lineage_extractor.py
CHANGED
@@ -1912,7 +1912,7 @@ def _infer_generations(variants: List[Variant]) -> None:
 
 
 def _merge_lineage_and_sequences(
-    lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str]
+    lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str], model=None
 ) -> pd.DataFrame:
     """Return a tidy DataFrame with one row per variant."""
 
@@ -1943,6 +1943,71 @@ def _merge_lineage_and_sequences(
     # 2. Outer merge keeps every lineage entry and adds sequence cols when present
     df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
 
+    # 2a. If we have unmatched sequences and a model, use Gemini to match them
+    log.info(f"Model available: {model is not None}, Sequences found: {len(df_seq)}")
+    if model and len(df_seq) > 0:
+        # Log initial state
+        log.info(f"Merge attempt: {len(df_lin)} lineage entries, {len(df_seq)} sequences")
+        log.info(f"Lineage IDs: {df_lin['variant_id'].tolist()[:5]}...")
+        log.info(f"Sequence IDs: {df_seq['variant_id'].tolist()[:5]}...")
+
+        # Find lineage entries without sequences
+        missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
+        unmatched_lineage = df[missing_seq]['variant_id'].tolist()
+
+        # Find sequences that weren't matched
+        matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
+        unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]
+
+        if len(unmatched_lineage) > 0 and len(unmatched_seqs) > 0:
+            log.info(f"Found {len(unmatched_lineage)} lineage entries without sequences and {len(unmatched_seqs)} unmatched sequences")
+            log.info(f"Using Gemini to match variants")
+
+            # Build prompt for Gemini to match variants
+            prompt = f"""Match enzyme variant IDs between two lists. The same variant may be labeled differently in different parts of the paper.
+
+Lineage variant IDs (need sequences):
+{json.dumps(unmatched_lineage)}
+
+Sequence variant IDs (have sequences):
+{json.dumps(unmatched_seqs['variant_id'].tolist())}
+
+These lists contain variant identifiers from the same paper but may use different naming conventions.
+Match each lineage variant ID to its corresponding sequence variant ID based on any patterns or relationships you can identify.
+
+Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+"""
+
+            try:
+                response = model.generate_content(prompt)
+                text = _extract_text(response).strip()
+
+                # Parse JSON response
+                if text.startswith("```"):
+                    text = text.split("```")[1].strip()
+                    if text.startswith("json"):
+                        text = text[4:].strip()
+
+                matches = json.loads(text)
+
+                # Apply the matches
+                for lineage_id, seq_id in matches.items():
+                    if lineage_id in unmatched_lineage and seq_id in unmatched_seqs['variant_id'].values:
+                        # Get the sequence data
+                        seq_data = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id].iloc[0]
+
+                        # Update the dataframe
+                        mask = df['variant_id'] == lineage_id
+                        df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
+                        df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
+                        df.loc[mask, 'seq_confidence'] = seq_data['seq_confidence']
+                        df.loc[mask, 'truncated'] = seq_data['truncated']
+
+                        log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
+
+            except Exception as e:
+                log.warning(f"Failed to match variants using Gemini: {e}")
+
     # 3. If generation missing after user input, try inference
     if df["generation"].isna().any():
         _infer_generations(lineage)  # mutates in place
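For context on the block added above: it assumes the model replies with a flat JSON object mapping lineage variant IDs to sequence variant IDs, optionally wrapped in a Markdown code fence. A minimal sketch of that parsing path, outside the diff, using hypothetical variant IDs that are illustrative only:

import json

# Hypothetical reply, as a Gemini-style model might return it (fenced JSON).
reply = '```json\n{"L-G2 variant": "LgV2", "L-G3 variant": "LgV3"}\n```'

text = reply.strip()
if text.startswith("```"):
    text = text.split("```")[1].strip()  # keep only the fenced body
    if text.startswith("json"):
        text = text[4:].strip()          # drop the language tag

matches = json.loads(text)
print(matches)  # {'L-G2 variant': 'LgV2', 'L-G3 variant': 'LgV3'}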
@@ -1968,6 +2033,7 @@ def merge_and_score(
     lineage: List[Variant],
     seqs: List[SequenceBlock],
     doi: Optional[str] = None,
+    model=None,
 ) -> pd.DataFrame:
     """User-facing helper imported by the pipeline orchestrator.
 
@@ -1980,7 +2046,7 @@ def merge_and_score(
         raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")
 
     # If no sequences found, still build a DataFrame so caller can decide what to do.
-    df = _merge_lineage_and_sequences(lineage, seqs, doi)
+    df = _merge_lineage_and_sequences(lineage, seqs, doi, model)
 
     # Basic sanity: warn if many missing sequences
     missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
@@ -2163,7 +2229,7 @@ def run_pipeline(
 
     # 5. Merge & score (Section 8) --------------------------------------------
     doi = extract_doi(manuscript)
-    df_final = merge_and_score(lineage, sequences, doi)
+    df_final = merge_and_score(lineage, sequences, doi, model)
 
     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:
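Taken together, the changes thread an optional model handle from run_pipeline through merge_and_score into _merge_lineage_and_sequences. A rough usage sketch, assuming the google-generativeai client and that lineage/seqs were produced by the module's earlier extraction steps (the API key and model name are placeholders, not taken from this diff):

import google.generativeai as genai
from debase.enzyme_lineage_extractor import merge_and_score

genai.configure(api_key="YOUR_API_KEY")            # placeholder key
model = genai.GenerativeModel("gemini-1.5-flash")  # illustrative model name

# `lineage` (List[Variant]) and `seqs` (List[SequenceBlock]) come from the
# earlier extraction steps; leaving model=None (the default) skips the new
# Gemini-based variant ID matching step.
df = merge_and_score(lineage, seqs, doi=None, model=model)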
{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
 debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
 debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
 debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=
+debase/_version.py,sha256=vPJKuOkG3cFKZEVbwdNnidjitp6bll8M7nfNUPqx6YA,49
 debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
 debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=
+debase/enzyme_lineage_extractor.py,sha256=7RWogFelrFcwAUup1MgJuiozjEbBc3ri2F2UV3eTEig,94544
 debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
 debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
 debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
 debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
+debase-0.1.7.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.1.7.dist-info/METADATA,sha256=Dj6i6WqC3QFJxXZyPQT7e27Aq8Hrrclu-44KadmYQJ0,10789
+debase-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.1.7.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.1.7.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.1.7.dist-info/RECORD,,
{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/WHEEL
File without changes

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/entry_points.txt
File without changes

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/licenses/LICENSE
File without changes

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/top_level.txt
File without changes