debase 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.1.5"
3
+ __version__ = "0.1.6"
@@ -1912,7 +1912,7 @@ def _infer_generations(variants: List[Variant]) -> None:
1912
1912
 
1913
1913
 
1914
1914
  def _merge_lineage_and_sequences(
1915
- lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str]
1915
+ lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str], model=None
1916
1916
  ) -> pd.DataFrame:
1917
1917
  """Return a tidy DataFrame with one row per variant."""
1918
1918
 
@@ -1943,6 +1943,65 @@ def _merge_lineage_and_sequences(
1943
1943
  # 2. Left merge keeps every lineage entry and adds sequence cols when present
1944
1944
  df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
1945
1945
 
1946
+ # 2a. If we have unmatched sequences and a model, use Gemini to match them
1947
+ if model and len(df_seq) > 0:
1948
+ # Find lineage entries without sequences
1949
+ missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
1950
+ unmatched_lineage = df[missing_seq]['variant_id'].tolist()
1951
+
1952
+ # Find sequences that weren't matched
1953
+ matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
1954
+ unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]
1955
+
1956
+ if len(unmatched_lineage) > 0 and len(unmatched_seqs) > 0:
1957
+ log.info(f"Found {len(unmatched_lineage)} lineage entries without sequences and {len(unmatched_seqs)} unmatched sequences")
1958
+ log.info(f"Using Gemini to match variants")
1959
+
1960
+ # Build prompt for Gemini to match variants
1961
+ prompt = f"""Match enzyme variant IDs between two lists. The same variant may be labeled differently in different parts of the paper.
1962
+
1963
+ Lineage variant IDs (need sequences):
1964
+ {json.dumps(unmatched_lineage)}
1965
+
1966
+ Sequence variant IDs (have sequences):
1967
+ {json.dumps(unmatched_seqs['variant_id'].tolist())}
1968
+
1969
+ These lists contain variant identifiers from the same paper but may use different naming conventions.
1970
+ Match each lineage variant ID to its corresponding sequence variant ID based on any patterns or relationships you can identify.
1971
+
1972
+ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
1973
+ """
1974
+
1975
+ try:
1976
+ response = model.generate_content(prompt)
1977
+ text = _extract_text(response).strip()
1978
+
1979
+ # Parse JSON response
1980
+ if text.startswith("```"):
1981
+ text = text.split("```")[1].strip()
1982
+ if text.startswith("json"):
1983
+ text = text[4:].strip()
1984
+
1985
+ matches = json.loads(text)
1986
+
1987
+ # Apply the matches
1988
+ for lineage_id, seq_id in matches.items():
1989
+ if lineage_id in unmatched_lineage and seq_id in unmatched_seqs['variant_id'].values:
1990
+ # Get the sequence data
1991
+ seq_data = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id].iloc[0]
1992
+
1993
+ # Update the dataframe
1994
+ mask = df['variant_id'] == lineage_id
1995
+ df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
1996
+ df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
1997
+ df.loc[mask, 'seq_confidence'] = seq_data['seq_confidence']
1998
+ df.loc[mask, 'truncated'] = seq_data['truncated']
1999
+
2000
+ log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
2001
+
2002
+ except Exception as e:
2003
+ log.warning(f"Failed to match variants using Gemini: {e}")
2004
+
1946
2005
  # 3. If generation missing after user input, try inference
1947
2006
  if df["generation"].isna().any():
1948
2007
  _infer_generations(lineage) # mutates in place
@@ -1968,6 +2027,7 @@ def merge_and_score(
1968
2027
  lineage: List[Variant],
1969
2028
  seqs: List[SequenceBlock],
1970
2029
  doi: Optional[str] = None,
2030
+ model=None,
1971
2031
  ) -> pd.DataFrame:
1972
2032
  """User-facing helper imported by the pipeline orchestrator.
1973
2033
 
@@ -1980,7 +2040,7 @@ def merge_and_score(
1980
2040
  raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")
1981
2041
 
1982
2042
  # If no sequences found, still build a DataFrame so caller can decide what to do.
1983
- df = _merge_lineage_and_sequences(lineage, seqs, doi)
2043
+ df = _merge_lineage_and_sequences(lineage, seqs, doi, model)
1984
2044
 
1985
2045
  # Basic sanity: warn if many missing sequences
1986
2046
  missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
@@ -2163,7 +2223,7 @@ def run_pipeline(
2163
2223
 
2164
2224
  # 5. Merge & score (Section 8) --------------------------------------------
2165
2225
  doi = extract_doi(manuscript)
2166
- df_final = merge_and_score(lineage, sequences, doi)
2226
+ df_final = merge_and_score(lineage, sequences, doi, model)
2167
2227
 
2168
2228
  # 6. Write FINAL CSV -------------------------------------------------------
2169
2229
  if output_csv:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -1,17 +1,17 @@
1
1
  debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
2
  debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
3
  debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=Bj9n2sI-8fEKj8LGa2ZU_dV7G5OnubUV9yK63_ZmeUU,49
4
+ debase/_version.py,sha256=0-ypEiBe2kUuZ71tMon-TVN7tQNbXA_-yM2NzlyMWuk,49
5
5
  debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
6
  debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
- debase/enzyme_lineage_extractor.py,sha256=kn3pfPWctiaWC-oaynEOike9MQ-63ApAK1cmoHbTPzU,91159
7
+ debase/enzyme_lineage_extractor.py,sha256=_kVsuOXOR8qhrOIy3mKNJuac3joK6goke628nAJoj88,94183
8
8
  debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
9
9
  debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
10
10
  debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
11
11
  debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
12
- debase-0.1.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.5.dist-info/METADATA,sha256=1vMeMX3yGLXnnvnp9lN3mTRDCsdqFklE0puYlBemfyE,10789
14
- debase-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.5.dist-info/RECORD,,
12
+ debase-0.1.6.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
+ debase-0.1.6.dist-info/METADATA,sha256=mLzMY4LQx3SoOsirAiKhcqMofiMZ_D7jjB68ohUplFI,10789
14
+ debase-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ debase-0.1.6.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
+ debase-0.1.6.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
+ debase-0.1.6.dist-info/RECORD,,
File without changes