PyPI - debase - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

debase 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

debase/_version.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.1.5"
+__version__ = "0.1.7"

debase/enzyme_lineage_extractor.py CHANGED Viewed

@@ -1912,7 +1912,7 @@ def _infer_generations(variants: List[Variant]) -> None:
 def _merge_lineage_and_sequences(
-    lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str]
+    lineage: List[Variant], seqs: List[SequenceBlock], doi: Optional[str], model=None
 ) -> pd.DataFrame:
     """Return a tidy DataFrame with one row per variant."""
@@ -1943,6 +1943,71 @@ def _merge_lineage_and_sequences(
     # 2. Outer merge keeps every lineage entry and adds sequence cols when present
     df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
+    # 2a. If we have unmatched sequences and a model, use Gemini to match them
+    log.info(f"Model available: {model is not None}, Sequences found: {len(df_seq)}")
+    if model and len(df_seq) > 0:
+        # Log initial state
+        log.info(f"Merge attempt: {len(df_lin)} lineage entries, {len(df_seq)} sequences")
+        log.info(f"Lineage IDs: {df_lin['variant_id'].tolist()[:5]}...")
+        log.info(f"Sequence IDs: {df_seq['variant_id'].tolist()[:5]}...")
+        # Find lineage entries without sequences
+        missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
+        unmatched_lineage = df[missing_seq]['variant_id'].tolist()
+        # Find sequences that weren't matched
+        matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
+        unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]
+        if len(unmatched_lineage) > 0 and len(unmatched_seqs) > 0:
+            log.info(f"Found {len(unmatched_lineage)} lineage entries without sequences and {len(unmatched_seqs)} unmatched sequences")
+            log.info(f"Using Gemini to match variants")
+            # Build prompt for Gemini to match variants
+            prompt = f"""Match enzyme variant IDs between two lists. The same variant may be labeled differently in different parts of the paper.
+Lineage variant IDs (need sequences):
+{json.dumps(unmatched_lineage)}
+Sequence variant IDs (have sequences):
+{json.dumps(unmatched_seqs['variant_id'].tolist())}
+These lists contain variant identifiers from the same paper but may use different naming conventions.
+Match each lineage variant ID to its corresponding sequence variant ID based on any patterns or relationships you can identify.
+Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+"""
+            try:
+                response = model.generate_content(prompt)
+                text = _extract_text(response).strip()
+                # Parse JSON response
+                if text.startswith("```"):
+                    text = text.split("```")[1].strip()
+                    if text.startswith("json"):
+                        text = text[4:].strip()
+                matches = json.loads(text)
+                # Apply the matches
+                for lineage_id, seq_id in matches.items():
+                    if lineage_id in unmatched_lineage and seq_id in unmatched_seqs['variant_id'].values:
+                        # Get the sequence data
+                        seq_data = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id].iloc[0]
+                        # Update the dataframe
+                        mask = df['variant_id'] == lineage_id
+                        df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
+                        df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
+                        df.loc[mask, 'seq_confidence'] = seq_data['seq_confidence']
+                        df.loc[mask, 'truncated'] = seq_data['truncated']
+                        log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
+            except Exception as e:
+                log.warning(f"Failed to match variants using Gemini: {e}")
     # 3. If generation missing after user input, try inference
     if df["generation"].isna().any():
         _infer_generations(lineage)  # mutates in place
@@ -1968,6 +2033,7 @@ def merge_and_score(
     lineage: List[Variant],
     seqs: List[SequenceBlock],
     doi: Optional[str] = None,
+    model=None,
 ) -> pd.DataFrame:
     """User-facing helper imported by the pipeline orchestrator.
@@ -1980,7 +2046,7 @@ def merge_and_score(
         raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")
     # If no sequences found, still build a DataFrame so caller can decide what to do.
-    df = _merge_lineage_and_sequences(lineage, seqs, doi)
+    df = _merge_lineage_and_sequences(lineage, seqs, doi, model)
     # Basic sanity: warn if many missing sequences
     missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
@@ -2163,7 +2229,7 @@ def run_pipeline(
     # 5. Merge & score (Section 8) --------------------------------------------
     doi = extract_doi(manuscript)
-    df_final = merge_and_score(lineage, sequences, doi)
+    df_final = merge_and_score(lineage, sequences, doi, model)
     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.5
+Version: 0.1.7
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/RECORD RENAMED Viewed

@@ -1,17 +1,17 @@
 debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
 debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
 debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=Bj9n2sI-8fEKj8LGa2ZU_dV7G5OnubUV9yK63_ZmeUU,49
+debase/_version.py,sha256=vPJKuOkG3cFKZEVbwdNnidjitp6bll8M7nfNUPqx6YA,49
 debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
 debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=kn3pfPWctiaWC-oaynEOike9MQ-63ApAK1cmoHbTPzU,91159
+debase/enzyme_lineage_extractor.py,sha256=7RWogFelrFcwAUup1MgJuiozjEbBc3ri2F2UV3eTEig,94544
 debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
 debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
 debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
 debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
-debase-0.1.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.1.5.dist-info/METADATA,sha256=1vMeMX3yGLXnnvnp9lN3mTRDCsdqFklE0puYlBemfyE,10789
-debase-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.1.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.1.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.1.5.dist-info/RECORD,,
+debase-0.1.7.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.1.7.dist-info/METADATA,sha256=Dj6i6WqC3QFJxXZyPQT7e27Aq8Hrrclu-44KadmYQJ0,10789
+debase-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.1.7.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.1.7.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.1.7.dist-info/RECORD,,

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.1.5.dist-info → debase-0.1.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

debase 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl