debase 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +85 -16
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/METADATA +1 -1
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/RECORD +8 -8
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/WHEEL +0 -0
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/entry_points.txt +0 -0
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.8.dist-info → debase-0.1.11.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED
debase/enzyme_lineage_extractor.py
CHANGED
@@ -1562,11 +1562,24 @@ TEXT (may be truncated):
 ```
 """.strip()
 
-def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
+def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
     """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
-    prompt = _SEQ_EXTRACTION_PROMPT.format(
+    base_prompt = _SEQ_EXTRACTION_PROMPT.format(
         schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
     )
+
+    # Add lineage context if available
+    if lineage_context:
+        prompt = f"""{base_prompt}
+
+IMPORTANT CONTEXT - Known variants from lineage extraction:
+{lineage_context}
+
+Match sequences to these known variants when possible. Variants may be labeled differently in different sections (e.g., "5295" might also appear as "ʟ-G0", "ʟ-ApPgb-αEsA-G0", or "ʟ-ApPgb-αEsA-G0 (5295)").
+"""
+    else:
+        prompt = base_prompt
+
     data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
     return _parse_sequences(data)
 
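Note: a minimal sketch of what the new lineage_context branch produces; the base prompt and variant IDs below are placeholders, not the real _SEQ_EXTRACTION_PROMPT or paper data.

    # Placeholder values; _SEQ_EXTRACTION_PROMPT and real variant IDs are not shown here.
    base_prompt = "Extract protein/DNA sequences as JSON.\n\nTEXT:\n<paper text>"
    lineage_context = "- ʟ-ApPgb-αEsA-G0 (Gen 0)\n- ʟ-ApPgb-αEsA-G1 (Gen 1) [V53A, L67M]"

    if lineage_context:
        prompt = f"""{base_prompt}

    IMPORTANT CONTEXT - Known variants from lineage extraction:
    {lineage_context}

    Match sequences to these known variants when possible.
    """
    else:
        prompt = base_prompt

    print(prompt)  # the augmented prompt text that would go to the model call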
@@ -1620,7 +1633,7 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
     return blocks
 
 # --- 7.5 Convenience wrapper -------------------------------------------------
-def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None) -> list[SequenceBlock]:
+def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
     # Phase 1: Identify where sequences might be located
     locations = identify_sequence_locations(text, model, debug_dir=debug_dir)
 
@@ -1685,14 +1698,36 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
         if focused_text and len(focused_text) < len(text):
             log.info("Reduced text from %d to %d chars using validated location",
                      len(text), len(focused_text))
-            return extract_sequences(focused_text, model, debug_dir=debug_dir)
+            # Build lineage context if available
+            lineage_context = None
+            if lineage_variants:
+                variant_info = []
+                for v in lineage_variants[:20]:  # Limit to first 20
+                    info = f"- {v.variant_id} (Gen {v.generation})"
+                    if v.mutations:
+                        info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
+                    variant_info.append(info)
+                lineage_context = "\n".join(variant_info)
+
+            return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
         else:
             log.warning("Location validation failed or returned invalid location: %s",
                         validation.get("reason", "Unknown"))
 
     # Fallback to full text
     log.info("Using full text for sequence extraction")
-    return extract_sequences(text, model, debug_dir=debug_dir)
+    # Build lineage context if available
+    lineage_context = None
+    if lineage_variants:
+        variant_info = []
+        for v in lineage_variants[:20]:  # Limit to first 20
+            info = f"- {v.variant_id} (Gen {v.generation})"
+            if v.mutations:
+                info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
+            variant_info.append(info)
+        lineage_context = "\n".join(variant_info)
+
+    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
 
 # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
 """When no sequences are found in the paper, attempt to fetch them from PDB."""
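The context-building block is duplicated verbatim in the focused-text and full-text branches above. A standalone sketch of the same formatting logic; the Variant dataclass here is a simplified stand-in for the extractor's own lineage model, not its actual definition:

    from dataclasses import dataclass, field

    @dataclass
    class Variant:  # simplified stand-in for the extractor's own Variant model
        variant_id: str
        generation: int
        mutations: list[str] = field(default_factory=list)

    def build_lineage_context(lineage_variants: list[Variant], limit: int = 20) -> str | None:
        """One '- variant_id (Gen n) [mut1, mut2, mut3...]' line per variant, capped at `limit`."""
        if not lineage_variants:
            return None
        variant_info = []
        for v in lineage_variants[:limit]:
            info = f"- {v.variant_id} (Gen {v.generation})"
            if v.mutations:
                info += f" [{', '.join(v.mutations[:3])}{'...' if len(v.mutations) > 3 else ''}]"
            variant_info.append(info)
        return "\n".join(variant_info)

    print(build_lineage_context([Variant("ʟ-ApPgb-αEsA-G0", 0, ["V53A", "L67M", "F89W", "T102S"])]))
    # - ʟ-ApPgb-αEsA-G0 (Gen 0) [V53A, L67M, F89W...]

Factoring the block into a helper like this would avoid maintaining two copies of the same formatting code.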
@@ -1989,21 +2024,55 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
             text = text[4:].strip()
 
         matches = json.loads(text)
+        log.info(f"Gemini returned matches: {matches}")
+
+        # Debug: Log what sequences we actually have
+        log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
 
         # Apply the matches
         for lineage_id, seq_id in matches.items():
-            if lineage_id in unmatched_lineage
-                #
-                seq_data =
+            if lineage_id in unmatched_lineage:
+                # Find the sequence data - be flexible with matching
+                seq_data = None
 
-                #
-
-
-
-
+                # First try exact match
+                seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
+                if len(seq_matches) > 0:
+                    seq_data = seq_matches.iloc[0]
+                else:
+                    # Try to find by checking various matching strategies
+                    for idx, row in unmatched_seqs.iterrows():
+                        variant_id = row['variant_id']
+                        # Check if one is contained in the other
+                        if seq_id in variant_id or variant_id in seq_id:
+                            seq_data = row
+                            break
+                        # Check if they share the same core identifier (e.g., G0, G1, etc.)
+                        seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
+                        variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
+                        if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
+                            seq_data = row
+                            break
 
-
+                if seq_data is not None:
+                    # Update the dataframe
+                    mask = df['variant_id'] == lineage_id
+                    if mask.any():
+                        # Log before update
+                        log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
+
+                        df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
+                        df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
+                        df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
+                        df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+
+                        # Log after update
+                        log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
+                        log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
+                    else:
+                        log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
+                else:
+                    log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
 
     # Log the final state after all matches
     matched_count = (~df['aa_seq'].isna()).sum()
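The fallback matching added above reduces to two heuristics: substring containment between the two IDs, and a shared leading G<number> core identifier extracted with the regex G\d+(?:-\d+)?. A self-contained sketch of the same logic on bare ID strings; the function name and example IDs are illustrative only:

    import re

    def ids_probably_match(seq_id: str, variant_id: str) -> bool:
        # Strategy 1: one identifier contained in the other
        if seq_id in variant_id or variant_id in seq_id:
            return True
        # Strategy 2: both share the same leading G<number> core identifier
        seq_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
        var_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
        return bool(seq_parts and var_parts and seq_parts[0] == var_parts[0])

    print(ids_probably_match("ʟ-G0", "ʟ-ApPgb-αEsA-G0"))  # True: both yield core id "G0"
    print(ids_probably_match("5295", "ʟ-ApPgb-αEsA-G0"))   # False: no substring or G<number> overlap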
@@ -2184,7 +2253,7 @@ def run_pipeline(
     )
 
     # 4. Extract sequences (Section 7) ----------------------------------------
-    sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+    sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
 
     # 4a. Try PDB extraction if no sequences found -----------------------------
     # Check if we need PDB sequences (no sequences or only partial sequences)
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/RECORD
RENAMED
@@ -1,17 +1,17 @@
 debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
 debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
 debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=
+debase/_version.py,sha256=L4sqaU-oAJRWrcboH-vA95jHfUiXr5-fAsrF7lqZSyQ,50
 debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
 debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=
+debase/enzyme_lineage_extractor.py,sha256=at4OYHdXtgMku1FR_6AsHWk64UKInWkGQL9m3H6cKIQ,99809
 debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
 debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
 debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
 debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
-debase-0.1.
+debase-0.1.11.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.1.11.dist-info/METADATA,sha256=ZSR0Yl36Al_rQm9Ht9jut7om3xQT8yqyobIjEUH_Xfo,10790
+debase-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.1.11.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.1.11.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.1.11.dist-info/RECORD,,
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/WHEEL
File without changes
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/entry_points.txt
File without changes
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/licenses/LICENSE
File without changes
{debase-0.1.8.dist-info → debase-0.1.11.dist-info}/top_level.txt
File without changes