PyPI - debase - Versions diffs - 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl - Mend

debase 0.1.9-py3-none-any.whl → 0.1.11-py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (8) hide show

debase/_version.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.1.9"
+__version__ = "0.1.11"

debase/enzyme_lineage_extractor.py CHANGED Viewed

@@ -2024,21 +2024,55 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
                         text = text[4:].strip()
                 matches = json.loads(text)
+                log.info(f"Gemini returned matches: {matches}")
+                # Debug: Log what sequences we actually have
+                log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
                 # Apply the matches
                 for lineage_id, seq_id in matches.items():
-                    if lineage_id in unmatched_lineage and seq_id in unmatched_seqs['variant_id'].values:
-                        # Get the sequence data
-                        seq_data = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id].iloc[0]
+                    if lineage_id in unmatched_lineage:
+                        # Find the sequence data - be flexible with matching
+                        seq_data = None
-                        # Update the dataframe
-                        mask = df['variant_id'] == lineage_id
-                        df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
-                        df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
-                        df.loc[mask, 'seq_confidence'] = seq_data['seq_confidence']
-                        df.loc[mask, 'truncated'] = seq_data['truncated']
+                        # First try exact match
+                        seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
+                        if len(seq_matches) > 0:
+                            seq_data = seq_matches.iloc[0]
+                        else:
+                            # Try to find by checking various matching strategies
+                            for idx, row in unmatched_seqs.iterrows():
+                                variant_id = row['variant_id']
+                                # Check if one is contained in the other
+                                if seq_id in variant_id or variant_id in seq_id:
+                                    seq_data = row
+                                    break
+                                # Check if they share the same core identifier (e.g., G0, G1, etc.)
+                                seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
+                                variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
+                                if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
+                                    seq_data = row
+                                    break
-                        log.info(f"Matched {lineage_id} -> {seq_id} using Gemini")
+                        if seq_data is not None:
+                            # Update the dataframe
+                            mask = df['variant_id'] == lineage_id
+                            if mask.any():
+                                # Log before update
+                                log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
+                                df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
+                                df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
+                                df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
+                                df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+                                # Log after update
+                                log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
+                                log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
+                            else:
+                                log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
+                        else:
+                            log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
                 # Log the final state after all matches
                 matched_count = (~df['aa_seq'].isna()).sum()

{debase-0.1.9.dist-info → debase-0.1.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.9
+Version: 0.1.11
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.1.9.dist-info → debase-0.1.11.dist-info}/RECORD RENAMED Viewed

@@ -1,17 +1,17 @@
 debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
 debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
 debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=j8HFA4JhFiintNU67gau8Re8N3rsxPqodcW8xAgdwqY,49
+debase/_version.py,sha256=L4sqaU-oAJRWrcboH-vA95jHfUiXr5-fAsrF7lqZSyQ,50
 debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
 debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=mHKo6cxQdcAFuthQTpxc4fsGH73JO3VuLXSsixA7mOA,97421
+debase/enzyme_lineage_extractor.py,sha256=at4OYHdXtgMku1FR_6AsHWk64UKInWkGQL9m3H6cKIQ,99809
 debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
 debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
 debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
 debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
-debase-0.1.9.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.1.9.dist-info/METADATA,sha256=QVjGEYd1VWDmQszko8IQ5jJ9xMQoT45SCY_oG9XvbMs,10789
-debase-0.1.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.1.9.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.1.9.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.1.9.dist-info/RECORD,,
+debase-0.1.11.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.1.11.dist-info/METADATA,sha256=ZSR0Yl36Al_rQm9Ht9jut7om3xQT8yqyobIjEUH_Xfo,10790
+debase-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.1.11.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.1.11.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.1.11.dist-info/RECORD,,

{debase-0.1.9.dist-info → debase-0.1.11.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.1.9.dist-info → debase-0.1.11.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.1.9.dist-info → debase-0.1.11.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.1.9.dist-info → debase-0.1.11.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

debase 0.1.9-py3-none-any.whl → 0.1.11-py3-none-any.whl