PyPI - debase - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

debase/_version.py +1 -1
debase/enzyme_lineage_extractor.py +623 -234
debase/lineage_format.py +113 -11
debase/reaction_info_extractor.py +21 -7
debase/substrate_scope_extractor.py +516 -67
debase/wrapper.py +301 -67
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/METADATA +1 -1
debase-0.1.17.dist-info/RECORD +17 -0
debase-0.1.11.dist-info/RECORD +0 -17
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/WHEEL +0 -0
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/entry_points.txt +0 -0
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/licenses/LICENSE +0 -0
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/top_level.txt +0 -0

debase/wrapper.py CHANGED Viewed

@@ -46,101 +46,333 @@ def run_sequence_cleanup(input_csv: Path, output_csv: Path) -> Path:
     """
     Step 2: Clean and validate protein sequences
     Calls: cleanup_sequence.py
+    Returns output path even if cleanup fails (copies input file)
     """
     logger.info(f"Cleaning sequences from {input_csv.name}")
-    from .cleanup_sequence import main as cleanup_sequences
-    cleanup_sequences([str(input_csv), str(output_csv)])
-    logger.info(f"Sequence cleanup complete: {output_csv}")
-    return output_csv
+    try:
+        from .cleanup_sequence import main as cleanup_sequences
+        cleanup_sequences([str(input_csv), str(output_csv)])
+        logger.info(f"Sequence cleanup complete: {output_csv}")
+        return output_csv
+    except Exception as e:
+        logger.warning(f"Sequence cleanup failed: {e}")
+        logger.info("Copying original file to continue pipeline...")
+        # Copy the input file as-is to continue pipeline
+        import shutil
+        shutil.copy2(input_csv, output_csv)
+        logger.info(f"Original file copied: {output_csv}")
+        return output_csv
 def run_reaction_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
     """
     Step 3a: Extract reaction performance metrics
     Calls: reaction_info_extractor.py
+    Returns output path even if extraction fails (creates empty file)
     """
     logger.info(f"Extracting reaction info for enzymes in {lineage_csv.name}")
-    from .reaction_info_extractor import ReactionExtractor, Config
-    import pandas as pd
-    # Load enzyme data
-    enzyme_df = pd.read_csv(lineage_csv)
-    # Initialize extractor and run
-    cfg = Config()
-    extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
-    df_metrics = extractor.run(enzyme_df)
-    # Save results
-    df_metrics.to_csv(output, index=False)
-    logger.info(f"Reaction extraction complete: {output}")
-    return output
+    try:
+        from .reaction_info_extractor import ReactionExtractor, Config
+        import pandas as pd
+        # Load enzyme data
+        enzyme_df = pd.read_csv(lineage_csv)
+        # Initialize extractor and run
+        cfg = Config()
+        extractor = ReactionExtractor(manuscript, si, cfg, debug_dir=debug_dir)
+        df_metrics = extractor.run(enzyme_df)
+        # Save results
+        df_metrics.to_csv(output, index=False)
+        logger.info(f"Reaction extraction complete: {output}")
+        return output
+    except Exception as e:
+        logger.warning(f"Reaction extraction failed: {e}")
+        logger.info("Creating empty reaction info file to continue pipeline...")
+        # Create empty reaction CSV with basic columns
+        import pandas as pd
+        empty_df = pd.DataFrame(columns=[
+            'enzyme', 'substrate', 'product', 'yield_percent', 'ee_percent',
+            'conversion_percent', 'reaction_type', 'reaction_conditions', 'notes'
+        ])
+        empty_df.to_csv(output, index=False)
+        logger.info(f"Empty reaction file created: {output}")
+        return output
 def run_substrate_scope_extraction(manuscript: Path, si: Path, lineage_csv: Path, output: Path, debug_dir: Path = None) -> Path:
     """
     Step 3b: Extract substrate scope data (runs in parallel with reaction extraction)
     Calls: substrate_scope_extractor.py
+    Returns output path even if extraction fails (creates empty file)
     """
     logger.info(f"Extracting substrate scope for enzymes in {lineage_csv.name}")
-    from .substrate_scope_extractor import run_pipeline
+    try:
+        from .substrate_scope_extractor import run_pipeline
+        # Run substrate scope extraction
+        run_pipeline(
+            manuscript=manuscript,
+            si=si,
+            lineage_csv=lineage_csv,
+            output_csv=output,
+            debug_dir=debug_dir
+        )
+        logger.info(f"Substrate scope extraction complete: {output}")
+        return output
+    except Exception as e:
+        logger.warning(f"Substrate scope extraction failed: {e}")
+        logger.info("Creating empty substrate scope file to continue pipeline...")
+        # Create empty substrate scope CSV with proper headers
+        import pandas as pd
+        empty_df = pd.DataFrame(columns=[
+            'enzyme', 'substrate', 'product', 'yield_percent', 'ee_percent',
+            'conversion_percent', 'selectivity', 'reaction_conditions', 'notes'
+        ])
+        empty_df.to_csv(output, index=False)
+        logger.info(f"Empty substrate scope file created: {output}")
+        return output
+def match_enzyme_variants_with_gemini(lineage_enzymes: list, data_enzymes: list, model=None) -> dict:
+    """
+    Use Gemini to match enzyme variant IDs between different datasets.
+    Returns a mapping of data_enzyme_id -> lineage_enzyme_id.
+    """
+    import json
-    # Run substrate scope extraction
-    run_pipeline(
-        manuscript=manuscript,
-        si=si,
-        lineage_csv=lineage_csv,
-        output_csv=output,
-        debug_dir=debug_dir
-    )
+    if not model:
+        try:
+            from .enzyme_lineage_extractor import get_model
+            model = get_model()
+        except:
+            logger.warning("Could not load Gemini model for variant matching")
+            return {}
-    logger.info(f"Substrate scope extraction complete: {output}")
-    return output
+    prompt = f"""Match enzyme variant IDs between two lists from the same scientific paper.
+These lists come from different sections or analyses of the same study, but may use different naming conventions.
+List 1 (from lineage/sequence data):
+{json.dumps(lineage_enzymes)}
+List 2 (from experimental data):
+{json.dumps(data_enzymes)}
+Analyze the patterns and match variants that refer to the same enzyme.
+Return ONLY a JSON object mapping IDs from List 2 to their corresponding IDs in List 1.
+Format: {{"list2_id": "list1_id", ...}}
+Only include matches you are confident about based on the naming patterns.
+"""
+    try:
+        response = model.generate_content(prompt)
+        mapping_text = response.text.strip()
+        # Extract JSON from response
+        if '```json' in mapping_text:
+            mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
+        elif '```' in mapping_text:
+            mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
+        mapping = json.loads(mapping_text)
+        logger.info(f"Gemini matched {len(mapping)} enzyme variants")
+        for k, v in mapping.items():
+            logger.info(f"  Matched '{k}' -> '{v}'")
+        return mapping
+    except Exception as e:
+        logger.warning(f"Failed to match variants with Gemini: {e}")
+        return {}
 def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_csv: Path, output_csv: Path) -> Path:
     """
     Step 4: Format and merge all data into final CSV
-    Calls: lineage_format.py
+    Creates comprehensive format merging all available data, even if some extraction steps failed
     """
     logger.info(f"Formatting and merging data into final output")
-    from .lineage_format import run_pipeline
-    import pandas as pd
-    # First, we need to merge the protein sequences into the reaction data
-    df_reaction = pd.read_csv(reaction_csv)
-    df_sequences = pd.read_csv(cleaned_csv)
-    # Merge sequences into reaction data
-    # Include generation and parent info for proper mutation calculation
-    sequence_cols = ['protein_sequence', 'dna_seq', 'seq_confidence', 'truncated', 'flag',
-                     'generation', 'parent_enzyme_id', 'mutations']
-    sequence_data = df_sequences[['enzyme_id'] + [col for col in sequence_cols if col in df_sequences.columns]]
-    # Merge on enzyme_id or variant_id
-    if 'enzyme_id' in df_reaction.columns:
-        df_reaction = df_reaction.merge(sequence_data, on='enzyme_id', how='left', suffixes=('', '_seq'))
-    elif 'enzyme' in df_reaction.columns:
-        sequence_data = sequence_data.rename(columns={'enzyme_id': 'enzyme'})
-        df_reaction = df_reaction.merge(sequence_data, on='enzyme', how='left', suffixes=('', '_seq'))
-    # Save the merged reaction data
-    df_reaction.to_csv(reaction_csv, index=False)
-    # Run the formatting pipeline
-    df_final = run_pipeline(
-        reaction_csv=reaction_csv,
-        substrate_scope_csv=substrate_scope_csv,
-        output_csv=output_csv
-    )
-    logger.info(f"Final formatting complete: {output_csv}")
-    return output_csv
+    try:
+        import pandas as pd
+        # Read all available data files
+        logger.info("Reading enzyme lineage data...")
+        df_lineage = pd.read_csv(cleaned_csv)
+        logger.info("Reading reaction data...")
+        try:
+            df_reaction = pd.read_csv(reaction_csv)
+            has_reaction_data = len(df_reaction) > 0 and not df_reaction.empty
+        except:
+            df_reaction = pd.DataFrame()
+            has_reaction_data = False
+        logger.info("Reading substrate scope data...")
+        try:
+            df_scope = pd.read_csv(substrate_scope_csv)
+            has_scope_data = len(df_scope) > 0 and not df_scope.empty
+        except:
+            df_scope = pd.DataFrame()
+            has_scope_data = False
+        # Start with lineage data as base
+        df_final = df_lineage.copy()
+        # Ensure consistent enzyme ID column
+        if 'variant_id' in df_final.columns and 'enzyme_id' not in df_final.columns:
+            df_final = df_final.rename(columns={'variant_id': 'enzyme_id'})
+        # Merge reaction data if available
+        if has_reaction_data:
+            logger.info(f"Merging reaction data ({len(df_reaction)} records)")
+            # Match on enzyme_id or enzyme
+            merge_key = 'enzyme_id' if 'enzyme_id' in df_reaction.columns else 'enzyme'
+            if merge_key in df_reaction.columns:
+                df_final = df_final.merge(df_reaction, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_reaction'))
+        else:
+            logger.info("No reaction data available")
+        # Merge substrate scope data if available
+        if has_scope_data:
+            logger.info(f"Merging substrate scope data ({len(df_scope)} records)")
+            merge_key = 'enzyme_id' if 'enzyme_id' in df_scope.columns else 'enzyme'
+            if merge_key in df_scope.columns:
+                # First try direct merge
+                df_test_merge = df_final.merge(df_scope, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
+                # Check if any matches were found
+                matched_count = df_test_merge[merge_key + '_scope'].notna().sum() if merge_key + '_scope' in df_test_merge.columns else 0
+                if matched_count == 0:
+                    logger.info("No direct matches found, using Gemini to match enzyme variants...")
+                    # Get unique enzyme IDs from both datasets
+                    lineage_enzymes = df_final['enzyme_id'].dropna().unique().tolist()
+                    scope_enzymes = df_scope[merge_key].dropna().unique().tolist()
+                    # Get mapping from Gemini
+                    mapping = match_enzyme_variants_with_gemini(lineage_enzymes, scope_enzymes)
+                    if mapping:
+                        # Apply mapping to scope data
+                        df_scope_mapped = df_scope.copy()
+                        df_scope_mapped[merge_key] = df_scope_mapped[merge_key].map(lambda x: mapping.get(x, x))
+                        df_final = df_final.merge(df_scope_mapped, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
+                    else:
+                        logger.warning("Could not match enzyme variants between datasets")
+                        df_final = df_test_merge
+                else:
+                    df_final = df_test_merge
+                    logger.info(f"Direct merge matched {matched_count} records")
+        else:
+            logger.info("No substrate scope data available")
+        # Add comprehensive column structure for missing data
+        essential_columns = [
+            'enzyme_id', 'parent_id', 'generation', 'mutations', 'campaign_id', 'notes',
+            'aa_seq', 'dna_seq', 'seq_confidence', 'truncated', 'seq_source', 'doi',
+            'substrate_list', 'substrate_iupac_list', 'product_list', 'product_iupac_list',
+            'cofactor_list', 'cofactor_iupac_list', 'yield', 'ee', 'ttn',
+            'reaction_temperature', 'reaction_ph', 'reaction_buffer', 'reaction_other_conditions',
+            'data_location'
+        ]
+        # Add missing columns with NaN
+        for col in essential_columns:
+            if col not in df_final.columns:
+                df_final[col] = None
+        # Clean up duplicate columns from merging
+        columns_to_keep = []
+        seen_base_names = set()
+        for col in df_final.columns:
+            base_name = col.split('_reaction')[0].split('_scope')[0]
+            if base_name not in seen_base_names:
+                columns_to_keep.append(col)
+                seen_base_names.add(base_name)
+            elif col.endswith('_scope') or col.endswith('_reaction'):
+                # Prefer scope or reaction data over base lineage data for certain columns
+                if base_name in ['substrate_list', 'product_list', 'yield', 'ee', 'reaction_temperature']:
+                    columns_to_keep.append(col)
+                    # Remove the base column if it exists
+                    if base_name in columns_to_keep:
+                        columns_to_keep.remove(base_name)
+                    seen_base_names.add(base_name)
+        df_final = df_final[columns_to_keep]
+        # Rename merged columns back to standard names
+        rename_map = {}
+        for col in df_final.columns:
+            if col.endswith('_scope') or col.endswith('_reaction'):
+                base_name = col.split('_scope')[0].split('_reaction')[0]
+                rename_map[col] = base_name
+        df_final = df_final.rename(columns=rename_map)
+        # Save the comprehensive final output
+        df_final.to_csv(output_csv, index=False)
+        logger.info(f"Final comprehensive format complete: {output_csv}")
+        logger.info(f"Final output contains {len(df_final)} variants with {len(df_final.columns)} data columns")
+        # Log what data was successfully merged
+        if has_reaction_data:
+            logger.info("✓ Reaction performance data merged")
+        if has_scope_data:
+            logger.info("✓ Substrate scope data merged")
+        # Now run the actual lineage format to produce plate-based format
+        logger.info("\nRunning lineage format to produce plate-based output...")
+        try:
+            from .lineage_format import flatten_dataframe
+            # Create the plate-based output filename
+            plate_output = output_csv.parent / (output_csv.stem + "_plate_format.csv")
+            # Flatten the dataframe to plate format
+            df_flattened = flatten_dataframe(df_final)
+            # Save the flattened output
+            df_flattened.to_csv(plate_output, index=False)
+            logger.info(f"✓ Plate-based format saved to: {plate_output}")
+            logger.info(f"  Contains {len(df_flattened)} rows with plate/well assignments")
+            # Update the final output path to be the plate format
+            output_csv = plate_output
+        except Exception as e:
+            logger.warning(f"Could not generate plate-based format: {e}")
+            logger.info("Comprehensive format will be used as final output")
+        return output_csv
+    except Exception as e:
+        logger.warning(f"Final formatting failed: {e}")
+        logger.info("Using cleaned sequence data as final output...")
+        # Copy the cleaned CSV as the final output
+        import shutil
+        shutil.copy2(cleaned_csv, output_csv)
+        logger.info(f"Cleaned sequence file used as final output: {output_csv}")
+        return output_csv
 def run_pipeline(
@@ -206,7 +438,7 @@ def run_pipeline(
         # Step 4: Format and merge
         logger.info("\n[Step 4/5] Formatting and merging data...")
-        run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
+        final_output = run_lineage_format(reaction_csv, substrate_csv, cleaned_csv, output_path)
         # Step 5: Finalize
         logger.info("\n[Step 5/5] Finalizing...")
@@ -219,11 +451,13 @@ def run_pipeline(
         logger.info("\n" + "="*60)
         logger.info("PIPELINE COMPLETED SUCCESSFULLY")
-        logger.info(f"Output: {output_path}")
+        logger.info(f"Comprehensive output: {output_path}")
+        if final_output != output_path:
+            logger.info(f"Plate-based output: {final_output}")
         logger.info(f"Runtime: {elapsed:.1f} seconds")
         logger.info("="*60)
-        return output_path
+        return final_output
     except Exception as e:
         logger.error(f"Pipeline failed: {str(e)}")

{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.11
+Version: 0.1.17
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

debase-0.1.17.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,17 @@
+debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=edeF0ciTSBytkIGNcNjx3UR4nAs3QzF_Lmmyr66k0Jc,50
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
+debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
+debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
+debase/reaction_info_extractor.py,sha256=NjOXZf22i3PvYpCgk9DCnswCbgmCQkj5V2-E21LEM6M,112876
+debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
+debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
+debase-0.1.17.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.1.17.dist-info/METADATA,sha256=uCGXpNG7dIVZtpywd8V7kBcXuWHPyTjhJmH0mWKD7Ew,10790
+debase-0.1.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.1.17.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.1.17.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.1.17.dist-info/RECORD,,

debase-0.1.11.dist-info/RECORD DELETED Viewed

@@ -1,17 +0,0 @@
-debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=L4sqaU-oAJRWrcboH-vA95jHfUiXr5-fAsrF7lqZSyQ,50
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=at4OYHdXtgMku1FR_6AsHWk64UKInWkGQL9m3H6cKIQ,99809
-debase/lineage_format.py,sha256=mACni9M1RXA_1tIyDZJpStQoutd_HLG2qQMAORTusZs,30045
-debase/reaction_info_extractor.py,sha256=6wWj4IyUNSugNjxpwMGjABSAp68yHABaz_7ZRjh9GEk,112162
-debase/substrate_scope_extractor.py,sha256=dbve8q3K7ggA3A6EwB-KK9L19BnMNgPZMZ05G937dSY,82262
-debase/wrapper.py,sha256=lTx375a57EVuXcZ_roXaj5UDj8HjRcb5ViNaSgPN4Ik,10352
-debase-0.1.11.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.1.11.dist-info/METADATA,sha256=ZSR0Yl36Al_rQm9Ht9jut7om3xQT8yqyobIjEUH_Xfo,10790
-debase-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.1.11.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.1.11.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.1.11.dist-info/RECORD,,

{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl