debase-0.4.3-py3-none-any.whl → debase-0.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +512 -33
- debase/enzyme_lineage_extractor.py +977 -97
- debase/lineage_format.py +221 -12
- debase/reaction_info_extractor.py +133 -23
- debase/substrate_scope_extractor.py +49 -2
- debase/wrapper.py +155 -151
- debase-0.4.4.dist-info/METADATA +121 -0
- debase-0.4.4.dist-info/RECORD +16 -0
- debase-0.4.3.dist-info/METADATA +0 -296
- debase-0.4.3.dist-info/RECORD +0 -16
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/WHEEL +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/entry_points.txt +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/top_level.txt +0 -0
debase/wrapper.py
CHANGED
@@ -75,6 +75,48 @@ def reset_token_usage():
         module_data['output'] = 0
         module_data['calls'] = 0

+def save_token_usage_to_csv(manuscript_path: Path, input_tokens: int, output_tokens: int, cost: float, runtime: float, output_dir: Path):
+    """Save token usage and cost to CSV with naming format: price_manuscriptname.csv"""
+    import pandas as pd
+
+    # Create filename: price_manuscriptname.csv
+    manuscript_name = manuscript_path.stem.replace(' ', '_').replace('-', '_')
+    csv_filename = f"price_{manuscript_name}.csv"
+    csv_path = output_dir / csv_filename
+
+    # Prepare the data
+    data = {
+        'manuscript_name': [manuscript_name],
+        'timestamp': [datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
+        'input_tokens': [input_tokens],
+        'output_tokens': [output_tokens],
+        'total_tokens': [input_tokens + output_tokens],
+        'estimated_cost_usd': [cost],
+        'runtime_seconds': [runtime]
+    }
+
+    # Add module breakdown
+    with _token_lock:
+        for module_name, usage in _token_usage['calls_by_module'].items():
+            if usage['calls'] > 0:
+                data[f'{module_name}_calls'] = [usage['calls']]
+                data[f'{module_name}_input_tokens'] = [usage['input']]
+                data[f'{module_name}_output_tokens'] = [usage['output']]
+                module_cost = (usage['input'] / 1_000_000) * 0.30 + (usage['output'] / 1_000_000) * 2.50
+                data[f'{module_name}_cost_usd'] = [module_cost]
+            else:
+                data[f'{module_name}_calls'] = [0]
+                data[f'{module_name}_input_tokens'] = [0]
+                data[f'{module_name}_output_tokens'] = [0]
+                data[f'{module_name}_cost_usd'] = [0.0]
+
+    # Create DataFrame and save
+    df = pd.DataFrame(data)
+    df.to_csv(csv_path, index=False)
+
+    logger.info(f"Token usage saved to: {csv_path}")
+    return csv_path
+

 def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir: Path = None) -> Path:
     """
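The new `save_token_usage_to_csv` helper prices each module's usage at fixed per-million-token rates ($0.30 input, $2.50 output, hardcoded above). A minimal worked example of that arithmetic, with made-up token counts:

```python
# Worked example of the module cost formula from save_token_usage_to_csv.
# Rates are the ones hardcoded in the diff; the token counts are invented.
input_tokens, output_tokens = 1_000_000, 100_000
cost = (input_tokens / 1_000_000) * 0.30 + (output_tokens / 1_000_000) * 2.50
print(f"estimated cost: ${cost:.2f}")  # estimated cost: $0.55
```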
@@ -277,175 +319,104 @@ Only include matches you are confident about based on the naming patterns.
 def run_lineage_format(reaction_csv: Path, substrate_scope_csv: Path, cleaned_csv: Path, output_csv: Path) -> Path:
     """
     Step 4: Format and merge all data into final CSV
-
+    Uses lineage_format module to normalize data, convert IUPAC to SMILES, fill missing sequences,
+    and create the plate format output
     """
-    logger.info(f"Formatting and merging data into final output")
+    logger.info(f"Formatting and merging data into final plate format output")

     try:
+        from . import lineage_format
         import pandas as pd

-        #
-
-
+        # Check which files have data
+        has_reaction_data = False
+        has_scope_data = False

-        logger.info("Reading reaction data...")
         try:
             df_reaction = pd.read_csv(reaction_csv)
-            has_reaction_data = len(df_reaction) > 0
-
-
-
-
-        logger.info("Reading substrate scope data...")
+            has_reaction_data = len(df_reaction) > 0
+            logger.info(f"Reaction data has {len(df_reaction)} entries")
+        except Exception as e:
+            logger.info(f"No reaction data available: {e}")
+
         try:
             df_scope = pd.read_csv(substrate_scope_csv)
-            has_scope_data = len(df_scope) > 0
-
-
-
-
-        #
-
-
-        #
-
-
-
-        #
-        if has_reaction_data:
-            logger.
-
-
-
-
-
-
+            has_scope_data = len(df_scope) > 0
+            logger.info(f"Substrate scope data has {len(df_scope)} entries")
+        except Exception as e:
+            logger.info(f"No substrate scope data available: {e}")
+
+        # Use lineage_format's run_pipeline to process the data
+        logger.info("Running lineage format pipeline to create plate format...")
+
+        # The lineage_format expects string paths
+        reaction_path = str(reaction_csv) if has_reaction_data else None
+        scope_path = str(substrate_scope_csv) if has_scope_data else None
+
+        # If neither file has data, just copy the cleaned file
+        if not has_reaction_data and not has_scope_data:
+            logger.warning("No data to process in either reaction or substrate scope files")
+            import shutil
+            shutil.copy2(cleaned_csv, output_csv)
+            return output_csv
+
+        # Call lineage_format's run_pipeline function
+        # This will handle all the processing including:
+        # - Merging reaction and substrate scope data
+        # - Filling missing sequences
+        # - Converting IUPAC names to SMILES
+        # - Creating the flattened plate format
+        logger.info("Calling lineage_format.run_pipeline...")
+
+        # Run the pipeline and get the formatted dataframe
+        df = lineage_format.run_pipeline(
+            reaction_csv=reaction_path,
+            substrate_scope_csv=scope_path,
+            output_csv=str(output_csv)
+        )

-
-
-
-        merge_key = 'enzyme_id' if 'enzyme_id' in df_scope.columns else 'enzyme'
-
-        if merge_key in df_scope.columns:
-            # First try direct merge
-            df_test_merge = df_final.merge(df_scope, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
-
-            # Check if any matches were found
-            matched_count = df_test_merge[merge_key + '_scope'].notna().sum() if merge_key + '_scope' in df_test_merge.columns else 0
-
-            if matched_count == 0:
-                logger.info("No direct matches found, using Gemini to match enzyme variants...")
-
-                # Get unique enzyme IDs from both datasets
-                lineage_enzymes = df_final['enzyme_id'].dropna().unique().tolist()
-                scope_enzymes = df_scope[merge_key].dropna().unique().tolist()
-
-                # Get mapping from Gemini
-                mapping = match_enzyme_variants_with_gemini(lineage_enzymes, scope_enzymes)
-
-                if mapping:
-                    # Apply mapping to scope data
-                    df_scope_mapped = df_scope.copy()
-                    df_scope_mapped[merge_key] = df_scope_mapped[merge_key].map(lambda x: mapping.get(x, x))
-                    df_final = df_final.merge(df_scope_mapped, left_on='enzyme_id', right_on=merge_key, how='left', suffixes=('', '_scope'))
-                else:
-                    logger.warning("Could not match enzyme variants between datasets")
-                    df_final = df_test_merge
-            else:
-                df_final = df_test_merge
-                logger.info(f"Direct merge matched {matched_count} records")
-        else:
-            logger.info("No substrate scope data available")
-
-        # Add comprehensive column structure for missing data
-        essential_columns = [
-            'enzyme_id', 'parent_id', 'generation', 'mutations', 'campaign_id', 'notes',
-            'aa_seq', 'dna_seq', 'seq_confidence', 'truncated', 'seq_source', 'doi',
-            'substrate_list', 'substrate_iupac_list', 'product_list', 'product_iupac_list',
-            'cofactor_list', 'cofactor_iupac_list', 'yield', 'ee', 'ttn',
-            'reaction_temperature', 'reaction_ph', 'reaction_buffer', 'reaction_other_conditions',
-            'data_location'
-        ]
+        logger.info(f"Lineage format pipeline completed successfully")
+        logger.info(f"Final output saved to: {output_csv}")
+        logger.info(f"Output contains {len(df)} rows in plate format (flattened)")

-        #
-
-
-
-
-        # Clean up duplicate columns from merging
-        columns_to_keep = []
-        seen_base_names = set()
-        for col in df_final.columns:
-            base_name = col.split('_reaction')[0].split('_scope')[0]
-            if base_name not in seen_base_names:
-                columns_to_keep.append(col)
-                seen_base_names.add(base_name)
-            elif col.endswith('_scope') or col.endswith('_reaction'):
-                # Prefer scope or reaction data over base lineage data for certain columns
-                if base_name in ['substrate_list', 'product_list', 'yield', 'ee', 'reaction_temperature']:
-                    columns_to_keep.append(col)
-                    # Remove the base column if it exists
-                    if base_name in columns_to_keep:
-                        columns_to_keep.remove(base_name)
-                    seen_base_names.add(base_name)
-
-        df_final = df_final[columns_to_keep]
-
-        # Rename merged columns back to standard names
-        rename_map = {}
-        for col in df_final.columns:
-            if col.endswith('_scope') or col.endswith('_reaction'):
-                base_name = col.split('_scope')[0].split('_reaction')[0]
-                rename_map[col] = base_name
-        df_final = df_final.rename(columns=rename_map)
-
-        # Save the comprehensive final output
-        df_final.to_csv(output_csv, index=False)
-
-        logger.info(f"Final comprehensive format complete: {output_csv}")
-        logger.info(f"Final output contains {len(df_final)} variants with {len(df_final.columns)} data columns")
-
-        # Log what data was successfully merged
-        if has_reaction_data:
-            logger.info("✓ Reaction performance data merged")
-        if has_scope_data:
-            logger.info("✓ Substrate scope data merged")
-
-        # Now run the actual lineage format to produce plate-based format
-        logger.info("\nRunning lineage format to produce plate-based output...")
-        try:
-            from .lineage_format import flatten_dataframe
-
-            # Create the plate-based output filename
-            plate_output = output_csv.parent / (output_csv.stem + "_plate_format.csv")
-
-            # Flatten the dataframe to plate format
-            df_flattened = flatten_dataframe(df_final)
-
-            # Save the flattened output
-            df_flattened.to_csv(plate_output, index=False)
-
-            logger.info(f"✓ Plate-based format saved to: {plate_output}")
-            logger.info(f"  Contains {len(df_flattened)} rows with plate/well assignments")
-
-            # Update the final output path to be the plate format
-            output_csv = plate_output
-
-        except Exception as e:
-            logger.warning(f"Could not generate plate-based format: {e}")
-            logger.info("Comprehensive format will be used as final output")
+        # Log column summary
+        key_columns = ['enzyme_id', 'substrate', 'product', 'yield', 'ee', 'ttn',
+                       'substrate_smiles', 'product_smiles', 'protein_sequence']
+        available_columns = [col for col in key_columns if col in df.columns]
+        logger.info(f"Key columns in output: {', '.join(available_columns)}")

         return output_csv

     except Exception as e:
-        logger.warning(f"
-        logger.info("
+        logger.warning(f"Lineage formatting failed: {e}")
+        logger.info("Falling back to simple concatenation...")

-        #
-        import
-
+        # Fallback to simple concatenation
+        import pandas as pd
+        dfs = []
+
+        try:
+            df_reaction = pd.read_csv(reaction_csv)
+            if len(df_reaction) > 0:
+                dfs.append(df_reaction)
+        except:
+            pass
+
+        try:
+            df_scope = pd.read_csv(substrate_scope_csv)
+            if len(df_scope) > 0:
+                dfs.append(df_scope)
+        except:
+            pass
+
+        if dfs:
+            df_final = pd.concat(dfs, ignore_index=True)
+            df_final.to_csv(output_csv, index=False)
+        else:
+            import shutil
+            shutil.copy2(cleaned_csv, output_csv)

-        logger.info(f"
+        logger.info(f"Fallback output saved to: {output_csv}")
         return output_csv

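The rewritten `run_lineage_format` drops roughly 120 lines of ad-hoc merging, Gemini-based variant matching, and column cleanup in favor of a single delegation to `lineage_format.run_pipeline`, which now owns the merging, sequence filling, IUPAC-to-SMILES conversion, and plate-format flattening. A minimal sketch of calling it directly, mirroring the keyword arguments shown above (the file names are placeholders):

```python
# Sketch: invoking lineage_format.run_pipeline standalone, with the same
# keyword arguments the wrapper now passes. File names are placeholders.
from debase import lineage_format

df = lineage_format.run_pipeline(
    reaction_csv="3a_reaction_info.csv",           # or None when there is no reaction data
    substrate_scope_csv="3b_substrate_scope.csv",  # or None when there is no scope data
    output_csv="final_output.csv",
)
print(f"{len(df)} rows in plate format")
```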
@@ -478,6 +449,26 @@ def run_pipeline(
     reaction_csv = output_dir / "3a_reaction_info.csv"
     substrate_csv = output_dir / "3b_substrate_scope.csv"

+    # Setup file logging
+    log_file = output_dir / f"debase_pipeline_{time.strftime('%Y%m%d_%H%M%S')}.log"
+
+    # Configure logging to both file and console
+    file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
+    file_handler.setLevel(logging.DEBUG)
+    file_formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s')
+    file_handler.setFormatter(file_formatter)
+
+    # Add file handler to root logger and all module loggers
+    root_logger = logging.getLogger()
+    root_logger.addHandler(file_handler)
+
+    # Also add to module-specific loggers
+    for module_name in ['debase.enzyme_lineage_extractor', 'debase.cleanup_sequence',
+                        'debase.reaction_info_extractor', 'debase.substrate_scope_extractor',
+                        'debase.lineage_format', 'debase.wrapper']:
+        module_logger = logging.getLogger(module_name)
+        module_logger.addHandler(file_handler)
+
     try:
         # Reset token usage tracking for this pipeline run
         reset_token_usage()
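With the handler wiring above, each pipeline run also writes a timestamped `debase_pipeline_*.log` into the output directory. A self-contained sketch of the same formatter, pointed at stderr instead of a file so it can be run on its own:

```python
# Sketch: the formatter from the diff above, attached to a StreamHandler
# instead of a FileHandler so the example is self-contained.
import logging

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s'))

log = logging.getLogger('debase.wrapper')
log.addHandler(handler)
log.setLevel(logging.INFO)
log.info('Pipeline started')
# -> 2025-07-01 12:00:00,000 [INFO] debase.wrapper: Pipeline started
```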
@@ -487,6 +478,7 @@ def run_pipeline(
         logger.info(f"Manuscript: {manuscript_path}")
         logger.info(f"SI: {si_path if si_path else 'None'}")
         logger.info(f"Output: {output_path}")
+        logger.info(f"Log file: {log_file}")
         logger.info("="*60)

         start_time = time.time()
@@ -529,6 +521,9 @@ def run_pipeline(
         # Calculate token usage and estimated costs
         total_input_tokens, total_output_tokens, estimated_cost = calculate_token_usage_and_cost()

+        # Save token usage to CSV file
+        save_token_usage_to_csv(manuscript_path, total_input_tokens, total_output_tokens, estimated_cost, elapsed, output_dir)
+
         logger.info("\n" + "="*60)
         logger.info("PIPELINE COMPLETED SUCCESSFULLY")
         logger.info(f"Comprehensive output: {output_path}")
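Each run therefore leaves a `price_<manuscript>.csv` beside the other outputs. A sketch of inspecting it (the file name is hypothetical; the columns are the ones defined in `save_token_usage_to_csv`):

```python
# Sketch: reading the price CSV written by save_token_usage_to_csv.
# The file name is hypothetical; the columns come from the diff above.
import pandas as pd

df = pd.read_csv("price_my_manuscript.csv")
print(df[["input_tokens", "output_tokens", "total_tokens",
          "estimated_cost_usd", "runtime_seconds"]].iloc[0])
```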
@@ -563,6 +558,15 @@ def run_pipeline(
     except Exception as e:
         logger.error(f"Pipeline failed: {str(e)}")
         raise
+    finally:
+        # Clean up file handler
+        file_handler.close()
+        root_logger.removeHandler(file_handler)
+        for module_name in ['debase.enzyme_lineage_extractor', 'debase.cleanup_sequence',
+                            'debase.reaction_info_extractor', 'debase.substrate_scope_extractor',
+                            'debase.lineage_format', 'debase.wrapper']:
+            module_logger = logging.getLogger(module_name)
+            module_logger.removeHandler(file_handler)


 def main():
debase-0.4.4.dist-info/METADATA
ADDED
@@ -0,0 +1,121 @@
+Metadata-Version: 2.4
+Name: debase
+Version: 0.4.4
+Summary: Enzyme lineage analysis and sequence extraction package
+Home-page: https://github.com/YuemingLong/DEBase
+Author: DEBase Team
+Author-email: DEBase Team <ylong@caltech.edu>
+License: MIT
+Project-URL: Homepage, https://github.com/YuemingLong/DEBase
+Project-URL: Documentation, https://github.com/YuemingLong/DEBase#readme
+Project-URL: Repository, https://github.com/YuemingLong/DEBase
+Project-URL: Issues, https://github.com/YuemingLong/DEBase/issues
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Scientific/Engineering :: Chemistry
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=1.0.0
+Requires-Dist: PyMuPDF>=1.18.0
+Requires-Dist: numpy>=1.19.0
+Requires-Dist: google-generativeai>=0.3.0
+Requires-Dist: biopython>=1.78
+Requires-Dist: requests>=2.25.0
+Requires-Dist: httpx>=0.24.0
+Requires-Dist: tqdm>=4.60.0
+Requires-Dist: openpyxl>=3.0.0
+Requires-Dist: PyPDF2>=2.0.0
+Requires-Dist: Pillow>=8.0.0
+Requires-Dist: networkx>=2.5
+Provides-Extra: rdkit
+Requires-Dist: rdkit>=2020.03.1; extra == "rdkit"
+Provides-Extra: dev
+Requires-Dist: pytest>=6.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: isort; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Requires-Dist: mypy; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: sphinx>=4.0; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python
+
+# DEBase
+
+DEBase is a Python package for extracting and analyzing enzyme lineage data from scientific papers using AI-powered parsing.
+
+## Features
+
+- Extract enzyme variant lineages from PDF documents
+- Parse protein and DNA sequences with mutation annotations
+- Extract reaction performance metrics (yield, TTN, ee)
+- Extract and organize substrate scope data
+- Match enzyme variants across different data sources using AI
+- Generate structured CSV outputs for downstream analysis
+
+## Installation
+
+```bash
+pip install debase
+```
+
+## Quick Start
+
+```bash
+# Run the complete pipeline
+debase --manuscript paper.pdf --si supplementary.pdf --output results.csv
+
+# Enable debug mode to save Gemini prompts and responses
+debase --manuscript paper.pdf --si supplementary.pdf --output results.csv --debug-dir ./debug_output
+
+# Individual components with debugging
+python -m debase.enzyme_lineage_extractor --manuscript paper.pdf --output lineage.csv --debug-dir ./debug_output
+python -m debase.reaction_info_extractor --manuscript paper.pdf --lineage-csv lineage.csv --output reactions.csv --debug-dir ./debug_output
+python -m debase.substrate_scope_extractor --manuscript paper.pdf --lineage-csv lineage.csv --output substrate_scope.csv --debug-dir ./debug_output
+python -m debase.lineage_format -r reactions.csv -s substrate_scope.csv -o final.csv -v
+```
+
+## Debugging
+
+Use the `--debug-dir` flag to save all Gemini API prompts and responses for debugging:
+- Location extraction prompts
+- Sequence extraction prompts (can be very large, up to 150K characters)
+- Enzyme matching prompts
+- All API responses with timestamps
+- Note: lineage_format.py uses `-v` for verbose output instead of `--debug-dir`
+
+## Requirements
+
+- Python 3.8+
+- Google Gemini API key (set as GEMINI_API_KEY environment variable)
+
+## Version
+
+0.4.4
+
+## License
+
+MIT License
+
+## Authors
+
+DEBase Team - Caltech
+
+## Contact
+
+ylong@caltech.edu
debase-0.4.4.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=Vtl1u7rFItRnkcTvBiUypIltuuzta9Uy3PxMO2NgNgc,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
+debase/enzyme_lineage_extractor.py,sha256=jWyDRfOY792zjY5SZCvhNfQxVcEOC1JjTGb9Wo2qZ4I,170543
+debase/lineage_format.py,sha256=ch5kyoUqD_4Hj7K0hJrRbKrN_FysqFrFXgbyDIgp2oA,57515
+debase/reaction_info_extractor.py,sha256=Gv1qgzInNWxdaEJdsWGlgyy5syL2qClVoKHFQpR_6q0,158498
+debase/substrate_scope_extractor.py,sha256=7JyTE3CiIQVDDetwfENCoiq5bLnHElsY3Db1ThVLEBE,115884
+debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
+debase-0.4.4.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.4.4.dist-info/METADATA,sha256=Gwx754a5Zr_0yp-HXQuRRLylgEp0hD15MhhMjSOVMHo,4047
+debase-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.4.4.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.4.4.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.4.4.dist-info/RECORD,,