amylodeep 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Alisa Davtyan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: amylodeep
3
+ Version: 0.1.0
4
+ Summary: Prediction of amyloid propensity from amino acid sequences using ensemble deep learning and LLM models
5
+ Author-email: Alisa Davtyan <alisadavtyan7@gmail.com>
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/AlisaDavtyan/protein_classification
8
+ Project-URL: Bug Tracker, https://github.com/AlisaDavtyan/protein_classification/issues
9
+ Keywords: bioinformatics,amyloid,deep learning,protein,sequence classification
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: torch>=1.12.0
23
+ Requires-Dist: transformers>=4.30.0
24
+ Requires-Dist: xgboost>=1.7.0
25
+ Requires-Dist: numpy>=1.20
26
+ Requires-Dist: pandas>=1.3
27
+ Requires-Dist: scikit-learn>=1.0
28
+ Requires-Dist: jax-unirep>=2.0.0
29
+ Requires-Dist: wandb>=0.14
30
+ Requires-Dist: toml>=0.10.2
31
+ Provides-Extra: ui
32
+ Requires-Dist: streamlit>=1.18; extra == "ui"
33
+ Requires-Dist: matplotlib>=3.5; extra == "ui"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=6.0; extra == "dev"
36
+ Requires-Dist: black>=22.0; extra == "dev"
37
+ Requires-Dist: flake8>=3.9; extra == "dev"
38
+ Dynamic: license-file
39
+
40
+ # AmyloDeep
41
+
42
+ **Prediction of amyloid propensity from amino acid sequences using deep learning**
43
+
44
+ AmyloDeep is a Python package that uses a 5-model ensemble to predict amyloidogenic regions in protein sequences using a rolling window approach. The package combines multiple state-of-the-art machine learning models including ESM2 transformers, UniRep embeddings, SVM, and XGBoost to provide accurate amyloid propensity predictions.
45
+
46
+ ## Features
47
+
48
+ - **Multi-model ensemble**: Combines 5 different models for robust predictions
49
+ - **Rolling window analysis**: Analyzes sequences using sliding windows of configurable size
50
+ - **Pre-trained models**: Uses models trained on amyloid sequence databases
51
+ - **Calibrated probabilities**: Includes probability calibration for better confidence estimates
52
+ - **Easy-to-use API**: Simple Python interface and command-line tool
53
+ - **Streamlit web interface**: Optional web interface for interactive predictions
54
+
55
+ ## Installation
56
+
57
+ ### From PyPI (recommended)
58
+
59
+ ```bash
60
+ pip install amylodeep
61
+ ```
62
+
63
+ ### From source
64
+
65
+ ```bash
66
+ git clone https://github.com/AlisaDavtyan/protein_classification.git
67
+ cd amylodeep
68
+ pip install -e .
69
+ ```
70
+
71
+
72
+
73
+ For development:
74
+ ```bash
75
+ pip install amylodeep[dev]
76
+ ```
77
+
78
+ ## Quick Start
79
+
80
+ ### Python API
81
+
82
+ ```python
83
+ from amylodeep import predict_ensemble_rolling
84
+
85
+ # Predict amyloid propensity for a protein sequence
86
+ sequence = "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
87
+ result = predict_ensemble_rolling(sequence, window_size=6)
88
+
89
+ print(f"Average probability: {result['avg_probability']:.4f}")
90
+ print(f"Maximum probability: {result['max_probability']:.4f}")
91
+
92
+ # Access position-wise probabilities
93
+ for position, probability in result['position_probs']:
94
+ print(f"Position {position}: {probability:.4f}")
95
+ ```
96
+
97
+ ### Command Line Interface
98
+
99
+ ```bash
100
+ # Basic prediction
101
+ amylodeep "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
102
+
103
+ # With custom window size
104
+ amylodeep "SEQUENCE" --window-size 10
105
+
106
+ # Save results to file
107
+ amylodeep "SEQUENCE" --output results.json --format json
108
+
109
+ # CSV output
110
+ amylodeep "SEQUENCE" --output results.csv --format csv
111
+ ```
112
+
113
+
114
+ ## Model Architecture
115
+
116
+ AmyloDeep uses an ensemble of 5 models:
117
+
118
+ 1. **ESM2-150M**: Fine-tuned ESM2 transformer (150M parameters)
119
+ 2. **UniRep**: UniRep-based neural network classifier
120
+ 3. **ESM2-650M**: Custom classifier using ESM2-650M embeddings
121
+ 4. **SVM**: Support Vector Machine with ESM2 embeddings
122
+ 5. **XGBoost**: Gradient boosting with ESM2 embeddings
123
+
124
+ The models are combined using probability averaging, with some models using probability calibration (Platt scaling or isotonic regression) for better confidence estimates.
125
+
126
+ ## Requirements
127
+
128
+ - Python >= 3.8
129
+ - PyTorch >= 1.9.0
130
+ - Transformers >= 4.15.0
131
+ - NumPy >= 1.20.0
132
+ - scikit-learn >= 1.0.0
133
+ - XGBoost >= 1.5.0
134
+ - jax-unirep >= 2.0.0
135
+ - wandb >= 0.12.0
136
+
137
+
138
+
139
+
140
+ ### Main Functions
141
+
142
+ #### `predict_ensemble_rolling(sequence, window_size=6)`
143
+
144
+ Predict amyloid propensity for a protein sequence using rolling window analysis.
145
+
146
+ **Parameters:**
147
+ - `sequence` (str): Protein sequence (amino acid letters)
148
+ - `window_size` (int): Size of the rolling window (default: 6)
149
+
150
+ **Returns:**
151
+ Dictionary containing:
152
+ - `position_probs`: List of (position, probability) tuples
153
+ - `avg_probability`: Average probability across all windows
154
+ - `max_probability`: Maximum probability across all windows
155
+ - `sequence_length`: Length of the input sequence
156
+ - `num_windows`: Number of windows analyzed
157
+
158
+
159
+ Individual model classes for ESM and UniRep-based predictions.
160
+
161
+ ## Contributing
162
+
163
+ We welcome contributions! Please see our contributing guidelines for more information.
164
+
165
+ ## License
166
+
167
+ This project is licensed under the MIT License - see the LICENSE file for details.
168
+
169
+ ## Citation
170
+
171
+ If you use AmyloDeep in your research, please cite:
172
+
173
+ ```bibtex
174
+ @software{amylodeep2025,
175
+ title={AmyloDeep: Prediction of amyloid propensity from amino acid sequences using deep learning},
176
+ author={Alisa Davtyan},
177
+ year={2025},
178
+ url={https://github.com/AlisaDavtyan/protein_classification}
179
+ }
180
+ ```
181
+
182
+ ## Support
183
+
184
+ For questions and support:
185
+ - Open an issue on GitHub
186
+ - Contact: alisadavtyan7@gmail.com
187
+
188
+ ## Changelog
189
+
190
+ ### v0.1.0
191
+ - Initial release
192
+ - 5-model ensemble implementation
193
+ - Rolling window prediction
194
+ - Command-line interface
195
+ - Python API
@@ -0,0 +1,156 @@
1
+ # AmyloDeep
2
+
3
+ **Prediction of amyloid propensity from amino acid sequences using deep learning**
4
+
5
+ AmyloDeep is a Python package that uses a 5-model ensemble to predict amyloidogenic regions in protein sequences using a rolling window approach. The package combines multiple state-of-the-art machine learning models including ESM2 transformers, UniRep embeddings, SVM, and XGBoost to provide accurate amyloid propensity predictions.
6
+
7
+ ## Features
8
+
9
+ - **Multi-model ensemble**: Combines 5 different models for robust predictions
10
+ - **Rolling window analysis**: Analyzes sequences using sliding windows of configurable size
11
+ - **Pre-trained models**: Uses models trained on amyloid sequence databases
12
+ - **Calibrated probabilities**: Includes probability calibration for better confidence estimates
13
+ - **Easy-to-use API**: Simple Python interface and command-line tool
14
+ - **Streamlit web interface**: Optional web interface for interactive predictions
15
+
16
+ ## Installation
17
+
18
+ ### From PyPI (recommended)
19
+
20
+ ```bash
21
+ pip install amylodeep
22
+ ```
23
+
24
+ ### From source
25
+
26
+ ```bash
27
+ git clone https://github.com/AlisaDavtyan/protein_classification.git
28
+ cd amylodeep
29
+ pip install -e .
30
+ ```
31
+
32
+
33
+
34
+ For development:
35
+ ```bash
36
+ pip install amylodeep[dev]
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ### Python API
42
+
43
+ ```python
44
+ from amylodeep import predict_ensemble_rolling
45
+
46
+ # Predict amyloid propensity for a protein sequence
47
+ sequence = "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
48
+ result = predict_ensemble_rolling(sequence, window_size=6)
49
+
50
+ print(f"Average probability: {result['avg_probability']:.4f}")
51
+ print(f"Maximum probability: {result['max_probability']:.4f}")
52
+
53
+ # Access position-wise probabilities
54
+ for position, probability in result['position_probs']:
55
+ print(f"Position {position}: {probability:.4f}")
56
+ ```
57
+
58
+ ### Command Line Interface
59
+
60
+ ```bash
61
+ # Basic prediction (reads sequences from a FASTA file, writes CSV)
62
+ amylodeep -i sequences.fasta -o results.csv
63
+
64
+ # With a custom rolling-window size
65
+ amylodeep -i sequences.fasta -o results.csv --window-size 10
66
+
67
+ # Short option flags are also accepted
68
+ amylodeep -i sequences.fasta -o results.csv -w 10
69
+
70
+ # The WANDB_API_KEY environment variable must be set beforehand
71
+ export WANDB_API_KEY="<your-key>"
72
+ ```
73
+
74
+
75
+ ## Model Architecture
76
+
77
+ AmyloDeep uses an ensemble of 5 models:
78
+
79
+ 1. **ESM2-150M**: Fine-tuned ESM2 transformer (150M parameters)
80
+ 2. **UniRep**: UniRep-based neural network classifier
81
+ 3. **ESM2-650M**: Custom classifier using ESM2-650M embeddings
82
+ 4. **SVM**: Support Vector Machine with ESM2 embeddings
83
+ 5. **XGBoost**: Gradient boosting with ESM2 embeddings
84
+
85
+ The models are combined using probability averaging, with some models using probability calibration (Platt scaling or isotonic regression) for better confidence estimates.
86
+
87
+ ## Requirements
88
+
89
+ - Python >= 3.8
90
+ - PyTorch >= 1.12.0
91
+ - Transformers >= 4.30.0
92
+ - NumPy >= 1.20.0
93
+ - scikit-learn >= 1.0.0
94
+ - XGBoost >= 1.7.0
95
+ - jax-unirep >= 2.0.0
96
+ - wandb >= 0.14.0
97
+
98
+
99
+
100
+
101
+ ### Main Functions
102
+
103
+ #### `predict_ensemble_rolling(sequence, window_size=6)`
104
+
105
+ Predict amyloid propensity for a protein sequence using rolling window analysis.
106
+
107
+ **Parameters:**
108
+ - `sequence` (str): Protein sequence (amino acid letters)
109
+ - `window_size` (int): Size of the rolling window (default: 6)
110
+
111
+ **Returns:**
112
+ Dictionary containing:
113
+ - `position_probs`: List of (position, probability) tuples
114
+ - `avg_probability`: Average probability across all windows
115
+ - `max_probability`: Maximum probability across all windows
116
+ - `sequence_length`: Length of the input sequence
117
+ - `num_windows`: Number of windows analyzed
118
+
119
+
120
+ Individual model classes for ESM and UniRep-based predictions.
121
+
122
+ ## Contributing
123
+
124
+ We welcome contributions! Please see our contributing guidelines for more information.
125
+
126
+ ## License
127
+
128
+ This project is licensed under the MIT License - see the LICENSE file for details.
129
+
130
+ ## Citation
131
+
132
+ If you use AmyloDeep in your research, please cite:
133
+
134
+ ```bibtex
135
+ @software{amylodeep2025,
136
+ title={AmyloDeep: Prediction of amyloid propensity from amino acid sequences using deep learning},
137
+ author={Alisa Davtyan},
138
+ year={2025},
139
+ url={https://github.com/AlisaDavtyan/protein_classification}
140
+ }
141
+ ```
142
+
143
+ ## Support
144
+
145
+ For questions and support:
146
+ - Open an issue on GitHub
147
+ - Contact: alisadavtyan7@gmail.com
148
+
149
+ ## Changelog
150
+
151
+ ### v0.1.0
152
+ - Initial release
153
+ - 5-model ensemble implementation
154
+ - Rolling window prediction
155
+ - Command-line interface
156
+ - Python API
@@ -0,0 +1,25 @@
1
+ """
2
+ AmyloDeep: Prediction of amyloid propensity from amino acid sequences using deep learning
3
+
4
+ This package provides an ensemble of machine learning models to predict
5
+ amyloidogenic regions in protein sequences using a rolling window approach.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "Alisa Davtyan"
10
+ __email__ = "alisadavtyan7@gmail.com"
11
+
12
+ from .utils import predict_ensemble_rolling, load_models_and_calibrators
13
+ from .ensemble_predictor import EnsembleRollingWindowPredictor
14
+ from .esm_classifier import ESMClassifier, ESMClassifierConfig
15
+ from .unirep_model import UniRepClassifier, UniRepClassifierConfig
16
+
17
+ __all__ = [
18
+ "predict_ensemble_rolling",
19
+ "load_models_and_calibrators",
20
+ "EnsembleRollingWindowPredictor",
21
+ "ESMClassifier",
22
+ "ESMClassifierConfig",
23
+ "UniRepClassifier",
24
+ "UniRepClassifierConfig",
25
+ ]
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env python
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+ from .utils import load_models_and_calibrators
7
+ from .ensemble_predictor import EnsembleRollingWindowPredictor
8
+
9
+ def parse_fasta(fasta_file):
10
+ """
11
+ Parse a FASTA file and return sequences with their IDs.
12
+ """
13
+ sequences = []
14
+ current_id = None
15
+ current_seq = ""
16
+
17
+ with open(fasta_file, 'r') as f:
18
+ for line in f:
19
+ line = line.strip()
20
+ if line.startswith('>'):
21
+ if current_id is not None:
22
+ sequences.append((current_id, current_seq.upper()))
23
+ current_id = line[1:] # Remove '>' character
24
+ current_seq = ""
25
+ else:
26
+ current_seq += line
27
+
28
+ # Final sequence
29
+ if current_id is not None:
30
+ sequences.append((current_id, current_seq.upper()))
31
+
32
+ return sequences
33
+
34
+ def main():
35
+ """
36
+ CLI entry point for AmyloDeeP predictions.
37
+ """
38
+ parser = argparse.ArgumentParser(
39
+ description="Run amyloid propensity predictions on a FASTA file."
40
+ )
41
+ parser.add_argument(
42
+ "-i", "--input",
43
+ required=True,
44
+ help="Path to input FASTA file containing amino acid sequences."
45
+ )
46
+ parser.add_argument(
47
+ "-o", "--output",
48
+ required=True,
49
+ help="Path to output CSV file for writing predictions."
50
+ )
51
+ parser.add_argument(
52
+ "-w", "--window-size",
53
+ type=int,
54
+ default=6,
55
+ help="Rolling window size (default: 6)"
56
+ )
57
+
58
+ args = parser.parse_args()
59
+
60
+ # Ensure WANDB_API_KEY is set
61
+ if not os.getenv("WANDB_API_KEY"):
62
+ print("Error: Environment variable WANDB_API_KEY must be set.", file=sys.stderr)
63
+ sys.exit(1)
64
+
65
+ # Check input file
66
+ if not os.path.exists(args.input):
67
+ print(f"Error: Input file '{args.input}' not found.", file=sys.stderr)
68
+ sys.exit(1)
69
+
70
+ # Load models, calibrators, and tokenizer
71
+ try:
72
+ print("Loading models and calibrators...", file=sys.stderr)
73
+ models, calibrators, tokenizer_1 = load_models_and_calibrators()
74
+ except Exception as e:
75
+ print(f"Error loading models: {e}", file=sys.stderr)
76
+ sys.exit(1)
77
+
78
+ # Initialize ensemble predictor
79
+ predictor = EnsembleRollingWindowPredictor(
80
+ models_dict=models,
81
+ calibrators_dict=calibrators,
82
+ tokenizer=tokenizer_1
83
+ )
84
+
85
+ # Parse FASTA
86
+ try:
87
+ print("Parsing FASTA file...", file=sys.stderr)
88
+ sequences = parse_fasta(args.input)
89
+ if not sequences:
90
+ print("Error: No sequences found in FASTA file.", file=sys.stderr)
91
+ sys.exit(1)
92
+ print(f"Found {len(sequences)} sequences.", file=sys.stderr)
93
+ except Exception as e:
94
+ print(f"Error parsing FASTA file: {e}", file=sys.stderr)
95
+ sys.exit(1)
96
+
97
+ # Run predictions
98
+ results = []
99
+ try:
100
+ for i, (seq_id, sequence) in enumerate(sequences, 1):
101
+ print(f"Processing sequence {i}/{len(sequences)}: {seq_id}", file=sys.stderr)
102
+
103
+ if not sequence.replace('X', '').isalpha():
104
+ print(f"Warning: Sequence {seq_id} contains invalid characters. Skipping.", file=sys.stderr)
105
+ continue
106
+
107
+ result = predictor.rolling_window_prediction(sequence, args.window_size)
108
+
109
+ for position, probability in result['position_probs']:
110
+ results.append({
111
+ 'sequence_id': seq_id,
112
+ 'position': position,
113
+ 'probability': probability,
114
+ 'sequence_length': result['sequence_length'],
115
+ 'avg_probability': result['avg_probability'],
116
+ 'max_probability': result['max_probability']
117
+ })
118
+
119
+ except Exception as e:
120
+ print(f"Error during prediction: {e}", file=sys.stderr)
121
+ sys.exit(1)
122
+
123
+ # Write CSV
124
+ try:
125
+ import pandas as pd
126
+ df = pd.DataFrame(results)
127
+ df.to_csv(args.output, index=False)
128
+ print(f"Predictions saved to {args.output}")
129
+ print(f"Total predictions: {len(results)} position-wise results from {len(sequences)} sequences")
130
+ except ImportError:
131
+ print("pandas is required to write CSV outputs. Install via 'pip install pandas'", file=sys.stderr)
132
+ sys.exit(1)
133
+ except Exception as e:
134
+ print(f"Error writing output file: {e}", file=sys.stderr)
135
+ sys.exit(1)
136
+
137
+ # Allow running as `python -m amylodeep.cli`
138
+ if __name__ == "__main__":
139
+ main()
@@ -0,0 +1,254 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import jax_unirep
6
+ import pickle
7
+
8
+
9
+ class EnsembleRollingWindowPredictor:
10
+ def __init__(self, models_dict, calibrators_dict=None, tokenizer=None):
11
+ """
12
+ Initialize the ensemble predictor with all 5 models and calibrators.
13
+
14
+ Args:
15
+ models_dict: Dictionary containing all 5 models with keys:
16
+ 'esm2_150M', 'unirep', 'esm2_650M', 'svm', 'xgboost'
17
+ calibrators_dict: Dictionary containing calibrators where applicable
18
+ """
19
+ self.models = models_dict
20
+ self.calibrators = calibrators_dict or {}
21
+ self.tokenizer_1 = tokenizer
22
+
23
+
24
+ self.tokenizer_esm = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
25
+ self.esm_model = AutoModel.from_pretrained("facebook/esm2_t33_650M_UR50D", add_pooling_layer=False)
26
+
27
+ # Freeze ESM model parameters
28
+ for param in self.esm_model.parameters():
29
+ param.requires_grad = False
30
+
31
+ self.esm_model.eval()
32
+
33
+ def _predict_model_1(self, sequences):
34
+ """ESM2 150M fine-tuned model prediction"""
35
+ def tokenize_function(sequences):
36
+ return self.tokenizer_1(sequences, padding="max_length", truncation=True, max_length=128)
37
+
38
+ encodings = tokenize_function(sequences)
39
+ input_ids = torch.tensor(encodings['input_ids'])
40
+ attention_mask = torch.tensor(encodings['attention_mask'])
41
+
42
+ with torch.no_grad():
43
+ outputs = self.models['esm2_150M'](input_ids=input_ids, attention_mask=attention_mask)
44
+ probs = F.softmax(outputs.logits, dim=1)[:, 1]
45
+
46
+ return probs.numpy()
47
+
48
+ def _predict_model_2(self, sequences):
49
+ """UniRep model prediction"""
50
+ def unirep_tokenize_function(sequences):
51
+ h_final, c_final, h_avg = jax_unirep.get_reps(sequences)
52
+ return {
53
+ "embeddings": h_final,
54
+ "avg_hidden": h_avg,
55
+ "cell_state": c_final
56
+ }
57
+
58
+ encodings = unirep_tokenize_function(sequences)
59
+ embeddings = torch.tensor(encodings["embeddings"], dtype=torch.float32)
60
+
61
+ with torch.no_grad():
62
+ outputs = self.models['unirep'](embeddings=embeddings)
63
+ probs = F.softmax(outputs['logits'], dim=1)[:, 1]
64
+
65
+ probs_np = probs.numpy()
66
+
67
+
68
+ if 'platt_unirep' in self.calibrators:
69
+ probs_np = self.calibrators['platt_unirep'].predict_proba(probs_np.reshape(-1, 1))[:, 1]
70
+
71
+ return probs_np
72
+
73
+ def _extract_mean_esm_embeddings(self, encodings, batch_size=8):
74
+ """Shared helper to extract mean-pooled ESM2-650M embeddings."""
75
+ embeddings = []
76
+ input_ids = encodings['input_ids']
77
+ attention_mask = encodings['attention_mask']
78
+ dataset_size = input_ids.size(0)
79
+
80
+ with torch.no_grad():
81
+ for i in range(0, dataset_size, batch_size):
82
+ batch_input_ids = input_ids[i:i+batch_size]
83
+ batch_attention_mask = attention_mask[i:i+batch_size]
84
+ outputs = self.esm_model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
85
+
86
+ sequence_output = outputs.last_hidden_state
87
+ mask_expanded = batch_attention_mask.unsqueeze(-1).expand(sequence_output.size()).float()
88
+ sum_embeddings = torch.sum(sequence_output * mask_expanded, 1)
89
+ sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
90
+ mean_embeddings = sum_embeddings / sum_mask
91
+
92
+ embeddings.append(mean_embeddings)
93
+
94
+ return torch.cat(embeddings, dim=0)
95
+
96
+ def _predict_model_3(self, sequences):
97
+ """ESM2 650M with custom classifier prediction"""
98
+ def tokenize_function(sequences):
99
+ return self.tokenizer_esm(sequences, padding="max_length", truncation=True,
100
+ max_length=128, return_tensors="pt")
101
+
102
+ def extract_esm_embeddings(encodings, batch_size=8):
103
+ embeddings = []
104
+ input_ids = encodings['input_ids']
105
+ attention_mask = encodings['attention_mask']
106
+ dataset_size = input_ids.size(0)
107
+
108
+ with torch.no_grad():
109
+ for i in range(0, dataset_size, batch_size):
110
+ batch_input_ids = input_ids[i:i+batch_size]
111
+ batch_attention_mask = attention_mask[i:i+batch_size]
112
+ outputs = self.esm_model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
113
+
114
+ sequence_output = outputs.last_hidden_state
115
+ mask_expanded = batch_attention_mask.unsqueeze(-1).expand(sequence_output.size()).float()
116
+ sum_embeddings = torch.sum(sequence_output * mask_expanded, 1)
117
+ sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
118
+ mean_embeddings = sum_embeddings / sum_mask
119
+
120
+ embeddings.append(mean_embeddings)
121
+
122
+ return torch.cat(embeddings, dim=0)
123
+
124
+ encodings = tokenize_function(sequences)
125
+ embeddings = extract_esm_embeddings(encodings)
126
+
127
+ with torch.no_grad():
128
+ outputs = self.models['esm2_650M'](embeddings=embeddings)
129
+ probs = F.softmax(outputs['logits'], dim=1)[:, 1]
130
+
131
+ probs_np = probs.numpy()
132
+
133
+
134
+ if 'isotonic_650M_NN' in self.calibrators:
135
+ probs_np = self.calibrators['isotonic_650M_NN'].predict(probs_np)
136
+
137
+ return probs_np
138
+
139
+ def _predict_model_4(self, sequences):
140
+ """SVM model prediction"""
141
+ X_features = self._extract_features_for_svm(sequences)
142
+
143
+ probs = self.models['svm'].predict_proba(X_features)[:, 1]
144
+ return probs
145
+
146
+ def _predict_model_5(self, sequences):
147
+ """XGBoost model prediction"""
148
+ X_features = self._extract_features_for_xgboost(sequences)
149
+
150
+ probs = self.models['xgboost'].predict_proba(X_features)[:, 1]
151
+
152
+
153
+ if 'isotonic_XGBoost' in self.calibrators:
154
+ probs = self.calibrators['isotonic_XGBoost'].predict(probs)
155
+
156
+ return probs
157
+
158
+ def _extract_features_for_svm(self, sequences):
159
+ """Extract ESM2-650M mean-pooled embeddings for SVM."""
160
+ tokenized = self.tokenizer_esm(
161
+ sequences,
162
+ padding="max_length",
163
+ truncation=True,
164
+ max_length=128,
165
+ return_tensors="pt"
166
+ )
167
+ with torch.no_grad():
168
+ embeddings = self._extract_mean_esm_embeddings(tokenized)
169
+ return embeddings.numpy()
170
+
171
+ def _extract_features_for_xgboost(self, sequences):
172
+ """Extract ESM2-650M mean-pooled embeddings for XGBoost."""
173
+ return self._extract_features_for_svm(sequences) # same as SVM
174
+
175
+
176
+ def predict_ensemble(self, sequences):
177
+ """
178
+ Predict ensemble probabilities for a list of sequences.
179
+
180
+ Args:
181
+ sequences: List of protein sequences
182
+
183
+ Returns:
184
+ numpy array of ensemble probabilities
185
+ """
186
+ # Get predictions from all models
187
+ probs_1 = self._predict_model_1(sequences) # ESM2 150M - NO calibration
188
+ probs_2 = self._predict_model_2(sequences) # UniRep - WITH calibration (platt_unirep)
189
+ probs_3 = self._predict_model_3(sequences) # ESM2 650M - WITH calibration (isotonic_650M_NN)
190
+ probs_4 = self._predict_model_4(sequences) # SVM - NO calibration
191
+ probs_5 = self._predict_model_5(sequences) # XGBoost - WITH calibration (isotonic_XGBoost)
192
+
193
+ # Combine probabilities (matching your original mixed_probs_list order)
194
+ mixed_probs_list = [probs_1, probs_2, probs_3, probs_4, probs_5]
195
+
196
+ # Compute average probabilities
197
+ avg_probs = np.mean(mixed_probs_list, axis=0)
198
+
199
+ return avg_probs
200
+
201
+ def rolling_window_prediction(self, sequence, window_size):
202
+ """
203
+ Predict amyloid probability for an entire sequence using rolling window approach.
204
+ The window slides one position at a time across the sequence.
205
+
206
+ Args:
207
+ sequence: Single protein sequence string
208
+ window_size: Size of the sliding window
209
+
210
+ Returns:
211
+ dict containing:
212
+ - 'position_probs': List of (position, probability) tuples
213
+ - 'avg_probability': Average probability across all windows
214
+ - 'max_probability': Maximum probability across all windows
215
+ - 'sequence_length': Length of the input sequence
216
+ """
217
+ sequence_length = len(sequence)
218
+
219
+ if sequence_length < window_size:
220
+ # If sequence is shorter than window, predict on the entire sequence
221
+ prob = self.predict_ensemble([sequence])[0]
222
+ return {
223
+ 'position_probs': [(0, prob)],
224
+ 'avg_probability': prob,
225
+ 'max_probability': prob,
226
+ 'sequence_length': sequence_length
227
+ }
228
+
229
+ # Generate windows - slide one position at a time
230
+ windows = []
231
+ positions = []
232
+
233
+ for i in range(sequence_length - window_size + 1):
234
+ window = sequence[i:i + window_size]
235
+ windows.append(window)
236
+ positions.append(i)
237
+
238
+ # Predict on all windows
239
+ window_probs = self.predict_ensemble(windows)
240
+
241
+ # Combine results
242
+ position_probs = list(zip(positions, window_probs))
243
+ avg_probability = np.mean(window_probs)
244
+ max_probability = np.max(window_probs)
245
+
246
+ return {
247
+ 'position_probs': position_probs,
248
+ 'avg_probability': avg_probability,
249
+ 'max_probability': max_probability,
250
+ 'sequence_length': sequence_length,
251
+ 'num_windows': len(windows)
252
+ }
253
+
254
+
@@ -0,0 +1,57 @@
1
+ # Define a custom configuration for ESM embeddings
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import PreTrainedModel, PretrainedConfig
5
+
6
+
7
+ class ESMClassifierConfig(PretrainedConfig):
8
+ model_type = "esm_classifier"
9
+
10
+ def __init__(
11
+ self,
12
+ input_dim=1280, # ESM2_t30_150M has 640 dim embeddings
13
+ hidden_dims=[2056, 1024, 512, 256, 128],
14
+ num_labels=2,
15
+ dropout=0.2,
16
+ **kwargs
17
+ ):
18
+ super().__init__(**kwargs)
19
+ self.input_dim = input_dim
20
+ self.hidden_dims = hidden_dims
21
+ self.num_labels = num_labels
22
+ self.dropout = dropout
23
+
24
+ # Define a custom model that works with the Trainer
25
+ class ESMClassifier(PreTrainedModel):
26
+ config_class = ESMClassifierConfig
27
+
28
+ def __init__(self, config):
29
+ super().__init__(config)
30
+
31
+ layers = []
32
+ dims = [config.input_dim] + config.hidden_dims
33
+
34
+ for i in range(len(dims) - 1):
35
+ layers.append(nn.Linear(dims[i], dims[i+1]))
36
+ layers.append(nn.ReLU())
37
+ layers.append(nn.Dropout(config.dropout))
38
+
39
+ self.feature_extractor = nn.Sequential(*layers)
40
+ self.classifier = nn.Linear(dims[-1], config.num_labels)
41
+
42
+ def forward(
43
+ self,
44
+ embeddings=None, # This will be your ESM embeddings
45
+ labels=None,
46
+ **kwargs
47
+ ):
48
+ # Process embeddings
49
+ features = self.feature_extractor(embeddings)
50
+ logits = self.classifier(features)
51
+
52
+ loss = None
53
+ if labels is not None:
54
+ loss_fct = nn.CrossEntropyLoss()
55
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
56
+ #
57
+ return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
@@ -0,0 +1 @@
1
+ WANDB_API_KEY = "REDACTED"  # SECURITY: a real API key was published in this release — revoke/rotate it and load the key from the environment, never from a committed file
@@ -0,0 +1,86 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.data import Dataset
4
+ from transformers import PreTrainedModel, PretrainedConfig
5
+ import jax_unirep
6
+
7
+ # --- UniRep Tokenizer ---
8
+ def unirep_tokenize_function(sequences):
9
+ """
10
+ Get UniRep embeddings for a list of protein sequences.
11
+ Returns a dictionary with embeddings compatible with PyTorch datasets.
12
+ """
13
+ h_final, c_final, h_avg = jax_unirep.get_reps(sequences)
14
+ return {
15
+ "embeddings": h_final,
16
+ "avg_hidden": h_avg,
17
+ "cell_state": c_final
18
+ }
19
+
20
+ # --- Custom Dataset using UniRep ---
21
+ class UniRepProteinDataset(Dataset):
22
+ def __init__(self, encodings, labels):
23
+ self.embeddings = torch.tensor(encodings["embeddings"], dtype=torch.float32)
24
+ self.avg_hidden = torch.tensor(encodings["avg_hidden"], dtype=torch.float32)
25
+ self.cell_state = torch.tensor(encodings["cell_state"], dtype=torch.float32)
26
+ self.labels = torch.tensor(labels, dtype=torch.long)
27
+
28
+ def __getitem__(self, idx):
29
+ return {
30
+ "embeddings": self.embeddings[idx],
31
+ "avg_hidden": self.avg_hidden[idx],
32
+ "cell_state": self.cell_state[idx],
33
+ "labels": self.labels[idx]
34
+ }
35
+
36
+ def __len__(self):
37
+ return len(self.labels)
38
+
39
+ # --- Custom Config for UniRepClassifier ---
40
+ class UniRepClassifierConfig(PretrainedConfig):
41
+ model_type = "unirep_classifier"
42
+
43
+ def __init__(
44
+ self,
45
+ input_dim=1900,
46
+ hidden_dims=[512, 128],
47
+ num_labels=2,
48
+ dropout=0.1,
49
+ **kwargs
50
+ ):
51
+ super().__init__(**kwargs)
52
+ self.input_dim = input_dim
53
+ self.hidden_dims = hidden_dims
54
+ self.num_labels = num_labels
55
+ self.dropout = dropout
56
+
57
+ # --- UniRep Classifier Model ---
58
+ class UniRepClassifier(PreTrainedModel):
59
+ config_class = UniRepClassifierConfig
60
+
61
+ def __init__(self, config):
62
+ super().__init__(config)
63
+
64
+ dims = [config.input_dim] + config.hidden_dims
65
+ layers = []
66
+
67
+ for i in range(len(dims) - 1):
68
+ layers.append(nn.Linear(dims[i], dims[i + 1]))
69
+ layers.append(nn.ReLU())
70
+ layers.append(nn.Dropout(config.dropout))
71
+
72
+ self.feature_extractor = nn.Sequential(*layers)
73
+ self.classifier = nn.Linear(dims[-1], config.num_labels)
74
+
75
+ def forward(self, embeddings=None, labels=None, **kwargs):
76
+ features = self.feature_extractor(embeddings)
77
+ logits = self.classifier(features)
78
+
79
+ loss = None
80
+ if labels is not None:
81
+ loss_fn = nn.CrossEntropyLoss()
82
+ loss = loss_fn(logits.view(-1, self.config.num_labels), labels.view(-1))
83
+
84
+ return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
85
+
86
+
@@ -0,0 +1,88 @@
1
+ from unirep_model import UniRepClassifier
2
+ from esm_classifier import ESMClassifier
3
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
4
+ import pickle
5
+ from ensemble_predictor import EnsembleRollingWindowPredictor
6
+ import xgboost as xgb
7
+ import wandb
8
+ import os
9
+ os.environ["WANDB_MODE"] = "disabled"
10
+
11
+ def load_models_and_calibrators():
12
+ """
13
+ Load models and calibrators
14
+ """
15
+ models = {}
16
+
17
+ #initialize wandb api
18
+ api = wandb.Api(api_key=os.environ["WANDB_API_KEY"])
19
+
20
+ # Model 1: ESM2 150M fine-tuned
21
+ artifact_1 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/final_esm2_150M_checkpoint_100_epochs:v0')
22
+ model_path_1 = artifact_1.download()
23
+ models['esm2_150M'] = AutoModelForSequenceClassification.from_pretrained(model_path_1)
24
+ tokenizer_1 = AutoTokenizer.from_pretrained(model_path_1)
25
+
26
+ # Model 2: UniRep classifier
27
+ artifact_2 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/final_UniRepClassifier_4_layers_50_epochs:v0')
28
+ model_path_2 = artifact_2.download()
29
+ models['unirep'] = UniRepClassifier.from_pretrained(model_path_2)
30
+
31
+ # Model 3: ESM2 650M classifier
32
+ artifact_3 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/final_ESMClassifier_650_layers_50_epochs:v0')
33
+ model_path_3 = artifact_3.download()
34
+ models['esm2_650M'] = ESMClassifier.from_pretrained(model_path_3)
35
+
36
+ # Model 4: SVM model
37
+ artifact_4 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/svm_model:v0')
38
+ model_path_4 = artifact_4.download()
39
+ model_path_4_join = os.path.join(model_path_4, "svm_model.pkl")
40
+
41
+ with open(model_path_4_join, "rb") as f:
42
+ models['svm'] = pickle.load(f)
43
+
44
+ # Model 5: XGBoost model
45
+ artifact_5 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/XGBoost:v0')
46
+ model_path_5 = artifact_5.download()
47
+ model_path_5_join = os.path.join(model_path_5, "xgb_model.json")
48
+ xgb_model = xgb.XGBClassifier()
49
+ xgb_model.load_model(model_path_5_join)
50
+ models['xgboost'] = xgb_model
51
+
52
+
53
+ # Calibrators
54
+ calibrators = {}
55
+
56
+ # platt_unirep
57
+ artifact_p1 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/platt_unirep:v0')
58
+ model_path_p1 = artifact_p1.download()
59
+ calibrator_path_p1 = os.path.join(model_path_p1, "platt_unirep.pkl")
60
+ with open(calibrator_path_p1, "rb") as f:
61
+ calibrators['platt_unirep'] = pickle.load(f)
62
+
63
+ # isotonic_650M_NN
64
+ artifact_p2 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/isotonic_650M_NN:v0')
65
+ model_path_p2 = artifact_p2.download()
66
+ calibrator_path_p2 = os.path.join(model_path_p2, "isotonic_650M_NN.pkl")
67
+ with open(calibrator_path_p2, "rb") as f:
68
+ calibrators['isotonic_650M_NN'] = pickle.load(f)
69
+
70
+ # isotonic_XGBoost
71
+ artifact_p3 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/isotonic_XGBoost:v0')
72
+ model_path_p3 = artifact_p3.download()
73
+ calibrator_path_p3 = os.path.join(model_path_p3, "isotonic_XGBoost.pkl")
74
+ with open(calibrator_path_p3, "rb") as f:
75
+ calibrators['isotonic_XGBoost'] = pickle.load(f)
76
+
77
+
78
+ return models, calibrators,tokenizer_1
79
+
80
+
81
+ def predict_ensemble_rolling(sequence: str, window_size: int = 6):
82
+ """
83
+ Run ensemble prediction with rolling window over a single sequence.
84
+ Returns dictionary with average/max probs and position-wise scores.
85
+ """
86
+ models, calibrators ,tokenizer_1 = load_models_and_calibrators()
87
+ predictor = EnsembleRollingWindowPredictor(models, calibrators,tokenizer_1)
88
+ return predictor.rolling_window_prediction(sequence, window_size)
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: amylodeep
3
+ Version: 0.1.0
4
+ Summary: Prediction of amyloid propensity from amino acid sequences using ensemble deep learning and LLM models
5
+ Author-email: Alisa Davtyan <alisadavtyan7@gmail.com>
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/AlisaDavtyan/protein_classification
8
+ Project-URL: Bug Tracker, https://github.com/AlisaDavtyan/protein_classification/issues
9
+ Keywords: bioinformatics,amyloid,deep learning,protein,sequence classification
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: torch>=1.12.0
23
+ Requires-Dist: transformers>=4.30.0
24
+ Requires-Dist: xgboost>=1.7.0
25
+ Requires-Dist: numpy>=1.20
26
+ Requires-Dist: pandas>=1.3
27
+ Requires-Dist: scikit-learn>=1.0
28
+ Requires-Dist: jax-unirep>=2.0.0
29
+ Requires-Dist: wandb>=0.14
30
+ Requires-Dist: toml>=0.10.2
31
+ Provides-Extra: ui
32
+ Requires-Dist: streamlit>=1.18; extra == "ui"
33
+ Requires-Dist: matplotlib>=3.5; extra == "ui"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=6.0; extra == "dev"
36
+ Requires-Dist: black>=22.0; extra == "dev"
37
+ Requires-Dist: flake8>=3.9; extra == "dev"
38
+ Dynamic: license-file
39
+
40
+ # AmyloDeep
41
+
42
+ **Prediction of amyloid propensity from amino acid sequences using deep learning**
43
+
44
+ AmyloDeep is a Python package that uses a 5-model ensemble to predict amyloidogenic regions in protein sequences using a rolling window approach. The package combines multiple state-of-the-art machine learning models including ESM2 transformers, UniRep embeddings, SVM, and XGBoost to provide accurate amyloid propensity predictions.
45
+
46
+ ## Features
47
+
48
+ - **Multi-model ensemble**: Combines 5 different models for robust predictions
49
+ - **Rolling window analysis**: Analyzes sequences using sliding windows of configurable size
50
+ - **Pre-trained models**: Uses models trained on amyloid sequence databases
51
+ - **Calibrated probabilities**: Includes probability calibration for better confidence estimates
52
+ - **Easy-to-use API**: Simple Python interface and command-line tool
53
+ - **Streamlit web interface**: Optional web interface for interactive predictions
54
+
55
+ ## Installation
56
+
57
+ ### From PyPI (recommended)
58
+
59
+ ```bash
60
+ pip install amylodeep
61
+ ```
62
+
63
+ ### From source
64
+
65
+ ```bash
66
+ git clone https://github.com/AlisaDavtyan/protein_classification.git
67
+ cd amylodeep
68
+ pip install -e .
69
+ ```
70
+
71
+
72
+
73
+ For development:
74
+ ```bash
75
+ pip install amylodeep[dev]
76
+ ```
77
+
78
+ ## Quick Start
79
+
80
+ ### Python API
81
+
82
+ ```python
83
+ from amylodeep import predict_ensemble_rolling
84
+
85
+ # Predict amyloid propensity for a protein sequence
86
+ sequence = "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
87
+ result = predict_ensemble_rolling(sequence, window_size=6)
88
+
89
+ print(f"Average probability: {result['avg_probability']:.4f}")
90
+ print(f"Maximum probability: {result['max_probability']:.4f}")
91
+
92
+ # Access position-wise probabilities
93
+ for position, probability in result['position_probs']:
94
+ print(f"Position {position}: {probability:.4f}")
95
+ ```
96
+
97
+ ### Command Line Interface
98
+
99
+ ```bash
100
+ # Basic prediction
101
+ amylodeep-cli "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
102
+
103
+ # With custom window size
104
+ amylodeep-cli "SEQUENCE" --window-size 10
105
+
106
+ # Save results to file
107
+ amylodeep-cli "SEQUENCE" --output results.json --format json
108
+
109
+ # CSV output
110
+ amylodeep-cli "SEQUENCE" --output results.csv --format csv
111
+ ```
112
+
113
+
114
+ ## Model Architecture
115
+
116
+ AmyloDeep uses an ensemble of 5 models:
117
+
118
+ 1. **ESM2-150M**: Fine-tuned ESM2 transformer (150M parameters)
119
+ 2. **UniRep**: UniRep-based neural network classifier
120
+ 3. **ESM2-650M**: Custom classifier using ESM2-650M embeddings
121
+ 4. **SVM**: Support Vector Machine with ESM2 embeddings
122
+ 5. **XGBoost**: Gradient boosting with ESM2 embeddings
123
+
124
+ The models are combined using probability averaging, with some models using probability calibration (Platt scaling or isotonic regression) for better confidence estimates.
125
+
126
+ ## Requirements
127
+
128
+ - Python >= 3.8
129
+ - PyTorch >= 1.12.0
130
+ - Transformers >= 4.30.0
131
+ - NumPy >= 1.20.0
132
+ - scikit-learn >= 1.0.0
133
+ - XGBoost >= 1.7.0
134
+ - jax-unirep >= 2.0.0
135
+ - wandb >= 0.14
136
+
137
+
138
+
139
+
140
+ ### Main Functions
141
+
142
+ #### `predict_ensemble_rolling(sequence, window_size=6)`
143
+
144
+ Predict amyloid propensity for a protein sequence using rolling window analysis.
145
+
146
+ **Parameters:**
147
+ - `sequence` (str): Protein sequence (amino acid letters)
148
+ - `window_size` (int): Size of the rolling window (default: 6)
149
+
150
+ **Returns:**
151
+ Dictionary containing:
152
+ - `position_probs`: List of (position, probability) tuples
153
+ - `avg_probability`: Average probability across all windows
154
+ - `max_probability`: Maximum probability across all windows
155
+ - `sequence_length`: Length of the input sequence
156
+ - `num_windows`: Number of windows analyzed
157
+
158
+
159
+ Individual model classes for ESM and UniRep-based predictions.
160
+
161
+ ## Contributing
162
+
163
+ We welcome contributions! Please see our contributing guidelines for more information.
164
+
165
+ ## License
166
+
167
+ This project is licensed under the MIT License - see the LICENSE file for details.
168
+
169
+ ## Citation
170
+
171
+ If you use AmyloDeep in your research, please cite:
172
+
173
+ ```bibtex
174
+ @software{amylodeep2025,
175
+ title={AmyloDeep: Prediction of amyloid propensity from amino acid sequences using deep learning},
176
+ author={Alisa Davtyan},
177
+ year={2025},
178
+ url={https://github.com/AlisaDavtyan/protein_classification}
179
+ }
180
+ ```
181
+
182
+ ## Support
183
+
184
+ For questions and support:
185
+ - Open an issue on GitHub
186
+ - Contact: alisadavtyan7@gmail.com
187
+
188
+ ## Changelog
189
+
190
+ ### v0.1.0
191
+ - Initial release
192
+ - 5-model ensemble implementation
193
+ - Rolling window prediction
194
+ - Command-line interface
195
+ - Python API
@@ -0,0 +1,16 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ amylodeep/__init__.py
5
+ amylodeep/cli.py
6
+ amylodeep/ensemble_predictor.py
7
+ amylodeep/esm_classifier.py
8
+ amylodeep/secret.toml
9
+ amylodeep/unirep_model.py
10
+ amylodeep/utils.py
11
+ amylodeep.egg-info/PKG-INFO
12
+ amylodeep.egg-info/SOURCES.txt
13
+ amylodeep.egg-info/dependency_links.txt
14
+ amylodeep.egg-info/entry_points.txt
15
+ amylodeep.egg-info/requires.txt
16
+ amylodeep.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ amylodeep-cli = amylodeep.cli:main
@@ -0,0 +1,18 @@
1
+ torch>=1.12.0
2
+ transformers>=4.30.0
3
+ xgboost>=1.7.0
4
+ numpy>=1.20
5
+ pandas>=1.3
6
+ scikit-learn>=1.0
7
+ jax-unirep>=2.0.0
8
+ wandb>=0.14
9
+ toml>=0.10.2
10
+
11
+ [dev]
12
+ pytest>=6.0
13
+ black>=22.0
14
+ flake8>=3.9
15
+
16
+ [ui]
17
+ streamlit>=1.18
18
+ matplotlib>=3.5
@@ -0,0 +1 @@
1
+ amylodeep
@@ -0,0 +1,64 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "amylodeep"
7
+ version = "0.1.0"
8
+ description = "Prediction of amyloid propensity from amino acid sequences using ensemble deep learning and LLM models"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {text = "MIT"}
12
+
13
+ authors = [
14
+ {name = "Alisa Davtyan", email = "alisadavtyan7@gmail.com"}
15
+ ]
16
+
17
+ keywords = ["bioinformatics", "amyloid", "deep learning", "protein", "sequence classification"]
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Intended Audience :: Science/Research",
21
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
22
+ "License :: OSI Approved :: MIT License",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.8",
25
+ "Programming Language :: Python :: 3.9",
26
+ "Programming Language :: Python :: 3.10",
27
+ "Programming Language :: Python :: 3.11",
28
+ ]
29
+
30
+ dependencies = [
31
+ "torch>=1.12.0",
32
+ "transformers>=4.30.0",
33
+ "xgboost>=1.7.0",
34
+ "numpy>=1.20",
35
+ "pandas>=1.3",
36
+ "scikit-learn>=1.0",
37
+ "jax-unirep>=2.0.0",
38
+ "wandb>=0.14",
39
+ "toml>=0.10.2"
40
+ ]
41
+
42
+ [project.optional-dependencies]
43
+ ui = [
44
+ "streamlit>=1.18",
45
+ "matplotlib>=3.5"
46
+ ]
47
+ dev = [
48
+ "pytest>=6.0",
49
+ "black>=22.0",
50
+ "flake8>=3.9"
51
+ ]
52
+
53
+ [project.scripts]
54
+ amylodeep-cli = "amylodeep.cli:main"
55
+
56
+ [project.urls]
57
+ Repository = "https://github.com/AlisaDavtyan/protein_classification"
58
+ "Bug Tracker" = "https://github.com/AlisaDavtyan/protein_classification/issues"
59
+
60
+ [tool.setuptools]
61
+ packages = ["amylodeep"]
62
+
63
+ [tool.setuptools.package-data]
64
+ amylodeep = ["*.toml", "config/*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+