amylodeep 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Alisa Davtyan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: amylodeep
3
+ Version: 0.1.0
4
+ Summary: Prediction of amyloid propensity from amino acid sequences using ensemble deep learning and LLM models
5
+ Author-email: Alisa Davtyan <alisadavtyan7@gmail.com>
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/AlisaDavtyan/protein_classification
8
+ Project-URL: Bug Tracker, https://github.com/AlisaDavtyan/protein_classification/issues
9
+ Keywords: bioinformatics,amyloid,deep learning,protein,sequence classification
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: torch>=1.12.0
23
+ Requires-Dist: transformers>=4.30.0
24
+ Requires-Dist: xgboost>=1.7.0
25
+ Requires-Dist: numpy>=1.20
26
+ Requires-Dist: pandas>=1.3
27
+ Requires-Dist: scikit-learn>=1.0
28
+ Requires-Dist: jax-unirep>=2.0.0
29
+ Requires-Dist: wandb>=0.14
30
+ Requires-Dist: toml>=0.10.2
31
+ Provides-Extra: ui
32
+ Requires-Dist: streamlit>=1.18; extra == "ui"
33
+ Requires-Dist: matplotlib>=3.5; extra == "ui"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=6.0; extra == "dev"
36
+ Requires-Dist: black>=22.0; extra == "dev"
37
+ Requires-Dist: flake8>=3.9; extra == "dev"
38
+ Dynamic: license-file
39
+
40
+ # AmyloDeep
41
+
42
+ **Prediction of amyloid propensity from amino acid sequences using deep learning**
43
+
44
+ AmyloDeep is a Python package that uses a 5-model ensemble to predict amyloidogenic regions in protein sequences using a rolling window approach. The package combines multiple state-of-the-art machine learning models including ESM2 transformers, UniRep embeddings, SVM, and XGBoost to provide accurate amyloid propensity predictions.
45
+
46
+ ## Features
47
+
48
+ - **Multi-model ensemble**: Combines 5 different models for robust predictions
49
+ - **Rolling window analysis**: Analyzes sequences using sliding windows of configurable size
50
+ - **Pre-trained models**: Uses models trained on amyloid sequence databases
51
+ - **Calibrated probabilities**: Includes probability calibration for better confidence estimates
52
+ - **Easy-to-use API**: Simple Python interface and command-line tool
53
+ - **Streamlit web interface**: Optional web interface for interactive predictions
54
+
55
+ ## Installation
56
+
57
+ ### From PyPI (recommended)
58
+
59
+ ```bash
60
+ pip install amylodeep
61
+ ```
62
+
63
+ ### From source
64
+
65
+ ```bash
66
+ git clone https://github.com/AlisaDavtyan/protein_classification.git
67
+ cd amylodeep
68
+ pip install -e .
69
+ ```
70
+
71
+
72
+
73
+ For development:
74
+ ```bash
75
+ pip install amylodeep[dev]
76
+ ```
77
+
78
+ ## Quick Start
79
+
80
+ ### Python API
81
+
82
+ ```python
83
+ from amylodeep import predict_ensemble_rolling
84
+
85
+ # Predict amyloid propensity for a protein sequence
86
+ sequence = "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
87
+ result = predict_ensemble_rolling(sequence, window_size=6)
88
+
89
+ print(f"Average probability: {result['avg_probability']:.4f}")
90
+ print(f"Maximum probability: {result['max_probability']:.4f}")
91
+
92
+ # Access position-wise probabilities
93
+ for position, probability in result['position_probs']:
94
+ print(f"Position {position}: {probability:.4f}")
95
+ ```
96
+
97
+ ### Command Line Interface
98
+
99
+ ```bash
100
+ # Basic prediction
101
+ amylodeep "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
102
+
103
+ # With custom window size
104
+ amylodeep "SEQUENCE" --window-size 10
105
+
106
+ # Save results to file
107
+ amylodeep "SEQUENCE" --output results.json --format json
108
+
109
+ # CSV output
110
+ amylodeep "SEQUENCE" --output results.csv --format csv
111
+ ```
112
+
113
+
114
+ ## Model Architecture
115
+
116
+ AmyloDeep uses an ensemble of 5 models:
117
+
118
+ 1. **ESM2-150M**: Fine-tuned ESM2 transformer (150M parameters)
119
+ 2. **UniRep**: UniRep-based neural network classifier
120
+ 3. **ESM2-650M**: Custom classifier using ESM2-650M embeddings
121
+ 4. **SVM**: Support Vector Machine with ESM2 embeddings
122
+ 5. **XGBoost**: Gradient boosting with ESM2 embeddings
123
+
124
+ The models are combined using probability averaging, with some models using probability calibration (Platt scaling or isotonic regression) for better confidence estimates.
125
+
126
+ ## Requirements
127
+
128
+ - Python >= 3.8
129
+ - PyTorch >= 1.9.0
130
+ - Transformers >= 4.15.0
131
+ - NumPy >= 1.20.0
132
+ - scikit-learn >= 1.0.0
133
+ - XGBoost >= 1.5.0
134
+ - jax-unirep >= 2.0.0
135
+ - wandb >= 0.12.0
136
+
137
+
138
+
139
+
140
+ ### Main Functions
141
+
142
+ #### `predict_ensemble_rolling(sequence, window_size=6)`
143
+
144
+ Predict amyloid propensity for a protein sequence using rolling window analysis.
145
+
146
+ **Parameters:**
147
+ - `sequence` (str): Protein sequence (amino acid letters)
148
+ - `window_size` (int): Size of the rolling window (default: 6)
149
+
150
+ **Returns:**
151
+ Dictionary containing:
152
+ - `position_probs`: List of (position, probability) tuples
153
+ - `avg_probability`: Average probability across all windows
154
+ - `max_probability`: Maximum probability across all windows
155
+ - `sequence_length`: Length of the input sequence
156
+ - `num_windows`: Number of windows analyzed
157
+
158
+
159
+ Individual model classes for ESM and UniRep-based predictions.
160
+
161
+ ## Contributing
162
+
163
+ We welcome contributions! Please see our contributing guidelines for more information.
164
+
165
+ ## License
166
+
167
+ This project is licensed under the MIT License - see the LICENSE file for details.
168
+
169
+ ## Citation
170
+
171
+ If you use AmyloDeep in your research, please cite:
172
+
173
+ ```bibtex
174
+ @software{amylodeep2025,
175
+ title={AmyloDeep: Prediction of amyloid propensity from amino acid sequences using deep learning},
176
+ author={Alisa Davtyan},
177
+ year={2025},
178
+ url={https://github.com/AlisaDavtyan/protein_classification}
179
+ }
180
+ ```
181
+
182
+ ## Support
183
+
184
+ For questions and support:
185
+ - Open an issue on GitHub
186
+ - Contact: alisadavtyan7@gmail.com
187
+
188
+ ## Changelog
189
+
190
+ ### v0.1.0
191
+ - Initial release
192
+ - 5-model ensemble implementation
193
+ - Rolling window prediction
194
+ - Command-line interface
195
+ - Python API
@@ -0,0 +1,156 @@
1
+ # AmyloDeep
2
+
3
+ **Prediction of amyloid propensity from amino acid sequences using deep learning**
4
+
5
+ AmyloDeep is a Python package that uses a 5-model ensemble to predict amyloidogenic regions in protein sequences using a rolling window approach. The package combines multiple state-of-the-art machine learning models including ESM2 transformers, UniRep embeddings, SVM, and XGBoost to provide accurate amyloid propensity predictions.
6
+
7
+ ## Features
8
+
9
+ - **Multi-model ensemble**: Combines 5 different models for robust predictions
10
+ - **Rolling window analysis**: Analyzes sequences using sliding windows of configurable size
11
+ - **Pre-trained models**: Uses models trained on amyloid sequence databases
12
+ - **Calibrated probabilities**: Includes probability calibration for better confidence estimates
13
+ - **Easy-to-use API**: Simple Python interface and command-line tool
14
+ - **Streamlit web interface**: Optional web interface for interactive predictions
15
+
16
+ ## Installation
17
+
18
+ ### From PyPI (recommended)
19
+
20
+ ```bash
21
+ pip install amylodeep
22
+ ```
23
+
24
+ ### From source
25
+
26
+ ```bash
27
+ git clone https://github.com/AlisaDavtyan/protein_classification.git
28
+ cd amylodeep
29
+ pip install -e .
30
+ ```
31
+
32
+
33
+
34
+ For development:
35
+ ```bash
36
+ pip install amylodeep[dev]
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ### Python API
42
+
43
+ ```python
44
+ from amylodeep import predict_ensemble_rolling
45
+
46
+ # Predict amyloid propensity for a protein sequence
47
+ sequence = "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
48
+ result = predict_ensemble_rolling(sequence, window_size=6)
49
+
50
+ print(f"Average probability: {result['avg_probability']:.4f}")
51
+ print(f"Maximum probability: {result['max_probability']:.4f}")
52
+
53
+ # Access position-wise probabilities
54
+ for position, probability in result['position_probs']:
55
+ print(f"Position {position}: {probability:.4f}")
56
+ ```
57
+
58
+ ### Command Line Interface
59
+
60
+ ```bash
61
+ # Basic prediction (reads sequences from a FASTA file, writes CSV)
62
+ amylodeep -i sequences.fasta -o results.csv
63
+
64
+ # With a custom rolling-window size
65
+ amylodeep -i sequences.fasta -o results.csv --window-size 10
66
+
67
+ # Short option flags are also accepted
68
+ amylodeep -i sequences.fasta -o results.csv -w 10
69
+
70
+ # The WANDB_API_KEY environment variable must be set beforehand
71
+ export WANDB_API_KEY="<your-key>"
72
+ ```
73
+
74
+
75
+ ## Model Architecture
76
+
77
+ AmyloDeep uses an ensemble of 5 models:
78
+
79
+ 1. **ESM2-150M**: Fine-tuned ESM2 transformer (150M parameters)
80
+ 2. **UniRep**: UniRep-based neural network classifier
81
+ 3. **ESM2-650M**: Custom classifier using ESM2-650M embeddings
82
+ 4. **SVM**: Support Vector Machine with ESM2 embeddings
83
+ 5. **XGBoost**: Gradient boosting with ESM2 embeddings
84
+
85
+ The models are combined using probability averaging, with some models using probability calibration (Platt scaling or isotonic regression) for better confidence estimates.
86
+
87
+ ## Requirements
88
+
89
+ - Python >= 3.8
90
+ - PyTorch >= 1.12.0
91
+ - Transformers >= 4.30.0
92
+ - NumPy >= 1.20.0
93
+ - scikit-learn >= 1.0.0
94
+ - XGBoost >= 1.7.0
95
+ - jax-unirep >= 2.0.0
96
+ - wandb >= 0.14.0
97
+
98
+
99
+
100
+
101
+ ### Main Functions
102
+
103
+ #### `predict_ensemble_rolling(sequence, window_size=6)`
104
+
105
+ Predict amyloid propensity for a protein sequence using rolling window analysis.
106
+
107
+ **Parameters:**
108
+ - `sequence` (str): Protein sequence (amino acid letters)
109
+ - `window_size` (int): Size of the rolling window (default: 6)
110
+
111
+ **Returns:**
112
+ Dictionary containing:
113
+ - `position_probs`: List of (position, probability) tuples
114
+ - `avg_probability`: Average probability across all windows
115
+ - `max_probability`: Maximum probability across all windows
116
+ - `sequence_length`: Length of the input sequence
117
+ - `num_windows`: Number of windows analyzed
118
+
119
+
120
+ Individual model classes for ESM and UniRep-based predictions.
121
+
122
+ ## Contributing
123
+
124
+ We welcome contributions! Please see our contributing guidelines for more information.
125
+
126
+ ## License
127
+
128
+ This project is licensed under the MIT License - see the LICENSE file for details.
129
+
130
+ ## Citation
131
+
132
+ If you use AmyloDeep in your research, please cite:
133
+
134
+ ```bibtex
135
+ @software{amylodeep2025,
136
+ title={AmyloDeep: Prediction of amyloid propensity from amino acid sequences using deep learning},
137
+ author={Alisa Davtyan},
138
+ year={2025},
139
+ url={https://github.com/AlisaDavtyan/protein_classification}
140
+ }
141
+ ```
142
+
143
+ ## Support
144
+
145
+ For questions and support:
146
+ - Open an issue on GitHub
147
+ - Contact: alisadavtyan7@gmail.com
148
+
149
+ ## Changelog
150
+
151
+ ### v0.1.0
152
+ - Initial release
153
+ - 5-model ensemble implementation
154
+ - Rolling window prediction
155
+ - Command-line interface
156
+ - Python API
@@ -0,0 +1,25 @@
1
+ """
2
+ AmyloDeep: Prediction of amyloid propensity from amino acid sequences using deep learning
3
+
4
+ This package provides an ensemble of machine learning models to predict
5
+ amyloidogenic regions in protein sequences using a rolling window approach.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "Alisa Davtyan"
10
+ __email__ = "alisadavtyan7@gmail.com"
11
+
12
+ from .utils import predict_ensemble_rolling, load_models_and_calibrators
13
+ from .ensemble_predictor import EnsembleRollingWindowPredictor
14
+ from .esm_classifier import ESMClassifier, ESMClassifierConfig
15
+ from .unirep_model import UniRepClassifier, UniRepClassifierConfig
16
+
17
+ __all__ = [
18
+ "predict_ensemble_rolling",
19
+ "load_models_and_calibrators",
20
+ "EnsembleRollingWindowPredictor",
21
+ "ESMClassifier",
22
+ "ESMClassifierConfig",
23
+ "UniRepClassifier",
24
+ "UniRepClassifierConfig",
25
+ ]
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env python
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+ from .utils import load_models_and_calibrators
7
+ from .ensemble_predictor import EnsembleRollingWindowPredictor
8
+
9
+ def parse_fasta(fasta_file):
10
+ """
11
+ Parse a FASTA file and return sequences with their IDs.
12
+ """
13
+ sequences = []
14
+ current_id = None
15
+ current_seq = ""
16
+
17
+ with open(fasta_file, 'r') as f:
18
+ for line in f:
19
+ line = line.strip()
20
+ if line.startswith('>'):
21
+ if current_id is not None:
22
+ sequences.append((current_id, current_seq.upper()))
23
+ current_id = line[1:] # Remove '>' character
24
+ current_seq = ""
25
+ else:
26
+ current_seq += line
27
+
28
+ # Final sequence
29
+ if current_id is not None:
30
+ sequences.append((current_id, current_seq.upper()))
31
+
32
+ return sequences
33
+
34
+ def main():
35
+ """
36
+ CLI entry point for AmyloDeeP predictions.
37
+ """
38
+ parser = argparse.ArgumentParser(
39
+ description="Run amyloid propensity predictions on a FASTA file."
40
+ )
41
+ parser.add_argument(
42
+ "-i", "--input",
43
+ required=True,
44
+ help="Path to input FASTA file containing amino acid sequences."
45
+ )
46
+ parser.add_argument(
47
+ "-o", "--output",
48
+ required=True,
49
+ help="Path to output CSV file for writing predictions."
50
+ )
51
+ parser.add_argument(
52
+ "-w", "--window-size",
53
+ type=int,
54
+ default=6,
55
+ help="Rolling window size (default: 6)"
56
+ )
57
+
58
+ args = parser.parse_args()
59
+
60
+ # Ensure WANDB_API_KEY is set
61
+ if not os.getenv("WANDB_API_KEY"):
62
+ print("Error: Environment variable WANDB_API_KEY must be set.", file=sys.stderr)
63
+ sys.exit(1)
64
+
65
+ # Check input file
66
+ if not os.path.exists(args.input):
67
+ print(f"Error: Input file '{args.input}' not found.", file=sys.stderr)
68
+ sys.exit(1)
69
+
70
+ # Load models, calibrators, and tokenizer
71
+ try:
72
+ print("Loading models and calibrators...", file=sys.stderr)
73
+ models, calibrators, tokenizer_1 = load_models_and_calibrators()
74
+ except Exception as e:
75
+ print(f"Error loading models: {e}", file=sys.stderr)
76
+ sys.exit(1)
77
+
78
+ # Initialize ensemble predictor
79
+ predictor = EnsembleRollingWindowPredictor(
80
+ models_dict=models,
81
+ calibrators_dict=calibrators,
82
+ tokenizer=tokenizer_1
83
+ )
84
+
85
+ # Parse FASTA
86
+ try:
87
+ print("Parsing FASTA file...", file=sys.stderr)
88
+ sequences = parse_fasta(args.input)
89
+ if not sequences:
90
+ print("Error: No sequences found in FASTA file.", file=sys.stderr)
91
+ sys.exit(1)
92
+ print(f"Found {len(sequences)} sequences.", file=sys.stderr)
93
+ except Exception as e:
94
+ print(f"Error parsing FASTA file: {e}", file=sys.stderr)
95
+ sys.exit(1)
96
+
97
+ # Run predictions
98
+ results = []
99
+ try:
100
+ for i, (seq_id, sequence) in enumerate(sequences, 1):
101
+ print(f"Processing sequence {i}/{len(sequences)}: {seq_id}", file=sys.stderr)
102
+
103
+ if not sequence.replace('X', '').isalpha():
104
+ print(f"Warning: Sequence {seq_id} contains invalid characters. Skipping.", file=sys.stderr)
105
+ continue
106
+
107
+ result = predictor.rolling_window_prediction(sequence, args.window_size)
108
+
109
+ for position, probability in result['position_probs']:
110
+ results.append({
111
+ 'sequence_id': seq_id,
112
+ 'position': position,
113
+ 'probability': probability,
114
+ 'sequence_length': result['sequence_length'],
115
+ 'avg_probability': result['avg_probability'],
116
+ 'max_probability': result['max_probability']
117
+ })
118
+
119
+ except Exception as e:
120
+ print(f"Error during prediction: {e}", file=sys.stderr)
121
+ sys.exit(1)
122
+
123
+ # Write CSV
124
+ try:
125
+ import pandas as pd
126
+ df = pd.DataFrame(results)
127
+ df.to_csv(args.output, index=False)
128
+ print(f"Predictions saved to {args.output}")
129
+ print(f"Total predictions: {len(results)} position-wise results from {len(sequences)} sequences")
130
+ except ImportError:
131
+ print("pandas is required to write CSV outputs. Install via 'pip install pandas'", file=sys.stderr)
132
+ sys.exit(1)
133
+ except Exception as e:
134
+ print(f"Error writing output file: {e}", file=sys.stderr)
135
+ sys.exit(1)
136
+
137
+ # Allow running as `python -m amylodeep.cli`
138
+ if __name__ == "__main__":
139
+ main()
@@ -0,0 +1,254 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import jax_unirep
6
+ import pickle
7
+
8
+
9
+ class EnsembleRollingWindowPredictor:
10
+ def __init__(self, models_dict, calibrators_dict=None, tokenizer=None):
11
+ """
12
+ Initialize the ensemble predictor with all 5 models and calibrators.
13
+
14
+ Args:
15
+ models_dict: Dictionary containing all 5 models with keys:
16
+ 'esm2_150M', 'unirep', 'esm2_650M', 'svm', 'xgboost'
17
+ calibrators_dict: Dictionary containing calibrators where applicable
18
+ """
19
+ self.models = models_dict
20
+ self.calibrators = calibrators_dict or {}
21
+ self.tokenizer_1 = tokenizer
22
+
23
+
24
+ self.tokenizer_esm = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
25
+ self.esm_model = AutoModel.from_pretrained("facebook/esm2_t33_650M_UR50D", add_pooling_layer=False)
26
+
27
+ # Freeze ESM model parameters
28
+ for param in self.esm_model.parameters():
29
+ param.requires_grad = False
30
+
31
+ self.esm_model.eval()
32
+
33
+ def _predict_model_1(self, sequences):
34
+ """ESM2 150M fine-tuned model prediction"""
35
+ def tokenize_function(sequences):
36
+ return self.tokenizer_1(sequences, padding="max_length", truncation=True, max_length=128)
37
+
38
+ encodings = tokenize_function(sequences)
39
+ input_ids = torch.tensor(encodings['input_ids'])
40
+ attention_mask = torch.tensor(encodings['attention_mask'])
41
+
42
+ with torch.no_grad():
43
+ outputs = self.models['esm2_150M'](input_ids=input_ids, attention_mask=attention_mask)
44
+ probs = F.softmax(outputs.logits, dim=1)[:, 1]
45
+
46
+ return probs.numpy()
47
+
48
+ def _predict_model_2(self, sequences):
49
+ """UniRep model prediction"""
50
+ def unirep_tokenize_function(sequences):
51
+ h_final, c_final, h_avg = jax_unirep.get_reps(sequences)
52
+ return {
53
+ "embeddings": h_final,
54
+ "avg_hidden": h_avg,
55
+ "cell_state": c_final
56
+ }
57
+
58
+ encodings = unirep_tokenize_function(sequences)
59
+ embeddings = torch.tensor(encodings["embeddings"], dtype=torch.float32)
60
+
61
+ with torch.no_grad():
62
+ outputs = self.models['unirep'](embeddings=embeddings)
63
+ probs = F.softmax(outputs['logits'], dim=1)[:, 1]
64
+
65
+ probs_np = probs.numpy()
66
+
67
+
68
+ if 'platt_unirep' in self.calibrators:
69
+ probs_np = self.calibrators['platt_unirep'].predict_proba(probs_np.reshape(-1, 1))[:, 1]
70
+
71
+ return probs_np
72
+
73
+ def _extract_mean_esm_embeddings(self, encodings, batch_size=8):
74
+ """Shared helper to extract mean-pooled ESM2-650M embeddings."""
75
+ embeddings = []
76
+ input_ids = encodings['input_ids']
77
+ attention_mask = encodings['attention_mask']
78
+ dataset_size = input_ids.size(0)
79
+
80
+ with torch.no_grad():
81
+ for i in range(0, dataset_size, batch_size):
82
+ batch_input_ids = input_ids[i:i+batch_size]
83
+ batch_attention_mask = attention_mask[i:i+batch_size]
84
+ outputs = self.esm_model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
85
+
86
+ sequence_output = outputs.last_hidden_state
87
+ mask_expanded = batch_attention_mask.unsqueeze(-1).expand(sequence_output.size()).float()
88
+ sum_embeddings = torch.sum(sequence_output * mask_expanded, 1)
89
+ sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
90
+ mean_embeddings = sum_embeddings / sum_mask
91
+
92
+ embeddings.append(mean_embeddings)
93
+
94
+ return torch.cat(embeddings, dim=0)
95
+
96
+ def _predict_model_3(self, sequences):
97
+ """ESM2 650M with custom classifier prediction"""
98
+ def tokenize_function(sequences):
99
+ return self.tokenizer_esm(sequences, padding="max_length", truncation=True,
100
+ max_length=128, return_tensors="pt")
101
+
102
+ def extract_esm_embeddings(encodings, batch_size=8):
103
+ embeddings = []
104
+ input_ids = encodings['input_ids']
105
+ attention_mask = encodings['attention_mask']
106
+ dataset_size = input_ids.size(0)
107
+
108
+ with torch.no_grad():
109
+ for i in range(0, dataset_size, batch_size):
110
+ batch_input_ids = input_ids[i:i+batch_size]
111
+ batch_attention_mask = attention_mask[i:i+batch_size]
112
+ outputs = self.esm_model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
113
+
114
+ sequence_output = outputs.last_hidden_state
115
+ mask_expanded = batch_attention_mask.unsqueeze(-1).expand(sequence_output.size()).float()
116
+ sum_embeddings = torch.sum(sequence_output * mask_expanded, 1)
117
+ sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
118
+ mean_embeddings = sum_embeddings / sum_mask
119
+
120
+ embeddings.append(mean_embeddings)
121
+
122
+ return torch.cat(embeddings, dim=0)
123
+
124
+ encodings = tokenize_function(sequences)
125
+ embeddings = extract_esm_embeddings(encodings)
126
+
127
+ with torch.no_grad():
128
+ outputs = self.models['esm2_650M'](embeddings=embeddings)
129
+ probs = F.softmax(outputs['logits'], dim=1)[:, 1]
130
+
131
+ probs_np = probs.numpy()
132
+
133
+
134
+ if 'isotonic_650M_NN' in self.calibrators:
135
+ probs_np = self.calibrators['isotonic_650M_NN'].predict(probs_np)
136
+
137
+ return probs_np
138
+
139
+ def _predict_model_4(self, sequences):
140
+ """SVM model prediction"""
141
+ X_features = self._extract_features_for_svm(sequences)
142
+
143
+ probs = self.models['svm'].predict_proba(X_features)[:, 1]
144
+ return probs
145
+
146
+ def _predict_model_5(self, sequences):
147
+ """XGBoost model prediction"""
148
+ X_features = self._extract_features_for_xgboost(sequences)
149
+
150
+ probs = self.models['xgboost'].predict_proba(X_features)[:, 1]
151
+
152
+
153
+ if 'isotonic_XGBoost' in self.calibrators:
154
+ probs = self.calibrators['isotonic_XGBoost'].predict(probs)
155
+
156
+ return probs
157
+
158
+ def _extract_features_for_svm(self, sequences):
159
+ """Extract ESM2-650M mean-pooled embeddings for SVM."""
160
+ tokenized = self.tokenizer_esm(
161
+ sequences,
162
+ padding="max_length",
163
+ truncation=True,
164
+ max_length=128,
165
+ return_tensors="pt"
166
+ )
167
+ with torch.no_grad():
168
+ embeddings = self._extract_mean_esm_embeddings(tokenized)
169
+ return embeddings.numpy()
170
+
171
+ def _extract_features_for_xgboost(self, sequences):
172
+ """Extract ESM2-650M mean-pooled embeddings for XGBoost."""
173
+ return self._extract_features_for_svm(sequences) # same as SVM
174
+
175
+
176
+ def predict_ensemble(self, sequences):
177
+ """
178
+ Predict ensemble probabilities for a list of sequences.
179
+
180
+ Args:
181
+ sequences: List of protein sequences
182
+
183
+ Returns:
184
+ numpy array of ensemble probabilities
185
+ """
186
+ # Get predictions from all models
187
+ probs_1 = self._predict_model_1(sequences) # ESM2 150M - NO calibration
188
+ probs_2 = self._predict_model_2(sequences) # UniRep - WITH calibration (platt_unirep)
189
+ probs_3 = self._predict_model_3(sequences) # ESM2 650M - WITH calibration (isotonic_650M_NN)
190
+ probs_4 = self._predict_model_4(sequences) # SVM - NO calibration
191
+ probs_5 = self._predict_model_5(sequences) # XGBoost - WITH calibration (isotonic_XGBoost)
192
+
193
+ # Combine probabilities (matching your original mixed_probs_list order)
194
+ mixed_probs_list = [probs_1, probs_2, probs_3, probs_4, probs_5]
195
+
196
+ # Compute average probabilities
197
+ avg_probs = np.mean(mixed_probs_list, axis=0)
198
+
199
+ return avg_probs
200
+
201
+ def rolling_window_prediction(self, sequence, window_size):
202
+ """
203
+ Predict amyloid probability for an entire sequence using rolling window approach.
204
+ The window slides one position at a time across the sequence.
205
+
206
+ Args:
207
+ sequence: Single protein sequence string
208
+ window_size: Size of the sliding window
209
+
210
+ Returns:
211
+ dict containing:
212
+ - 'position_probs': List of (position, probability) tuples
213
+ - 'avg_probability': Average probability across all windows
214
+ - 'max_probability': Maximum probability across all windows
215
+ - 'sequence_length': Length of the input sequence
216
+ """
217
+ sequence_length = len(sequence)
218
+
219
+ if sequence_length < window_size:
220
+ # If sequence is shorter than window, predict on the entire sequence
221
+ prob = self.predict_ensemble([sequence])[0]
222
+ return {
223
+ 'position_probs': [(0, prob)],
224
+ 'avg_probability': prob,
225
+ 'max_probability': prob,
226
+ 'sequence_length': sequence_length
227
+ }
228
+
229
+ # Generate windows - slide one position at a time
230
+ windows = []
231
+ positions = []
232
+
233
+ for i in range(sequence_length - window_size + 1):
234
+ window = sequence[i:i + window_size]
235
+ windows.append(window)
236
+ positions.append(i)
237
+
238
+ # Predict on all windows
239
+ window_probs = self.predict_ensemble(windows)
240
+
241
+ # Combine results
242
+ position_probs = list(zip(positions, window_probs))
243
+ avg_probability = np.mean(window_probs)
244
+ max_probability = np.max(window_probs)
245
+
246
+ return {
247
+ 'position_probs': position_probs,
248
+ 'avg_probability': avg_probability,
249
+ 'max_probability': max_probability,
250
+ 'sequence_length': sequence_length,
251
+ 'num_windows': len(windows)
252
+ }
253
+
254
+
@@ -0,0 +1,57 @@
1
+ # Define a custom configuration for ESM embeddings
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import PreTrainedModel, PretrainedConfig
5
+
6
+
7
+ class ESMClassifierConfig(PretrainedConfig):
8
+ model_type = "esm_classifier"
9
+
10
+ def __init__(
11
+ self,
12
+ input_dim=1280, # ESM2_t30_150M has 640 dim embeddings
13
+ hidden_dims=[2056, 1024, 512, 256, 128],
14
+ num_labels=2,
15
+ dropout=0.2,
16
+ **kwargs
17
+ ):
18
+ super().__init__(**kwargs)
19
+ self.input_dim = input_dim
20
+ self.hidden_dims = hidden_dims
21
+ self.num_labels = num_labels
22
+ self.dropout = dropout
23
+
24
+ # Define a custom model that works with the Trainer
25
+ class ESMClassifier(PreTrainedModel):
26
+ config_class = ESMClassifierConfig
27
+
28
+ def __init__(self, config):
29
+ super().__init__(config)
30
+
31
+ layers = []
32
+ dims = [config.input_dim] + config.hidden_dims
33
+
34
+ for i in range(len(dims) - 1):
35
+ layers.append(nn.Linear(dims[i], dims[i+1]))
36
+ layers.append(nn.ReLU())
37
+ layers.append(nn.Dropout(config.dropout))
38
+
39
+ self.feature_extractor = nn.Sequential(*layers)
40
+ self.classifier = nn.Linear(dims[-1], config.num_labels)
41
+
42
+ def forward(
43
+ self,
44
+ embeddings=None, # This will be your ESM embeddings
45
+ labels=None,
46
+ **kwargs
47
+ ):
48
+ # Process embeddings
49
+ features = self.feature_extractor(embeddings)
50
+ logits = self.classifier(features)
51
+
52
+ loss = None
53
+ if labels is not None:
54
+ loss_fct = nn.CrossEntropyLoss()
55
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
56
+ #
57
+ return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
@@ -0,0 +1 @@
1
+ WANDB_API_KEY = "REDACTED"  # SECURITY: a real API key was published in this release — revoke/rotate it and load the key from the environment, never from a committed file
@@ -0,0 +1,86 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.data import Dataset
4
+ from transformers import PreTrainedModel, PretrainedConfig
5
+ import jax_unirep
6
+
7
+ # --- UniRep Tokenizer ---
8
+ def unirep_tokenize_function(sequences):
9
+ """
10
+ Get UniRep embeddings for a list of protein sequences.
11
+ Returns a dictionary with embeddings compatible with PyTorch datasets.
12
+ """
13
+ h_final, c_final, h_avg = jax_unirep.get_reps(sequences)
14
+ return {
15
+ "embeddings": h_final,
16
+ "avg_hidden": h_avg,
17
+ "cell_state": c_final
18
+ }
19
+
20
+ # --- Custom Dataset using UniRep ---
21
+ class UniRepProteinDataset(Dataset):
22
+ def __init__(self, encodings, labels):
23
+ self.embeddings = torch.tensor(encodings["embeddings"], dtype=torch.float32)
24
+ self.avg_hidden = torch.tensor(encodings["avg_hidden"], dtype=torch.float32)
25
+ self.cell_state = torch.tensor(encodings["cell_state"], dtype=torch.float32)
26
+ self.labels = torch.tensor(labels, dtype=torch.long)
27
+
28
+ def __getitem__(self, idx):
29
+ return {
30
+ "embeddings": self.embeddings[idx],
31
+ "avg_hidden": self.avg_hidden[idx],
32
+ "cell_state": self.cell_state[idx],
33
+ "labels": self.labels[idx]
34
+ }
35
+
36
+ def __len__(self):
37
+ return len(self.labels)
38
+
39
+ # --- Custom Config for UniRepClassifier ---
40
+ class UniRepClassifierConfig(PretrainedConfig):
41
+ model_type = "unirep_classifier"
42
+
43
+ def __init__(
44
+ self,
45
+ input_dim=1900,
46
+ hidden_dims=[512, 128],
47
+ num_labels=2,
48
+ dropout=0.1,
49
+ **kwargs
50
+ ):
51
+ super().__init__(**kwargs)
52
+ self.input_dim = input_dim
53
+ self.hidden_dims = hidden_dims
54
+ self.num_labels = num_labels
55
+ self.dropout = dropout
56
+
57
+ # --- UniRep Classifier Model ---
58
+ class UniRepClassifier(PreTrainedModel):
59
+ config_class = UniRepClassifierConfig
60
+
61
+ def __init__(self, config):
62
+ super().__init__(config)
63
+
64
+ dims = [config.input_dim] + config.hidden_dims
65
+ layers = []
66
+
67
+ for i in range(len(dims) - 1):
68
+ layers.append(nn.Linear(dims[i], dims[i + 1]))
69
+ layers.append(nn.ReLU())
70
+ layers.append(nn.Dropout(config.dropout))
71
+
72
+ self.feature_extractor = nn.Sequential(*layers)
73
+ self.classifier = nn.Linear(dims[-1], config.num_labels)
74
+
75
+ def forward(self, embeddings=None, labels=None, **kwargs):
76
+ features = self.feature_extractor(embeddings)
77
+ logits = self.classifier(features)
78
+
79
+ loss = None
80
+ if labels is not None:
81
+ loss_fn = nn.CrossEntropyLoss()
82
+ loss = loss_fn(logits.view(-1, self.config.num_labels), labels.view(-1))
83
+
84
+ return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
85
+
86
+
@@ -0,0 +1,88 @@
1
+ from unirep_model import UniRepClassifier
2
+ from esm_classifier import ESMClassifier
3
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
4
+ import pickle
5
+ from ensemble_predictor import EnsembleRollingWindowPredictor
6
+ import xgboost as xgb
7
+ import wandb
8
+ import os
9
+ os.environ["WANDB_MODE"] = "disabled"
10
+
11
+ def load_models_and_calibrators():
12
+ """
13
+ Load models and calibrators
14
+ """
15
+ models = {}
16
+
17
+ #initialize wandb api
18
+ api = wandb.Api(api_key=os.environ["WANDB_API_KEY"])
19
+
20
+ # Model 1: ESM2 150M fine-tuned
21
+ artifact_1 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/final_esm2_150M_checkpoint_100_epochs:v0')
22
+ model_path_1 = artifact_1.download()
23
+ models['esm2_150M'] = AutoModelForSequenceClassification.from_pretrained(model_path_1)
24
+ tokenizer_1 = AutoTokenizer.from_pretrained(model_path_1)
25
+
26
+ # Model 2: UniRep classifier
27
+ artifact_2 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/final_UniRepClassifier_4_layers_50_epochs:v0')
28
+ model_path_2 = artifact_2.download()
29
+ models['unirep'] = UniRepClassifier.from_pretrained(model_path_2)
30
+
31
+ # Model 3: ESM2 650M classifier
32
+ artifact_3 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/final_ESMClassifier_650_layers_50_epochs:v0')
33
+ model_path_3 = artifact_3.download()
34
+ models['esm2_650M'] = ESMClassifier.from_pretrained(model_path_3)
35
+
36
+ # Model 4: SVM model
37
+ artifact_4 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/svm_model:v0')
38
+ model_path_4 = artifact_4.download()
39
+ model_path_4_join = os.path.join(model_path_4, "svm_model.pkl")
40
+
41
+ with open(model_path_4_join, "rb") as f:
42
+ models['svm'] = pickle.load(f)
43
+
44
+ # Model 5: XGBoost model
45
+ artifact_5 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/XGBoost:v0')
46
+ model_path_5 = artifact_5.download()
47
+ model_path_5_join = os.path.join(model_path_5, "xgb_model.json")
48
+ xgb_model = xgb.XGBClassifier()
49
+ xgb_model.load_model(model_path_5_join)
50
+ models['xgboost'] = xgb_model
51
+
52
+
53
+ # Calibrators
54
+ calibrators = {}
55
+
56
+ # platt_unirep
57
+ artifact_p1 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/platt_unirep:v0')
58
+ model_path_p1 = artifact_p1.download()
59
+ calibrator_path_p1 = os.path.join(model_path_p1, "platt_unirep.pkl")
60
+ with open(calibrator_path_p1, "rb") as f:
61
+ calibrators['platt_unirep'] = pickle.load(f)
62
+
63
+ # isotonic_650M_NN
64
+ artifact_p2 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/isotonic_650M_NN:v0')
65
+ model_path_p2 = artifact_p2.download()
66
+ calibrator_path_p2 = os.path.join(model_path_p2, "isotonic_650M_NN.pkl")
67
+ with open(calibrator_path_p2, "rb") as f:
68
+ calibrators['isotonic_650M_NN'] = pickle.load(f)
69
+
70
+ # isotonic_XGBoost
71
+ artifact_p3 = api.artifact('biophysarm-l-k-jordan-associates/amylodeep/isotonic_XGBoost:v0')
72
+ model_path_p3 = artifact_p3.download()
73
+ calibrator_path_p3 = os.path.join(model_path_p3, "isotonic_XGBoost.pkl")
74
+ with open(calibrator_path_p3, "rb") as f:
75
+ calibrators['isotonic_XGBoost'] = pickle.load(f)
76
+
77
+
78
+ return models, calibrators,tokenizer_1
79
+
80
+
81
+ def predict_ensemble_rolling(sequence: str, window_size: int = 6):
82
+ """
83
+ Run ensemble prediction with rolling window over a single sequence.
84
+ Returns dictionary with average/max probs and position-wise scores.
85
+ """
86
+ models, calibrators ,tokenizer_1 = load_models_and_calibrators()
87
+ predictor = EnsembleRollingWindowPredictor(models, calibrators,tokenizer_1)
88
+ return predictor.rolling_window_prediction(sequence, window_size)
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: amylodeep
3
+ Version: 0.1.0
4
+ Summary: Prediction of amyloid propensity from amino acid sequences using ensemble deep learning and LLM models
5
+ Author-email: Alisa Davtyan <alisadavtyan7@gmail.com>
6
+ License: MIT
7
+ Project-URL: Repository, https://github.com/AlisaDavtyan/protein_classification
8
+ Project-URL: Bug Tracker, https://github.com/AlisaDavtyan/protein_classification/issues
9
+ Keywords: bioinformatics,amyloid,deep learning,protein,sequence classification
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: torch>=1.12.0
23
+ Requires-Dist: transformers>=4.30.0
24
+ Requires-Dist: xgboost>=1.7.0
25
+ Requires-Dist: numpy>=1.20
26
+ Requires-Dist: pandas>=1.3
27
+ Requires-Dist: scikit-learn>=1.0
28
+ Requires-Dist: jax-unirep>=2.0.0
29
+ Requires-Dist: wandb>=0.14
30
+ Requires-Dist: toml>=0.10.2
31
+ Provides-Extra: ui
32
+ Requires-Dist: streamlit>=1.18; extra == "ui"
33
+ Requires-Dist: matplotlib>=3.5; extra == "ui"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=6.0; extra == "dev"
36
+ Requires-Dist: black>=22.0; extra == "dev"
37
+ Requires-Dist: flake8>=3.9; extra == "dev"
38
+ Dynamic: license-file
39
+
40
+ # AmyloDeep
41
+
42
+ **Prediction of amyloid propensity from amino acid sequences using deep learning**
43
+
44
+ AmyloDeep is a Python package that uses a 5-model ensemble to predict amyloidogenic regions in protein sequences using a rolling window approach. The package combines multiple state-of-the-art machine learning models including ESM2 transformers, UniRep embeddings, SVM, and XGBoost to provide accurate amyloid propensity predictions.
45
+
46
+ ## Features
47
+
48
+ - **Multi-model ensemble**: Combines 5 different models for robust predictions
49
+ - **Rolling window analysis**: Analyzes sequences using sliding windows of configurable size
50
+ - **Pre-trained models**: Uses models trained on amyloid sequence databases
51
+ - **Calibrated probabilities**: Includes probability calibration for better confidence estimates
52
+ - **Easy-to-use API**: Simple Python interface and command-line tool
53
+ - **Streamlit web interface**: Optional web interface for interactive predictions
54
+
55
+ ## Installation
56
+
57
+ ### From PyPI (recommended)
58
+
59
+ ```bash
60
+ pip install amylodeep
61
+ ```
62
+
63
+ ### From source
64
+
65
+ ```bash
66
+ git clone https://github.com/AlisaDavtyan/protein_classification.git
67
+ cd amylodeep
68
+ pip install -e .
69
+ ```
70
+
71
+
72
+
73
+ For development:
74
+ ```bash
75
+ pip install amylodeep[dev]
76
+ ```
77
+
78
+ ## Quick Start
79
+
80
+ ### Python API
81
+
82
+ ```python
83
+ from amylodeep import predict_ensemble_rolling
84
+
85
+ # Predict amyloid propensity for a protein sequence
86
+ sequence = "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
87
+ result = predict_ensemble_rolling(sequence, window_size=6)
88
+
89
+ print(f"Average probability: {result['avg_probability']:.4f}")
90
+ print(f"Maximum probability: {result['max_probability']:.4f}")
91
+
92
+ # Access position-wise probabilities
93
+ for position, probability in result['position_probs']:
94
+ print(f"Position {position}: {probability:.4f}")
95
+ ```
96
+
97
+ ### Command Line Interface
98
+
99
+ ```bash
100
+ # Basic prediction
101
+ amylodeep-cli "MKTFFFLLLLFTIGFCYVQFSKLKLENLHFKDNSEGLKNGGLQRQLGLTLKFNSNSLHHTSNL"
102
+
103
+ # With custom window size
104
+ amylodeep-cli "SEQUENCE" --window-size 10
105
+
106
+ # Save results to file
107
+ amylodeep-cli "SEQUENCE" --output results.json --format json
108
+
109
+ # CSV output
110
+ amylodeep-cli "SEQUENCE" --output results.csv --format csv
111
+ ```
112
+
113
+
114
+ ## Model Architecture
115
+
116
+ AmyloDeep uses an ensemble of 5 models:
117
+
118
+ 1. **ESM2-150M**: Fine-tuned ESM2 transformer (150M parameters)
119
+ 2. **UniRep**: UniRep-based neural network classifier
120
+ 3. **ESM2-650M**: Custom classifier using ESM2-650M embeddings
121
+ 4. **SVM**: Support Vector Machine with ESM2 embeddings
122
+ 5. **XGBoost**: Gradient boosting with ESM2 embeddings
123
+
124
+ The models are combined using probability averaging, with some models using probability calibration (Platt scaling or isotonic regression) for better confidence estimates.
125
+
126
+ ## Requirements
127
+
128
+ - Python >= 3.8
129
+ - PyTorch >= 1.12.0
130
+ - Transformers >= 4.30.0
131
+ - NumPy >= 1.20.0
132
+ - scikit-learn >= 1.0.0
133
+ - XGBoost >= 1.7.0
134
+ - jax-unirep >= 2.0.0
135
+ - wandb >= 0.14
136
+
137
+
138
+
139
+
140
+ ### Main Functions
141
+
142
+ #### `predict_ensemble_rolling(sequence, window_size=6)`
143
+
144
+ Predict amyloid propensity for a protein sequence using rolling window analysis.
145
+
146
+ **Parameters:**
147
+ - `sequence` (str): Protein sequence (amino acid letters)
148
+ - `window_size` (int): Size of the rolling window (default: 6)
149
+
150
+ **Returns:**
151
+ Dictionary containing:
152
+ - `position_probs`: List of (position, probability) tuples
153
+ - `avg_probability`: Average probability across all windows
154
+ - `max_probability`: Maximum probability across all windows
155
+ - `sequence_length`: Length of the input sequence
156
+ - `num_windows`: Number of windows analyzed
157
+
158
+
159
+ Individual model classes for ESM and UniRep-based predictions.
160
+
161
+ ## Contributing
162
+
163
+ We welcome contributions! Please see our contributing guidelines for more information.
164
+
165
+ ## License
166
+
167
+ This project is licensed under the MIT License - see the LICENSE file for details.
168
+
169
+ ## Citation
170
+
171
+ If you use AmyloDeep in your research, please cite:
172
+
173
+ ```bibtex
174
+ @software{amylodeep2025,
175
+ title={AmyloDeep: Prediction of amyloid propensity from amino acid sequences using deep learning},
176
+ author={Alisa Davtyan},
177
+ year={2025},
178
+ url={https://github.com/AlisaDavtyan/protein_classification}
179
+ }
180
+ ```
181
+
182
+ ## Support
183
+
184
+ For questions and support:
185
+ - Open an issue on GitHub
186
+ - Contact: alisadavtyan7@gmail.com
187
+
188
+ ## Changelog
189
+
190
+ ### v0.1.0
191
+ - Initial release
192
+ - 5-model ensemble implementation
193
+ - Rolling window prediction
194
+ - Command-line interface
195
+ - Python API
@@ -0,0 +1,16 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ amylodeep/__init__.py
5
+ amylodeep/cli.py
6
+ amylodeep/ensemble_predictor.py
7
+ amylodeep/esm_classifier.py
8
+ amylodeep/secret.toml
9
+ amylodeep/unirep_model.py
10
+ amylodeep/utils.py
11
+ amylodeep.egg-info/PKG-INFO
12
+ amylodeep.egg-info/SOURCES.txt
13
+ amylodeep.egg-info/dependency_links.txt
14
+ amylodeep.egg-info/entry_points.txt
15
+ amylodeep.egg-info/requires.txt
16
+ amylodeep.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ amylodeep-cli = amylodeep.cli:main
@@ -0,0 +1,18 @@
1
+ torch>=1.12.0
2
+ transformers>=4.30.0
3
+ xgboost>=1.7.0
4
+ numpy>=1.20
5
+ pandas>=1.3
6
+ scikit-learn>=1.0
7
+ jax-unirep>=2.0.0
8
+ wandb>=0.14
9
+ toml>=0.10.2
10
+
11
+ [dev]
12
+ pytest>=6.0
13
+ black>=22.0
14
+ flake8>=3.9
15
+
16
+ [ui]
17
+ streamlit>=1.18
18
+ matplotlib>=3.5
@@ -0,0 +1 @@
1
+ amylodeep
@@ -0,0 +1,64 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "amylodeep"
7
+ version = "0.1.0"
8
+ description = "Prediction of amyloid propensity from amino acid sequences using ensemble deep learning and LLM models"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {text = "MIT"}
12
+
13
+ authors = [
14
+ {name = "Alisa Davtyan", email = "alisadavtyan7@gmail.com"}
15
+ ]
16
+
17
+ keywords = ["bioinformatics", "amyloid", "deep learning", "protein", "sequence classification"]
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Intended Audience :: Science/Research",
21
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
22
+ "License :: OSI Approved :: MIT License",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.8",
25
+ "Programming Language :: Python :: 3.9",
26
+ "Programming Language :: Python :: 3.10",
27
+ "Programming Language :: Python :: 3.11",
28
+ ]
29
+
30
+ dependencies = [
31
+ "torch>=1.12.0",
32
+ "transformers>=4.30.0",
33
+ "xgboost>=1.7.0",
34
+ "numpy>=1.20",
35
+ "pandas>=1.3",
36
+ "scikit-learn>=1.0",
37
+ "jax-unirep>=2.0.0",
38
+ "wandb>=0.14",
39
+ "toml>=0.10.2"
40
+ ]
41
+
42
+ [project.optional-dependencies]
43
+ ui = [
44
+ "streamlit>=1.18",
45
+ "matplotlib>=3.5"
46
+ ]
47
+ dev = [
48
+ "pytest>=6.0",
49
+ "black>=22.0",
50
+ "flake8>=3.9"
51
+ ]
52
+
53
+ [project.scripts]
54
+ amylodeep-cli = "amylodeep.cli:main"
55
+
56
+ [project.urls]
57
+ Repository = "https://github.com/AlisaDavtyan/protein_classification"
58
+ "Bug Tracker" = "https://github.com/AlisaDavtyan/protein_classification/issues"
59
+
60
+ [tool.setuptools]
61
+ packages = ["amylodeep"]
62
+
63
+ [tool.setuptools.package-data]
64
+ amylodeep = ["*.toml", "config/*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+