jingwei 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jingwei-0.0.1/PKG-INFO +327 -0
- jingwei-0.0.1/README.md +298 -0
- jingwei-0.0.1/pyproject.toml +50 -0
- jingwei-0.0.1/setup.cfg +4 -0
- jingwei-0.0.1/src/jingwei/__init__.py +21 -0
- jingwei-0.0.1/src/jingwei/datasets.py +84 -0
- jingwei-0.0.1/src/jingwei/dcae.py +100 -0
- jingwei-0.0.1/src/jingwei/dmf.py +105 -0
- jingwei-0.0.1/src/jingwei/models.py +367 -0
- jingwei-0.0.1/src/jingwei/train.py +296 -0
- jingwei-0.0.1/src/jingwei.egg-info/PKG-INFO +327 -0
- jingwei-0.0.1/src/jingwei.egg-info/SOURCES.txt +13 -0
- jingwei-0.0.1/src/jingwei.egg-info/dependency_links.txt +1 -0
- jingwei-0.0.1/src/jingwei.egg-info/requires.txt +13 -0
- jingwei-0.0.1/src/jingwei.egg-info/top_level.txt +1 -0
jingwei-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jingwei
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Proteomic data imputation framework with DMF and DCAE methods.
|
|
5
|
+
Author: JINGWEI Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: imputation,proteomics,DMF,DCAE,deep-learning
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: torch>=2.0.0
|
|
18
|
+
Requires-Dist: pytorch-lightning>=2.0.0
|
|
19
|
+
Requires-Dist: numpy>=1.24.0
|
|
20
|
+
Requires-Dist: pandas>=2.0.0
|
|
21
|
+
Requires-Dist: tensorboard>=2.13.0
|
|
22
|
+
Requires-Dist: scipy>=1.10.0
|
|
23
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
27
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: flake8>=6.0.0; extra == "dev"
|
|
29
|
+
|
|
30
|
+
# JINGWEI - Proteomic Data Imputation Framework
|
|
31
|
+
|
|
32
|
+
JINGWEI is a deep learning framework for missing proteomic data imputation, supporting both **DMF (Deep Matrix Factorization)** and **DCAE (Dilated Convolutional AutoEncoder)** methods.
|
|
33
|
+
|
|
34
|
+
## Features
|
|
35
|
+
|
|
36
|
+
- **Multiple Imputation Methods**: Support for DMF and DCAE algorithms
|
|
37
|
+
- **Flexible Architecture**: Configurable network architectures and hyperparameters
|
|
38
|
+
- **GPU Acceleration**: CUDA support with specific GPU selection
|
|
39
|
+
- **Comprehensive Logging**: TensorBoard integration for training monitoring
|
|
40
|
+
- **Early Stopping**: Prevent overfitting with configurable patience
|
|
41
|
+
- **Batch Processing**: Efficient batch training with customizable batch sizes
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
### Requirements
|
|
46
|
+
|
|
47
|
+
- Python 3.12+
|
|
48
|
+
- CUDA-capable GPU (optional, but recommended)
|
|
49
|
+
We recommend using conda to manage the environment:
|
|
50
|
+
```bash
|
|
51
|
+
conda create -n jingwei python=3.12
|
|
52
|
+
conda activate jingwei
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Dependencies
|
|
56
|
+
|
|
57
|
+
Install the package in editable mode (recommended for development):
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install -e .
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
To install only the runtime dependencies, without an editable install:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install -r requirements.txt
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Optional dev dependencies:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install -e .[dev]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Usage
|
|
76
|
+
|
|
77
|
+
### Quick Start
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Basic usage with DMF method
|
|
81
|
+
python -m jingwei.train --data-path data/your_dataset.csv
|
|
82
|
+
|
|
83
|
+
# Use DCAE method with GPU 1
|
|
84
|
+
python -m jingwei.train --data-path data/Alzheimer.csv --method DCAE --device cuda --gpu-id 1
|
|
85
|
+
|
|
86
|
+
# Custom parameters with early stopping
|
|
87
|
+
python -m jingwei.train --data-path data/your_dataset.csv \
|
|
88
|
+
--method DMF \
|
|
89
|
+
--hidden-dims 512 256 128 \
|
|
90
|
+
--embedding-dim 128 \
|
|
91
|
+
--early-stopping \
|
|
92
|
+
--max-epochs 100
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Python API Example
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import pytorch_lightning as pl
|
|
99
|
+
|
|
100
|
+
from jingwei import CSVDataset, DMFImputer, DCAEImputer
|
|
101
|
+
|
|
102
|
+
dataset = CSVDataset("data/Alzheimer.csv")
|
|
103
|
+
|
|
104
|
+
# Choose one of the two methods
|
|
105
|
+
model = DMFImputer(
|
|
106
|
+
full_data_tensor=dataset.data_normalized,
|
|
107
|
+
full_mask_tensor=dataset.mask,
|
|
108
|
+
embedding_dim=64,
|
|
109
|
+
hidden_dims=[256, 128],
|
|
110
|
+
batch_size=1024,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
# Or use DCAE
|
|
114
|
+
# model = DCAEImputer(
|
|
115
|
+
# full_data_tensor=dataset.data_normalized,
|
|
116
|
+
# full_mask_tensor=dataset.mask,
|
|
117
|
+
# ae_dim=64,
|
|
118
|
+
# batch_size=1024,
|
|
119
|
+
# )
|
|
120
|
+
|
|
121
|
+
trainer = pl.Trainer(max_epochs=5)
|
|
122
|
+
trainer.fit(model)
|
|
123
|
+
|
|
124
|
+
imputed_normalized = model.get_imputed_data()
|
|
125
|
+
imputed = dataset.inverse_transform(imputed_normalized)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Available Parameters
|
|
129
|
+
|
|
130
|
+
#### Required Arguments
|
|
131
|
+
- `--data-path PATH`: Path to input CSV file
|
|
132
|
+
|
|
133
|
+
#### Method Selection
|
|
134
|
+
- `--method {DMF,DCAE}`: Imputation method (default: DMF)
|
|
135
|
+
|
|
136
|
+
#### General Network Parameters
|
|
137
|
+
- `--hidden-dims DIMS`: Hidden layer dimensions, space-separated (default: "256 128")
|
|
138
|
+
- `--batch-size SIZE`: Batch size for training (default: 1024)
|
|
139
|
+
- `--learning-rate RATE`: Learning rate (default: 0.001)
|
|
140
|
+
- `--weight-decay DECAY`: Weight decay for optimizer (default: 0.00001)
|
|
141
|
+
- `--gradient-clip VALUE`: Gradient clipping value (default: 1.0)
|
|
142
|
+
|
|
143
|
+
#### DMF Specific Parameters
|
|
144
|
+
- `--embedding-dim DIM`: Embedding dimension (default: 64)
|
|
145
|
+
|
|
146
|
+
#### DCAE Specific Parameters
|
|
147
|
+
- `--latent-dim DIM`: Latent dimension (default: 64)
|
|
148
|
+
- `--num-encoder-blocks NUM`: Number of encoder blocks (default: 2)
|
|
149
|
+
- `--num-decoder-blocks NUM`: Number of decoder blocks (default: 2)
|
|
150
|
+
- `--dilation VALUE`: Dilation factor (default: 2)
|
|
151
|
+
|
|
152
|
+
#### Loss Weights
|
|
153
|
+
- `--mask-weight WEIGHT`: Weight for mask prediction loss (default: 0.5)
|
|
154
|
+
- `--reconstruction-weight WEIGHT`: Weight for reconstruction loss (default: 1.0)
|
|
155
|
+
|
|
156
|
+
#### Training Control
|
|
157
|
+
- `--max-epochs EPOCHS`: Maximum training epochs (default: 200)
|
|
158
|
+
- `--early-stopping`: Enable early stopping
|
|
159
|
+
- `--patience PATIENCE`: Patience for early stopping (default: 20)
|
|
160
|
+
|
|
161
|
+
#### Device Settings
|
|
162
|
+
- `--device {cpu,cuda,auto}`: Device to use (default: auto)
|
|
163
|
+
- `--gpu-id ID`: Specific GPU ID to use (0, 1, etc.)
|
|
164
|
+
|
|
165
|
+
#### Output Settings
|
|
166
|
+
- `--results-dir DIR`: Directory for saving results (default: ./results)
|
|
167
|
+
- `--log-interval INTERVAL`: Logging interval in steps (default: 50)
|
|
168
|
+
- `--progress-bar`: Show progress bar during training
|
|
169
|
+
|
|
170
|
+
## Data Format
|
|
171
|
+
|
|
172
|
+
The input CSV file should have the following format:
|
|
173
|
+
- First row: Header (will be skipped)
|
|
174
|
+
- First column: Sample IDs/names (will be skipped)
|
|
175
|
+
- Remaining columns: Protein expression data
|
|
176
|
+
- Missing values: Use 0, negative values, or NaN
|
|
177
|
+
|
|
178
|
+
Example:
|
|
179
|
+
```csv
|
|
180
|
+
Sample_ID,Protein_1,Protein_2,Protein_3,...
|
|
181
|
+
Sample_001,1.23,0.45,NaN,...
|
|
182
|
+
Sample_002,2.34,0,1.67,...
|
|
183
|
+
Sample_003,1.45,1.23,2.89,...
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Output Files
|
|
187
|
+
|
|
188
|
+
JINGWEI generates the following outputs in the results directory:
|
|
189
|
+
|
|
190
|
+
```
|
|
191
|
+
results/
|
|
192
|
+
├── checkpoints/ # Model checkpoints
|
|
193
|
+
├── logs/ # TensorBoard logs
|
|
194
|
+
└── outputs/
|
|
195
|
+
└── {METHOD}_{DATASET}_{TIMESTAMP}/
|
|
196
|
+
├── config.json # Training configuration
|
|
197
|
+
├── imputed_data.csv # Imputed protein data
|
|
198
|
+
├── training_metrics.csv # Training loss history
|
|
199
|
+
└── model_final.ckpt # Final trained model
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## Examples
|
|
203
|
+
|
|
204
|
+
### Example 1: DMF with Custom Architecture
|
|
205
|
+
```bash
|
|
206
|
+
./src/JINGWEI.sh --data-path data/Alzheimer.csv \
|
|
207
|
+
--method DMF \
|
|
208
|
+
--hidden-dims 512 256 128 64 \
|
|
209
|
+
--embedding-dim 128 \
|
|
210
|
+
--mask-weight 0.3 \
|
|
211
|
+
--learning-rate 0.0005 \
|
|
212
|
+
--max-epochs 150 \
|
|
213
|
+
--early-stopping \
|
|
214
|
+
--progress-bar
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Example 2: DCAE with GPU Acceleration
|
|
218
|
+
```bash
|
|
219
|
+
./src/JINGWEI.sh --data-path data/Alzheimer.csv \
|
|
220
|
+
--method DCAE \
|
|
221
|
+
--device cuda \
|
|
222
|
+
--gpu-id 1 \
|
|
223
|
+
--latent-dim 128 \
|
|
224
|
+
--num-encoder-blocks 3 \
|
|
225
|
+
--num-decoder-blocks 3 \
|
|
226
|
+
--dilation 4 \
|
|
227
|
+
--batch-size 512
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Example 3: CPU Training with Custom Output Directory
|
|
231
|
+
```bash
|
|
232
|
+
./src/JINGWEI.sh --data-path data/Alzheimer.csv \
|
|
233
|
+
--device cpu \
|
|
234
|
+
--results-dir ./my_results \
|
|
235
|
+
--max-epochs 50 \
|
|
236
|
+
--log-interval 10
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## Method Descriptions
|
|
240
|
+
|
|
241
|
+
### DMF (Deep Matrix Factorization)
|
|
242
|
+
- Uses row and column embeddings to capture latent patterns
|
|
243
|
+
- Suitable for collaborative filtering-style missing data
|
|
244
|
+
- Good for datasets with structured missing patterns
|
|
245
|
+
|
|
246
|
+
### DCAE (Dilated Convolutional AutoEncoder)
|
|
247
|
+
- Uses dilated convolutions to capture long-range dependencies
|
|
248
|
+
- Suitable for sequential or structured protein data
|
|
249
|
+
- Better for complex missing data patterns
|
|
250
|
+
|
|
251
|
+
## Monitoring Training
|
|
252
|
+
|
|
253
|
+
### TensorBoard
|
|
254
|
+
```bash
|
|
255
|
+
tensorboard --logdir results/logs
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### Training Metrics
|
|
259
|
+
Monitor the following metrics:
|
|
260
|
+
- `train_loss`: Overall training loss
|
|
261
|
+
- `reconstruction_loss`: Data reconstruction quality
|
|
262
|
+
- `mask_loss`: Loss of the missing-data pattern (mask) prediction
|
|
263
|
+
|
|
264
|
+
## Troubleshooting
|
|
265
|
+
|
|
266
|
+
### Common Issues
|
|
267
|
+
|
|
268
|
+
1. **CUDA Out of Memory**
|
|
269
|
+
- Reduce `--batch-size`
|
|
270
|
+
- Use `--device cpu` for CPU training
|
|
271
|
+
|
|
272
|
+
2. **Shape Mismatch Errors**
|
|
273
|
+
- Check CSV format (the first column must contain sample IDs, because it is skipped)
|
|
274
|
+
- Verify data contains only numeric values
|
|
275
|
+
|
|
276
|
+
3. **Slow Training**
|
|
277
|
+
- Use GPU acceleration with `--device cuda`
|
|
278
|
+
- Increase `--batch-size` if memory allows
|
|
279
|
+
|
|
280
|
+
4. **Poor Performance**
|
|
281
|
+
- Adjust `--mask-weight` (try 0.1-0.8)
|
|
282
|
+
- Experiment with different `--hidden-dims`
|
|
283
|
+
- Enable `--early-stopping`
|
|
284
|
+
|
|
285
|
+
### Getting Help
|
|
286
|
+
|
|
287
|
+
For help with parameters:
|
|
288
|
+
```bash
|
|
289
|
+
python -m jingwei.train --help
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## File Structure
|
|
293
|
+
|
|
294
|
+
```
|
|
295
|
+
JINGWEI/
|
|
296
|
+
├── README.md
|
|
297
|
+
├── pyproject.toml
|
|
298
|
+
├── requirements.txt
|
|
299
|
+
├── src/
|
|
300
|
+
│ ├── JINGWEI.sh # Legacy training script (optional)
|
|
301
|
+
│ ├── train.py # Thin wrapper for package entry
|
|
302
|
+
│ ├── jingwei/
|
|
303
|
+
│ │ ├── __init__.py
|
|
304
|
+
│ │ ├── train.py # Package training entry
|
|
305
|
+
│ │ ├── datasets.py # Data loading utilities
|
|
306
|
+
│ │ ├── models.py # Model architectures (DMF/DCAE)
|
|
307
|
+
│ │ ├── dmf.py # DMF trainer
|
|
308
|
+
│ │ └── dcae.py # DCAE trainer
|
|
309
|
+
│ └── methods/ # Other baselines (not packaged)
|
|
310
|
+
└── data/
|
|
311
|
+
└── your_dataset.csv
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
## License
|
|
316
|
+
|
|
317
|
+
This project is licensed under the MIT License.
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
## Changelog
|
|
321
|
+
|
|
322
|
+
### Version 0.0.1
|
|
323
|
+
- Initial release
|
|
324
|
+
- Support for DMF and DCAE methods
|
|
325
|
+
- GPU acceleration
|
|
326
|
+
- Comprehensive parameter configuration
|
|
327
|
+
- TensorBoard integration
|
jingwei-0.0.1/README.md
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
# JINGWEI - Proteomic Data Imputation Framework
|
|
2
|
+
|
|
3
|
+
JINGWEI is a deep learning framework for missing proteomic data imputation, supporting both **DMF (Deep Matrix Factorization)** and **DCAE (Dilated Convolutional AutoEncoder)** methods.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Multiple Imputation Methods**: Support for DMF and DCAE algorithms
|
|
8
|
+
- **Flexible Architecture**: Configurable network architectures and hyperparameters
|
|
9
|
+
- **GPU Acceleration**: CUDA support with specific GPU selection
|
|
10
|
+
- **Comprehensive Logging**: TensorBoard integration for training monitoring
|
|
11
|
+
- **Early Stopping**: Prevent overfitting with configurable patience
|
|
12
|
+
- **Batch Processing**: Efficient batch training with customizable batch sizes
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
### Requirements
|
|
17
|
+
|
|
18
|
+
- Python 3.12+
|
|
19
|
+
- CUDA-capable GPU (optional, but recommended)
|
|
20
|
+
We recommend using conda to manage the environment:
|
|
21
|
+
```bash
|
|
22
|
+
conda create -n jingwei python=3.12
|
|
23
|
+
conda activate jingwei
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Dependencies
|
|
27
|
+
|
|
28
|
+
Install the package in editable mode (recommended for development):
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install -e .
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
To install only the runtime dependencies, without an editable install:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install -r requirements.txt
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Optional dev dependencies:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e .[dev]
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
### Quick Start
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Basic usage with DMF method
|
|
52
|
+
python -m jingwei.train --data-path data/your_dataset.csv
|
|
53
|
+
|
|
54
|
+
# Use DCAE method with GPU 1
|
|
55
|
+
python -m jingwei.train --data-path data/Alzheimer.csv --method DCAE --device cuda --gpu-id 1
|
|
56
|
+
|
|
57
|
+
# Custom parameters with early stopping
|
|
58
|
+
python -m jingwei.train --data-path data/your_dataset.csv \
|
|
59
|
+
--method DMF \
|
|
60
|
+
--hidden-dims 512 256 128 \
|
|
61
|
+
--embedding-dim 128 \
|
|
62
|
+
--early-stopping \
|
|
63
|
+
--max-epochs 100
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Python API Example
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import pytorch_lightning as pl
|
|
70
|
+
|
|
71
|
+
from jingwei import CSVDataset, DMFImputer, DCAEImputer
|
|
72
|
+
|
|
73
|
+
dataset = CSVDataset("data/Alzheimer.csv")
|
|
74
|
+
|
|
75
|
+
# Choose one of the two methods
|
|
76
|
+
model = DMFImputer(
|
|
77
|
+
full_data_tensor=dataset.data_normalized,
|
|
78
|
+
full_mask_tensor=dataset.mask,
|
|
79
|
+
embedding_dim=64,
|
|
80
|
+
hidden_dims=[256, 128],
|
|
81
|
+
batch_size=1024,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Or use DCAE
|
|
85
|
+
# model = DCAEImputer(
|
|
86
|
+
# full_data_tensor=dataset.data_normalized,
|
|
87
|
+
# full_mask_tensor=dataset.mask,
|
|
88
|
+
# ae_dim=64,
|
|
89
|
+
# batch_size=1024,
|
|
90
|
+
# )
|
|
91
|
+
|
|
92
|
+
trainer = pl.Trainer(max_epochs=5)
|
|
93
|
+
trainer.fit(model)
|
|
94
|
+
|
|
95
|
+
imputed_normalized = model.get_imputed_data()
|
|
96
|
+
imputed = dataset.inverse_transform(imputed_normalized)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Available Parameters
|
|
100
|
+
|
|
101
|
+
#### Required Arguments
|
|
102
|
+
- `--data-path PATH`: Path to input CSV file
|
|
103
|
+
|
|
104
|
+
#### Method Selection
|
|
105
|
+
- `--method {DMF,DCAE}`: Imputation method (default: DMF)
|
|
106
|
+
|
|
107
|
+
#### General Network Parameters
|
|
108
|
+
- `--hidden-dims DIMS`: Hidden layer dimensions, space-separated (default: "256 128")
|
|
109
|
+
- `--batch-size SIZE`: Batch size for training (default: 1024)
|
|
110
|
+
- `--learning-rate RATE`: Learning rate (default: 0.001)
|
|
111
|
+
- `--weight-decay DECAY`: Weight decay for optimizer (default: 0.00001)
|
|
112
|
+
- `--gradient-clip VALUE`: Gradient clipping value (default: 1.0)
|
|
113
|
+
|
|
114
|
+
#### DMF Specific Parameters
|
|
115
|
+
- `--embedding-dim DIM`: Embedding dimension (default: 64)
|
|
116
|
+
|
|
117
|
+
#### DCAE Specific Parameters
|
|
118
|
+
- `--latent-dim DIM`: Latent dimension (default: 64)
|
|
119
|
+
- `--num-encoder-blocks NUM`: Number of encoder blocks (default: 2)
|
|
120
|
+
- `--num-decoder-blocks NUM`: Number of decoder blocks (default: 2)
|
|
121
|
+
- `--dilation VALUE`: Dilation factor (default: 2)
|
|
122
|
+
|
|
123
|
+
#### Loss Weights
|
|
124
|
+
- `--mask-weight WEIGHT`: Weight for mask prediction loss (default: 0.5)
|
|
125
|
+
- `--reconstruction-weight WEIGHT`: Weight for reconstruction loss (default: 1.0)
|
|
126
|
+
|
|
127
|
+
#### Training Control
|
|
128
|
+
- `--max-epochs EPOCHS`: Maximum training epochs (default: 200)
|
|
129
|
+
- `--early-stopping`: Enable early stopping
|
|
130
|
+
- `--patience PATIENCE`: Patience for early stopping (default: 20)
|
|
131
|
+
|
|
132
|
+
#### Device Settings
|
|
133
|
+
- `--device {cpu,cuda,auto}`: Device to use (default: auto)
|
|
134
|
+
- `--gpu-id ID`: Specific GPU ID to use (0, 1, etc.)
|
|
135
|
+
|
|
136
|
+
#### Output Settings
|
|
137
|
+
- `--results-dir DIR`: Directory for saving results (default: ./results)
|
|
138
|
+
- `--log-interval INTERVAL`: Logging interval in steps (default: 50)
|
|
139
|
+
- `--progress-bar`: Show progress bar during training
|
|
140
|
+
|
|
141
|
+
## Data Format
|
|
142
|
+
|
|
143
|
+
The input CSV file should have the following format:
|
|
144
|
+
- First row: Header (will be skipped)
|
|
145
|
+
- First column: Sample IDs/names (will be skipped)
|
|
146
|
+
- Remaining columns: Protein expression data
|
|
147
|
+
- Missing values: Use 0, negative values, or NaN
|
|
148
|
+
|
|
149
|
+
Example:
|
|
150
|
+
```csv
|
|
151
|
+
Sample_ID,Protein_1,Protein_2,Protein_3,...
|
|
152
|
+
Sample_001,1.23,0.45,NaN,...
|
|
153
|
+
Sample_002,2.34,0,1.67,...
|
|
154
|
+
Sample_003,1.45,1.23,2.89,...
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Output Files
|
|
158
|
+
|
|
159
|
+
JINGWEI generates the following outputs in the results directory:
|
|
160
|
+
|
|
161
|
+
```
|
|
162
|
+
results/
|
|
163
|
+
├── checkpoints/ # Model checkpoints
|
|
164
|
+
├── logs/ # TensorBoard logs
|
|
165
|
+
└── outputs/
|
|
166
|
+
└── {METHOD}_{DATASET}_{TIMESTAMP}/
|
|
167
|
+
├── config.json # Training configuration
|
|
168
|
+
├── imputed_data.csv # Imputed protein data
|
|
169
|
+
├── training_metrics.csv # Training loss history
|
|
170
|
+
└── model_final.ckpt # Final trained model
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Examples
|
|
174
|
+
|
|
175
|
+
### Example 1: DMF with Custom Architecture
|
|
176
|
+
```bash
|
|
177
|
+
./src/JINGWEI.sh --data-path data/Alzheimer.csv \
|
|
178
|
+
--method DMF \
|
|
179
|
+
--hidden-dims 512 256 128 64 \
|
|
180
|
+
--embedding-dim 128 \
|
|
181
|
+
--mask-weight 0.3 \
|
|
182
|
+
--learning-rate 0.0005 \
|
|
183
|
+
--max-epochs 150 \
|
|
184
|
+
--early-stopping \
|
|
185
|
+
--progress-bar
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Example 2: DCAE with GPU Acceleration
|
|
189
|
+
```bash
|
|
190
|
+
./src/JINGWEI.sh --data-path data/Alzheimer.csv \
|
|
191
|
+
--method DCAE \
|
|
192
|
+
--device cuda \
|
|
193
|
+
--gpu-id 1 \
|
|
194
|
+
--latent-dim 128 \
|
|
195
|
+
--num-encoder-blocks 3 \
|
|
196
|
+
--num-decoder-blocks 3 \
|
|
197
|
+
--dilation 4 \
|
|
198
|
+
--batch-size 512
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Example 3: CPU Training with Custom Output Directory
|
|
202
|
+
```bash
|
|
203
|
+
./src/JINGWEI.sh --data-path data/Alzheimer.csv \
|
|
204
|
+
--device cpu \
|
|
205
|
+
--results-dir ./my_results \
|
|
206
|
+
--max-epochs 50 \
|
|
207
|
+
--log-interval 10
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Method Descriptions
|
|
211
|
+
|
|
212
|
+
### DMF (Deep Matrix Factorization)
|
|
213
|
+
- Uses row and column embeddings to capture latent patterns
|
|
214
|
+
- Suitable for collaborative filtering-style missing data
|
|
215
|
+
- Good for datasets with structured missing patterns
|
|
216
|
+
|
|
217
|
+
### DCAE (Dilated Convolutional AutoEncoder)
|
|
218
|
+
- Uses dilated convolutions to capture long-range dependencies
|
|
219
|
+
- Suitable for sequential or structured protein data
|
|
220
|
+
- Better for complex missing data patterns
|
|
221
|
+
|
|
222
|
+
## Monitoring Training
|
|
223
|
+
|
|
224
|
+
### TensorBoard
|
|
225
|
+
```bash
|
|
226
|
+
tensorboard --logdir results/logs
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### Training Metrics
|
|
230
|
+
Monitor the following metrics:
|
|
231
|
+
- `train_loss`: Overall training loss
|
|
232
|
+
- `reconstruction_loss`: Data reconstruction quality
|
|
233
|
+
- `mask_loss`: Loss of the missing-data pattern (mask) prediction
|
|
234
|
+
|
|
235
|
+
## Troubleshooting
|
|
236
|
+
|
|
237
|
+
### Common Issues
|
|
238
|
+
|
|
239
|
+
1. **CUDA Out of Memory**
|
|
240
|
+
- Reduce `--batch-size`
|
|
241
|
+
- Use `--device cpu` for CPU training
|
|
242
|
+
|
|
243
|
+
2. **Shape Mismatch Errors**
|
|
244
|
+
- Check CSV format (the first column must contain sample IDs, because it is skipped)
|
|
245
|
+
- Verify data contains only numeric values
|
|
246
|
+
|
|
247
|
+
3. **Slow Training**
|
|
248
|
+
- Use GPU acceleration with `--device cuda`
|
|
249
|
+
- Increase `--batch-size` if memory allows
|
|
250
|
+
|
|
251
|
+
4. **Poor Performance**
|
|
252
|
+
- Adjust `--mask-weight` (try 0.1-0.8)
|
|
253
|
+
- Experiment with different `--hidden-dims`
|
|
254
|
+
- Enable `--early-stopping`
|
|
255
|
+
|
|
256
|
+
### Getting Help
|
|
257
|
+
|
|
258
|
+
For help with parameters:
|
|
259
|
+
```bash
|
|
260
|
+
python -m jingwei.train --help
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## File Structure
|
|
264
|
+
|
|
265
|
+
```
|
|
266
|
+
JINGWEI/
|
|
267
|
+
├── README.md
|
|
268
|
+
├── pyproject.toml
|
|
269
|
+
├── requirements.txt
|
|
270
|
+
├── src/
|
|
271
|
+
│ ├── JINGWEI.sh # Legacy training script (optional)
|
|
272
|
+
│ ├── train.py # Thin wrapper for package entry
|
|
273
|
+
│ ├── jingwei/
|
|
274
|
+
│ │ ├── __init__.py
|
|
275
|
+
│ │ ├── train.py # Package training entry
|
|
276
|
+
│ │ ├── datasets.py # Data loading utilities
|
|
277
|
+
│ │ ├── models.py # Model architectures (DMF/DCAE)
|
|
278
|
+
│ │ ├── dmf.py # DMF trainer
|
|
279
|
+
│ │ └── dcae.py # DCAE trainer
|
|
280
|
+
│ └── methods/ # Other baselines (not packaged)
|
|
281
|
+
└── data/
|
|
282
|
+
└── your_dataset.csv
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
## License
|
|
287
|
+
|
|
288
|
+
This project is licensed under the MIT License.
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
## Changelog
|
|
292
|
+
|
|
293
|
+
### Version 0.0.1
|
|
294
|
+
- Initial release
|
|
295
|
+
- Support for DMF and DCAE methods
|
|
296
|
+
- GPU acceleration
|
|
297
|
+
- Comprehensive parameter configuration
|
|
298
|
+
- TensorBoard integration
|
jingwei-0.0.1/pyproject.toml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "jingwei"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "Proteomic data imputation framework with DMF and DCAE methods."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "JINGWEI Contributors" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["imputation", "proteomics", "DMF", "DCAE", "deep-learning"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
dependencies = [
|
|
27
|
+
"torch>=2.0.0",
|
|
28
|
+
"pytorch-lightning>=2.0.0",
|
|
29
|
+
"numpy>=1.24.0",
|
|
30
|
+
"pandas>=2.0.0",
|
|
31
|
+
"tensorboard>=2.13.0",
|
|
32
|
+
"scipy>=1.10.0",
|
|
33
|
+
"scikit-learn>=1.3.0"
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=7.4.0",
|
|
39
|
+
"pytest-cov>=4.1.0",
|
|
40
|
+
"black>=23.0.0",
|
|
41
|
+
"flake8>=6.0.0"
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[tool.setuptools]
|
|
45
|
+
package-dir = {"" = "src"}
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.packages.find]
|
|
48
|
+
where = ["src"]
|
|
49
|
+
include = ["jingwei*"]
|
|
50
|
+
|
jingwei-0.0.1/setup.cfg
ADDED
jingwei-0.0.1/src/jingwei/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""JINGWEI package for proteomic data imputation."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.0.1"
|
|
4
|
+
|
|
5
|
+
from .datasets import CSVDataset
|
|
6
|
+
from .models import DCAE, DMF
|
|
7
|
+
from .dcae import DCAEImputer
|
|
8
|
+
from .dmf import DMFImputer
|
|
9
|
+
from .train import ImputationTrainer, parse_arguments, main as train_main
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"CSVDataset",
|
|
13
|
+
"DCAE",
|
|
14
|
+
"DMF",
|
|
15
|
+
"DCAEImputer",
|
|
16
|
+
"DMFImputer",
|
|
17
|
+
"ImputationTrainer",
|
|
18
|
+
"parse_arguments",
|
|
19
|
+
"train_main",
|
|
20
|
+
"__version__",
|
|
21
|
+
]
|