octopi-1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- octopi-1.0/LICENSE +41 -0
- octopi-1.0/PKG-INFO +209 -0
- octopi-1.0/README.md +173 -0
- octopi-1.0/octopi/__init__.py +0 -0
- octopi-1.0/octopi/datasets/__init__.py +0 -0
- octopi-1.0/octopi/datasets/augment.py +84 -0
- octopi-1.0/octopi/datasets/cached_datset.py +113 -0
- octopi-1.0/octopi/datasets/dataset.py +19 -0
- octopi-1.0/octopi/datasets/generators.py +429 -0
- octopi-1.0/octopi/datasets/mixup.py +49 -0
- octopi-1.0/octopi/datasets/multi_config_generator.py +253 -0
- octopi-1.0/octopi/entry_points/__init__.py +0 -0
- octopi-1.0/octopi/entry_points/common.py +80 -0
- octopi-1.0/octopi/entry_points/create_slurm_submission.py +243 -0
- octopi-1.0/octopi/entry_points/run_create_targets.py +281 -0
- octopi-1.0/octopi/entry_points/run_evaluate.py +65 -0
- octopi-1.0/octopi/entry_points/run_extract_mb_picks.py +141 -0
- octopi-1.0/octopi/entry_points/run_extract_midpoint.py +143 -0
- octopi-1.0/octopi/entry_points/run_localize.py +222 -0
- octopi-1.0/octopi/entry_points/run_optuna.py +139 -0
- octopi-1.0/octopi/entry_points/run_segment_predict.py +166 -0
- octopi-1.0/octopi/entry_points/run_train.py +201 -0
- octopi-1.0/octopi/extract/__init__.py +0 -0
- octopi-1.0/octopi/extract/localize.py +254 -0
- octopi-1.0/octopi/extract/membranebound_extract.py +262 -0
- octopi-1.0/octopi/extract/midpoint_extract.py +193 -0
- octopi-1.0/octopi/io.py +457 -0
- octopi-1.0/octopi/losses.py +86 -0
- octopi-1.0/octopi/main.py +101 -0
- octopi-1.0/octopi/models/AttentionUnet.py +56 -0
- octopi-1.0/octopi/models/MedNeXt.py +111 -0
- octopi-1.0/octopi/models/ModelTemplate.py +36 -0
- octopi-1.0/octopi/models/SegResNet.py +92 -0
- octopi-1.0/octopi/models/Unet.py +59 -0
- octopi-1.0/octopi/models/UnetPlusPlus.py +47 -0
- octopi-1.0/octopi/models/__init__.py +0 -0
- octopi-1.0/octopi/models/common.py +62 -0
- octopi-1.0/octopi/processing/__init__.py +0 -0
- octopi-1.0/octopi/processing/create_targets_from_picks.py +106 -0
- octopi-1.0/octopi/processing/downsample.py +129 -0
- octopi-1.0/octopi/processing/evaluate.py +289 -0
- octopi-1.0/octopi/processing/importers.py +213 -0
- octopi-1.0/octopi/processing/my_metrics.py +26 -0
- octopi-1.0/octopi/processing/segmentation_from_picks.py +167 -0
- octopi-1.0/octopi/processing/writers.py +102 -0
- octopi-1.0/octopi/pytorch/__init__.py +0 -0
- octopi-1.0/octopi/pytorch/hyper_search.py +243 -0
- octopi-1.0/octopi/pytorch/model_search_submitter.py +290 -0
- octopi-1.0/octopi/pytorch/segmentation.py +317 -0
- octopi-1.0/octopi/pytorch/trainer.py +438 -0
- octopi-1.0/octopi/pytorch_lightning/__init__.py +0 -0
- octopi-1.0/octopi/pytorch_lightning/optuna_pl_ddp.py +273 -0
- octopi-1.0/octopi/pytorch_lightning/train_pl.py +244 -0
- octopi-1.0/octopi/stopping_criteria.py +143 -0
- octopi-1.0/octopi/submit_slurm.py +95 -0
- octopi-1.0/octopi/utils.py +238 -0
- octopi-1.0/octopi/visualization_tools.py +201 -0
- octopi-1.0/pyproject.toml +43 -0
octopi-1.0/LICENSE
ADDED
@@ -0,0 +1,41 @@
+# Legal
+
+## License for the octopi package
+
+This package is licensed under the MIT License:
+
+```
+MIT License
+
+Copyright (c) 2025 Chan Zuckerberg Initiative
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+```
+
+## License Notice for Dependencies
+
+```
+This repository is licensed under the MIT License; it also relies on third-party dependencies that are distributed under their own licenses. Specifically:
+
+- monai is licensed under the Apache License 2.0.
+- pytorch-lightning is licensed under the Apache License 2.0.
+
+All dependencies use permissive open-source licenses that are compatible with this project's MIT License. No GPL or other copyleft licensed dependencies are included.
+For specific licensing information about any dependency, please refer to the respective package documentation or repository.
+```
octopi-1.0/PKG-INFO
ADDED
@@ -0,0 +1,209 @@
+Metadata-Version: 2.3
+Name: octopi
+Version: 1.0
+Summary: Model architecture exploration for cryoET particle picking
+License: MIT
+Author: Jonathan Schwartz
+Requires-Python: >=3.9,<4.0
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: copick
+Requires-Dist: ipywidgets
+Requires-Dist: kaleido
+Requires-Dist: matplotlib
+Requires-Dist: mlflow (==2.17.0)
+Requires-Dist: monai-weekly (==1.5.dev2448)
+Requires-Dist: mrcfile
+Requires-Dist: multiprocess
+Requires-Dist: nibabel
+Requires-Dist: optuna (==4.0.0)
+Requires-Dist: optuna-integration[botorch,pytorch-lightning]
+Requires-Dist: pandas
+Requires-Dist: plotly
+Requires-Dist: python-dotenv
+Requires-Dist: pytorch-lightning (==2.4.0)
+Requires-Dist: requests (>=2.25.1,<3.0.0)
+Requires-Dist: seaborn
+Requires-Dist: torch-ema
+Requires-Dist: tqdm
+Description-Content-Type: text/markdown
+
+# OCTOPI 🐙🐙🐙
+
+**O**bject dete**CT**ion **O**f **P**rote**I**ns. A deep learning framework for cryo-ET 3D particle picking with autonomous model exploration capabilities.
+
+## 🚀 Introduction
+
+octopi addresses a critical bottleneck in cryo-electron tomography (cryo-ET) research: the efficient identification and extraction of proteins within complex cellular environments. As advances in cryo-ET enable the collection of thousands of tomograms, the need for automated, accurate particle picking has become increasingly urgent.
+
+Our deep learning-based pipeline streamlines the training and execution of 3D autoencoder models specifically designed for cryo-ET particle picking. Built on [copick](https://github.com/copick/copick), a storage-agnostic API, octopi seamlessly accesses tomograms and segmentations across local and remote environments.
+
+## 🧩 Features
+
+octopi offers a modular, deep learning-driven pipeline for:
+* Training and evaluating custom 3D U-Net models for particle segmentation.
+* Automatically exploring model architectures using Bayesian optimization via Optuna.
+* Performing inference for both semantic segmentation and particle localization.
+
+octopi empowers researchers to navigate the dense, intricate landscapes of cryo-ET datasets with precision and efficiency, without manual trial and error.
+
+## Getting Started
+### Installation
+
+*Octopi* is available on PyPI.
+```
+pip install octopi
+```
+
+## 📚 Usage
+
+octopi provides a clean, scriptable command-line interface. Run the following command to view all available subcommands:
+```
+octopi --help
+```
+Each subcommand supports its own `--help` flag for detailed usage. For practical examples of how to interface directly with the octopi API, explore the notebooks/ folder.
+
+If you're running octopi on an HPC cluster, several SLURM-compatible submission commands are available. You can view them by running:
+```
+octopi-slurm --help
+```
+This provides utilities for submitting training, inference, and localization jobs in SLURM-based environments.
+
+### 📥 Data Import & Preprocessing
+
+To train or run inference with octopi, your tomograms must be organized inside a copick project. octopi supports two primary methods for data ingestion, both of which include optional Fourier cropping to reduce resolution and accelerate downstream processing.
+
+If your tomograms are already processed and stored locally in .mrc format (e.g., from Warp, IMOD, or AreTomo), you can import them into a new or existing copick project using:
+
+```
+octopi import-mrc-volumes \
+    --input-folder /path/to/mrc/files --config /path/to/config.json \
+    --target-tomo-type denoised --input-voxel-size --output-voxel-size 10
+```
+
+octopi can also process tomograms hosted on the CZ cryoET Data Portal. Users can download tomograms to their own machine, which is especially useful when downsampling them to a lower resolution to save compute time and memory. You can download and process the tomograms using:
+```
+octopi download-dataportal \
+    --config /path/to/config.json --datasetID 10445 --overlay-path path/to/saved/zarrs \
+    --input-voxel-size 5 --output-voxel-size 10 \
+    --dataportal-name wbp --target-tomotype wbp
+```
+
+### 📁 Training Labels Preparation
+
+Use `octopi create-targets` to create semantic masks for proteins of interest using annotation metadata. In this example, let's generate pick segmentations for dataset 10439 from the CZ cryoET Data Portal (this step only needs to be run once).
+```
+octopi create-targets \
+    --config config.json \
+    --target apoferritin --target beta-galactosidase,slabpick,1 \
+    --target ribosome,pytom,0 --target virus-like-particle,pytom,0 \
+    --seg-target membrane \
+    --tomo-alg wbp --voxel-size 10 \
+    --target-session-id 1 --target-segmentation-name remotetargets \
+    --target-user-id train-octopi
+```
+
+### 🧠 Training a single 3D U-Net model
+Train a 3D U-Net model using the prepared target segmentations. Tomograms can be drawn from multiple copick projects.
+```
+octopi train-model \
+    --config experiment,config1.json \
+    --config simulation,config2.json \
+    --voxel-size 10 --tomo-alg wbp --Nclass 8 \
+    --tomo-batch-size 50 --num-epochs 100 --val-interval 10 \
+    --target-info remotetargets,train-octopi,1
+```
+Outputs will include model weights (.pth), logs, and training metrics.
+
+### 🔍 Model exploration with Optuna
+
+octopi 🐙 supports automatic neural architecture search using Optuna, enabling efficient discovery of optimal 3D U-Net configurations through Bayesian optimization. This allows users to maximize segmentation accuracy without manual tuning.
+
+To launch a model exploration job:
+```
+octopi model-explore \
+    --config experiment,/mnt/dataportal/ml_challenge/config.json \
+    --config simulation,/mnt/dataportal/synthetic_ml_challenge/config.json \
+    --voxel-size 10 --tomo-alg wbp --Nclass 8 \
+    --model-save-path train_results
+```
+Each trial evaluates a different architecture and logs:
+• Segmentation performance metrics
+• Model weights and configs
+• Training curves and validation loss
+
+🔬 Trials are automatically tracked with MLflow and saved under the specified `--model-save-path`.
+
+#### Optuna Dashboard
+
+To quickly assess the exploration results and see which trials produced the best architectures, Optuna provides a dashboard that summarizes all trial information. Instructions for accessing the dashboard are available at https://optuna-dashboard.readthedocs.io/en/latest/getting-started.html; using either the VS Code extension or the CLI is recommended.
+
+#### 📊 MLflow experiment tracking
+
+To use the CZI cloud MLflow tracker, add a `.env` file in the root directory like below. You can get a CZI MLflow access token from [here](https://mlflow.cw.use4-prod.si.czi.technology/api/2.0/mlflow/users/access-token) (note that a new token is generated every time you open this site).
+```
+MLFLOW_TRACKING_USERNAME = <Your_CZ_email>
+MLFLOW_TRACKING_PASSWORD = <Your_mlflow_access_token>
+```
+
+octopi supports MLflow for logging and visualizing model training and hyperparameter search results, including:
+• Training loss/validation metrics over time
+• Model hyperparameters and architecture details
+• Trial comparison (e.g., best performing model)
+
+You can use either a local MLflow instance, a remote (HPC) instance, or the CZI cloud server:
+
+#### 🧪 Local MLflow Dashboard
+
+To inspect results locally, run `mlflow ui` and open http://localhost:5000 in your browser.
+
+#### 🖥️ HPC Cluster MLflow Access (Remote via SSH tunnel)
+
+If running octopi on a remote cluster (e.g., Biohub Bruno), forward the MLflow port.
+On your local machine:
+`ssh -L 5000:localhost:5000 remote_username@remote_host` (in the case of Bruno, the remote host would be `login01.czbiohub.org`).
+
+Then, on the remote terminal (login node), run `mlflow ui --host 0.0.0.0 --port 5000` to launch the MLflow dashboard and view it in your local browser.
+
+#### ☁️ CZI CoreWeave cluster
+
+On the CZI CoreWeave cluster, MLflow is already hosted. Go to the CZI [mlflow server](https://mlflow.cw.use4-prod.si.czi.technology/).
+
+🔐 A .env file is required to authenticate (see Getting Started section).
+📁 Be sure to register your project name in MLflow before launching runs.
+
+### 🔮 Segmentation
+Generate segmentation prediction masks for tomograms in a given copick project.
+```
+octopi inference \
+    --config config.json \
+    --seg-info predict,unet,1 \
+    --model-config train_results/best_model_config.yaml \
+    --model-weights train_results/best_model.pth \
+    --voxel-size 10 --tomo-alg wbp --tomo-batch-size 25
+```
+Output masks will be saved to the corresponding copick project under the specified `--seg-info` entry.
+
+### 📍 Localization
+Convert the segmentation masks into particle coordinates.
+```
+octopi localize \
+    --config config.json \
+    --pick-session-id 1 --pick-user-id unet \
+    --seg-info predict,unet,1
+```
+
+## Contributing
+
+This project adheres to the Contributor Covenant code of conduct. By participating, you are expected to uphold this code. Please report unacceptable behavior to opensource@chanzuckerberg.com.
+
+## Reporting Security Issues
+
+Please note: If you believe you have found a security issue, please responsibly disclose it by contacting us at security@chanzuckerberg.com.
+
+
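The MLflow instructions above keep credentials in a `.env` file, and `python-dotenv` is among the declared dependencies. Below is a minimal sketch of loading those credentials before logging runs; it is illustrative only, not code that ships in the package, and the tracking URI is the CZI server linked in the description.

```
# Sketch: pick up MLflow credentials from a .env file (variable names as in the
# README example). mlflow and python-dotenv are declared dependencies of octopi.
import os

import mlflow
from dotenv import load_dotenv

load_dotenv()  # reads MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD into os.environ

# Assumption: the CZI cloud tracking server mentioned in the README.
mlflow.set_tracking_uri("https://mlflow.cw.use4-prod.si.czi.technology/")
print("Tracking as:", os.environ.get("MLFLOW_TRACKING_USERNAME"))
```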
octopi-1.0/README.md
ADDED
@@ -0,0 +1,173 @@
+# OCTOPI 🐙🐙🐙
+
+**O**bject dete**CT**ion **O**f **P**rote**I**ns. A deep learning framework for cryo-ET 3D particle picking with autonomous model exploration capabilities.
+
+## 🚀 Introduction
+
+octopi addresses a critical bottleneck in cryo-electron tomography (cryo-ET) research: the efficient identification and extraction of proteins within complex cellular environments. As advances in cryo-ET enable the collection of thousands of tomograms, the need for automated, accurate particle picking has become increasingly urgent.
+
+Our deep learning-based pipeline streamlines the training and execution of 3D autoencoder models specifically designed for cryo-ET particle picking. Built on [copick](https://github.com/copick/copick), a storage-agnostic API, octopi seamlessly accesses tomograms and segmentations across local and remote environments.
+
+## 🧩 Features
+
+octopi offers a modular, deep learning-driven pipeline for:
+* Training and evaluating custom 3D U-Net models for particle segmentation.
+* Automatically exploring model architectures using Bayesian optimization via Optuna.
+* Performing inference for both semantic segmentation and particle localization.
+
+octopi empowers researchers to navigate the dense, intricate landscapes of cryo-ET datasets with precision and efficiency, without manual trial and error.
+
+## Getting Started
+### Installation
+
+*Octopi* is available on PyPI.
+```
+pip install octopi
+```
+
+## 📚 Usage
+
+octopi provides a clean, scriptable command-line interface. Run the following command to view all available subcommands:
+```
+octopi --help
+```
+Each subcommand supports its own `--help` flag for detailed usage. For practical examples of how to interface directly with the octopi API, explore the notebooks/ folder.
+
+If you're running octopi on an HPC cluster, several SLURM-compatible submission commands are available. You can view them by running:
+```
+octopi-slurm --help
+```
+This provides utilities for submitting training, inference, and localization jobs in SLURM-based environments.
+
+### 📥 Data Import & Preprocessing
+
+To train or run inference with octopi, your tomograms must be organized inside a copick project. octopi supports two primary methods for data ingestion, both of which include optional Fourier cropping to reduce resolution and accelerate downstream processing.
+
+If your tomograms are already processed and stored locally in .mrc format (e.g., from Warp, IMOD, or AreTomo), you can import them into a new or existing copick project using:
+
+```
+octopi import-mrc-volumes \
+    --input-folder /path/to/mrc/files --config /path/to/config.json \
+    --target-tomo-type denoised --input-voxel-size --output-voxel-size 10
+```
+
+octopi can also process tomograms hosted on the CZ cryoET Data Portal. Users can download tomograms to their own machine, which is especially useful when downsampling them to a lower resolution to save compute time and memory. You can download and process the tomograms using:
+```
+octopi download-dataportal \
+    --config /path/to/config.json --datasetID 10445 --overlay-path path/to/saved/zarrs \
+    --input-voxel-size 5 --output-voxel-size 10 \
+    --dataportal-name wbp --target-tomotype wbp
+```
+
+### 📁 Training Labels Preparation
+
+Use `octopi create-targets` to create semantic masks for proteins of interest using annotation metadata. In this example, let's generate pick segmentations for dataset 10439 from the CZ cryoET Data Portal (this step only needs to be run once).
+```
+octopi create-targets \
+    --config config.json \
+    --target apoferritin --target beta-galactosidase,slabpick,1 \
+    --target ribosome,pytom,0 --target virus-like-particle,pytom,0 \
+    --seg-target membrane \
+    --tomo-alg wbp --voxel-size 10 \
+    --target-session-id 1 --target-segmentation-name remotetargets \
+    --target-user-id train-octopi
+```
+
+### 🧠 Training a single 3D U-Net model
+Train a 3D U-Net model using the prepared target segmentations. Tomograms can be drawn from multiple copick projects.
+```
+octopi train-model \
+    --config experiment,config1.json \
+    --config simulation,config2.json \
+    --voxel-size 10 --tomo-alg wbp --Nclass 8 \
+    --tomo-batch-size 50 --num-epochs 100 --val-interval 10 \
+    --target-info remotetargets,train-octopi,1
+```
+Outputs will include model weights (.pth), logs, and training metrics.
+
+### 🔍 Model exploration with Optuna
+
+octopi 🐙 supports automatic neural architecture search using Optuna, enabling efficient discovery of optimal 3D U-Net configurations through Bayesian optimization. This allows users to maximize segmentation accuracy without manual tuning.
+
+To launch a model exploration job:
+```
+octopi model-explore \
+    --config experiment,/mnt/dataportal/ml_challenge/config.json \
+    --config simulation,/mnt/dataportal/synthetic_ml_challenge/config.json \
+    --voxel-size 10 --tomo-alg wbp --Nclass 8 \
+    --model-save-path train_results
+```
+Each trial evaluates a different architecture and logs:
+• Segmentation performance metrics
+• Model weights and configs
+• Training curves and validation loss
+
+🔬 Trials are automatically tracked with MLflow and saved under the specified `--model-save-path`.
+
+#### Optuna Dashboard
+
+To quickly assess the exploration results and see which trials produced the best architectures, Optuna provides a dashboard that summarizes all trial information. Instructions for accessing the dashboard are available at https://optuna-dashboard.readthedocs.io/en/latest/getting-started.html; using either the VS Code extension or the CLI is recommended.
+
+#### 📊 MLflow experiment tracking
+
+To use the CZI cloud MLflow tracker, add a `.env` file in the root directory like below. You can get a CZI MLflow access token from [here](https://mlflow.cw.use4-prod.si.czi.technology/api/2.0/mlflow/users/access-token) (note that a new token is generated every time you open this site).
+```
+MLFLOW_TRACKING_USERNAME = <Your_CZ_email>
+MLFLOW_TRACKING_PASSWORD = <Your_mlflow_access_token>
+```
+
+octopi supports MLflow for logging and visualizing model training and hyperparameter search results, including:
+• Training loss/validation metrics over time
+• Model hyperparameters and architecture details
+• Trial comparison (e.g., best performing model)
+
+You can use either a local MLflow instance, a remote (HPC) instance, or the CZI cloud server:
+
+#### 🧪 Local MLflow Dashboard
+
+To inspect results locally, run `mlflow ui` and open http://localhost:5000 in your browser.
+
+#### 🖥️ HPC Cluster MLflow Access (Remote via SSH tunnel)
+
+If running octopi on a remote cluster (e.g., Biohub Bruno), forward the MLflow port.
+On your local machine:
+`ssh -L 5000:localhost:5000 remote_username@remote_host` (in the case of Bruno, the remote host would be `login01.czbiohub.org`).
+
+Then, on the remote terminal (login node), run `mlflow ui --host 0.0.0.0 --port 5000` to launch the MLflow dashboard and view it in your local browser.
+
+#### ☁️ CZI CoreWeave cluster
+
+On the CZI CoreWeave cluster, MLflow is already hosted. Go to the CZI [mlflow server](https://mlflow.cw.use4-prod.si.czi.technology/).
+
+🔐 A .env file is required to authenticate (see Getting Started section).
+📁 Be sure to register your project name in MLflow before launching runs.
+
+### 🔮 Segmentation
+Generate segmentation prediction masks for tomograms in a given copick project.
+```
+octopi inference \
+    --config config.json \
+    --seg-info predict,unet,1 \
+    --model-config train_results/best_model_config.yaml \
+    --model-weights train_results/best_model.pth \
+    --voxel-size 10 --tomo-alg wbp --tomo-batch-size 25
+```
+Output masks will be saved to the corresponding copick project under the specified `--seg-info` entry.
+
+### 📍 Localization
+Convert the segmentation masks into particle coordinates.
+```
+octopi localize \
+    --config config.json \
+    --pick-session-id 1 --pick-user-id unet \
+    --seg-info predict,unet,1
+```
+
+## Contributing
+
+This project adheres to the Contributor Covenant code of conduct. By participating, you are expected to uphold this code. Please report unacceptable behavior to opensource@chanzuckerberg.com.
+
+## Reporting Security Issues
+
+Please note: If you believe you have found a security issue, please responsibly disclose it by contacting us at security@chanzuckerberg.com.
+
+
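Both import paths above advertise optional Fourier cropping for downsampling. As an illustration of the underlying operation (a sketch, not octopi's implementation): downsampling a volume by a factor of 2, e.g. from a 5 Å to a 10 Å voxel size, amounts to keeping the central low-frequency block of its 3D Fourier transform.

```
# Illustrative sketch of Fourier cropping a 3D volume by an integer factor.
import numpy as np

def fourier_crop(vol: np.ndarray, factor: int = 2) -> np.ndarray:
    """Downsample by keeping the central low-frequency block of the FFT."""
    ft = np.fft.fftshift(np.fft.fftn(vol))
    nz, ny, nx = vol.shape
    cz, cy, cx = nz // 2, ny // 2, nx // 2
    hz, hy, hx = nz // (2 * factor), ny // (2 * factor), nx // (2 * factor)
    cropped = ft[cz - hz:cz + hz, cy - hy:cy + hy, cx - hx:cx + hx]
    out = np.fft.ifftn(np.fft.ifftshift(cropped)).real
    return out / factor**3  # renormalize for the reduced voxel count

vol = np.random.rand(64, 64, 64).astype(np.float32)
print(fourier_crop(vol, factor=2).shape)  # (32, 32, 32)
```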
octopi-1.0/octopi/__init__.py
File without changes

octopi-1.0/octopi/datasets/__init__.py
File without changes

octopi-1.0/octopi/datasets/augment.py
ADDED
@@ -0,0 +1,84 @@
+from monai.transforms import (
+    Compose,
+    RandFlipd,
+    Orientationd,
+    RandRotate90d,
+    NormalizeIntensityd,
+    EnsureChannelFirstd,
+    RandCropByLabelClassesd,
+    RandScaleIntensityd,
+    RandShiftIntensityd,
+    RandAdjustContrastd,
+    RandGaussianNoised,
+    ScaleIntensityRanged,
+    RandomOrder,
+)
+
+def get_transforms():
+    """
+    Returns the non-random (deterministic) transforms.
+    """
+    return Compose([
+        EnsureChannelFirstd(keys=["image", "label"], channel_dim="no_channel"),
+        NormalizeIntensityd(keys="image"),
+        Orientationd(keys=["image", "label"], axcodes="RAS")
+    ])
+
+def get_random_transforms(input_dim, num_samples, Nclasses):
+    """
+    Input:
+        input_dim: tuple of (nx, ny, nz)
+        num_samples: int
+        Nclasses: int
+
+    Returns random transforms.
+
+    For data with a missing wedge along the first axis (causing smearing in that direction),
+    we avoid rotations that would move this artifact to other axes. We only rotate around
+    the first axis (spatial_axes=[1, 2]) and avoid flipping along the first axis.
+    """
+    return Compose([
+        RandCropByLabelClassesd(
+            keys=["image", "label"],
+            label_key="label",
+            spatial_size=[input_dim[0], input_dim[1], input_dim[2]],
+            num_classes=Nclasses,
+            num_samples=num_samples
+        ),
+        # Only rotate around the first axis (keeping the missing wedge orientation consistent)
+        RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[1, 2], max_k=3),
+        # Avoid flipping along the first axis (where the missing wedge is)
+        # RandFlipd(keys=["image", "label"], prob=0.5, spatial_axis=0),  # removed per the docstring above
+        RandFlipd(keys=["image", "label"], prob=0.5, spatial_axis=1),
+        RandFlipd(keys=["image", "label"], prob=0.5, spatial_axis=2),
+        RandomOrder([
+            # Intensity augmentations are still appropriate
+            RandScaleIntensityd(keys="image", prob=0.5, factors=(0.85, 1.15)),
+            RandShiftIntensityd(keys="image", prob=0.5, offsets=(-0.15, 0.15)),
+            RandAdjustContrastd(keys="image", prob=0.5, gamma=(0.85, 1.15)),
+            RandGaussianNoised(keys="image", prob=0.5, mean=0.0, std=0.5),  # Reduced noise std
+        ]),
+    ])
+
+# Augmentations to explore in the future:
+# Intensity-based augmentations
+# RandHistogramShiftd(keys="image", prob=0.5, num_control_points=(3, 5))
+# RandGaussianSmoothd(keys="image", prob=0.5, sigma_x=(0.5, 1.5), sigma_y=(0.5, 1.5), sigma_z=(0.5, 1.5)),
+
+# Geometric transforms
+# RandAffined(
+#     keys=["image", "label"],
+#     rotate_range=(0.1, 0.1, 0.1),  # Rotation angles (radians) for x, y, z axes
+#     scale_range=(0.1, 0.1, 0.1),   # Scale range for isotropic/anisotropic scaling
+#     prob=0.5,                      # Probability of applying the transform
+#     padding_mode="border"          # Handle out-of-bounds values
+# )
+
+def get_predict_transforms():
+    """
+    Returns the prediction-time transforms.
+    """
+    return Compose([
+        EnsureChannelFirstd(keys=["image"], channel_dim="no_channel"),
+        NormalizeIntensityd(keys="image")
+    ])
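A quick way to exercise these pipelines on synthetic arrays; the `image`/`label` keys match the code above, while the shapes, crop size, and class count here are arbitrary assumptions:

```
# Sketch: run the augmentation pipelines above on synthetic data.
import numpy as np
from octopi.datasets.augment import get_transforms, get_random_transforms

sample = {
    "image": np.random.randn(64, 64, 64).astype(np.float32),
    "label": np.random.randint(0, 3, (64, 64, 64)).astype(np.int64),
}

deterministic = get_transforms()
sample = deterministic(sample)  # adds channel dim, normalizes, reorients

random_aug = get_random_transforms(input_dim=(32, 32, 32), num_samples=4, Nclasses=3)
patches = random_aug(sample)    # RandCropByLabelClassesd yields a list of patch dicts
print(len(patches), patches[0]["image"].shape)  # expected: 4 (1, 32, 32, 32)
```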
octopi-1.0/octopi/datasets/cached_datset.py
ADDED

@@ -0,0 +1,113 @@
+from typing import List, Tuple, Callable, Optional, Dict, Any
+from monai.transforms import Compose
+from monai.data import CacheDataset
+from octopi import io
+from tqdm import tqdm
+import numpy as np
+import os, sys
+
+class MultiConfigCacheDataset(CacheDataset):
+    """
+    A custom CacheDataset that loads data lazily from multiple sources
+    with a consolidated loading and caching process.
+    """
+
+    def __init__(
+        self,
+        manager,
+        run_ids: List[Tuple[str, str]],
+        transform: Optional[Callable] = None,
+        cache_rate: float = 1.0,
+        num_workers: int = 0,
+        progress: bool = True,
+        copy_cache: bool = True,
+        cache_num: int = sys.maxsize
+    ):
+        # Save reference to manager and run_ids
+        self.manager = manager
+        self.run_ids = run_ids
+        self.progress = progress
+
+        # Prepare an empty data list first - don't load immediately
+        self.data = []
+
+        # Initialize the parent CacheDataset with an empty list.
+        # We override the _fill_cache method to handle loading and caching in one step.
+        super().__init__(
+            data=[],  # Empty list - we'll load data in _fill_cache
+            transform=transform,
+            cache_rate=cache_rate,
+            num_workers=num_workers,
+            progress=False,  # We'll handle our own progress
+            copy_cache=copy_cache,
+            cache_num=cache_num
+        )
+
+    def _fill_cache(self):
+        """
+        Override the parent's _fill_cache method to combine loading and caching.
+        """
+        if self.progress:
+            print("Loading and caching dataset...")
+
+        # Load and process data in a single operation
+        self.data = []
+        iterator = tqdm(self.run_ids, desc="Loading dataset") if self.progress else self.run_ids
+
+        for session_name, run_name in iterator:
+            root = self.manager.roots[session_name]
+            batch_data = io.load_training_data(
+                root,
+                [run_name],
+                self.manager.voxel_size,
+                self.manager.tomo_algorithm,
+                self.manager.target_name,
+                self.manager.target_session_id,
+                self.manager.target_user_id,
+                progress_update=False
+            )
+
+            self.data.extend(batch_data)
+
+            # Process and cache this batch right away
+            for i, item in enumerate(batch_data):
+                if len(self._cache) < self.cache_num and self.cache_rate > 0.0:
+                    if np.random.random() < self.cache_rate:
+                        self._cache.append(self._transform(item))
+
+        # Check max label value if needed
+        if hasattr(self.manager, '_check_max_label_value'):
+            self.manager._check_max_label_value(self.data)
+
+        # Update the _data attribute to match the loaded data
+        self._data = self.data
+
+    def __len__(self):
+        """
+        Return the length of the dataset.
+        """
+        if not self.data:
+            self._fill_cache()  # Load data if not loaded yet
+        return len(self.data)
+
+    def __getitem__(self, index):
+        """
+        Return the item at the given index.
+        """
+        if not self.data:
+            self._fill_cache()  # Load data if not loaded yet
+
+        # Use parent's logic for cached items
+        if index < len(self._cache):
+            return self._cache[index]
+
+        # Otherwise transform on-the-fly
+        return self._transform(self.data[index])
+
+# TODO: Implement Single Config Cache Dataset
+# class SingleConfigCacheDataset(CacheDataset):
+#     def __init__(self,
+#                  root: Any,
+#                  run_ids: List[str],
+#                  voxel_size: float,
+#                  tomo_algorithm: str,
+#                  target_name: str,
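Judging from the attribute accesses in `_fill_cache`, the `manager` argument is expected to expose `roots`, `voxel_size`, `tomo_algorithm`, `target_name`, `target_session_id`, and `target_user_id`. A hypothetical wiring sketch, assuming the class's lazy-loading contract behaves as intended with the pinned MONAI version; the `SimpleNamespace` stand-in and run names are assumptions, not octopi API:

```
# Hypothetical usage sketch; SimpleNamespace stands in for whatever dataset
# manager object octopi actually passes (only the attributes read above are set).
from types import SimpleNamespace

import copick
from octopi.datasets.augment import get_transforms
from octopi.datasets.cached_datset import MultiConfigCacheDataset

root = copick.from_file("config.json")  # copick project config, as in the README examples

manager = SimpleNamespace(
    roots={"experiment": root},  # copick roots keyed by session name
    voxel_size=10,
    tomo_algorithm="wbp",
    target_name="remotetargets",
    target_session_id="1",
    target_user_id="train-octopi",
)

dataset = MultiConfigCacheDataset(
    manager,
    run_ids=[("experiment", "TS_001"), ("experiment", "TS_002")],  # hypothetical run names
    transform=get_transforms(),
    cache_rate=1.0,
)
print(len(dataset))  # first use triggers _fill_cache
```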
octopi-1.0/octopi/datasets/dataset.py
ADDED

@@ -0,0 +1,19 @@
+from torch.utils.data import Dataset
+
+class DynamicDataset(Dataset):
+    def __init__(self, data, transform=None):
+        self.data = data
+        self.transform = transform
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        sample = self.data[idx]
+        if self.transform:
+            sample = self.transform(sample)
+        return sample
+
+    def update_data(self, new_data):
+        """Update the internal dataset with new data."""
+        self.data = new_data