cellitac 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cellitac-1.0.0/LICENSE +17 -0
- cellitac-1.0.0/PKG-INFO +218 -0
- cellitac-1.0.0/README.md +180 -0
- cellitac-1.0.0/pyproject.toml +73 -0
- cellitac-1.0.0/setup.cfg +4 -0
- cellitac-1.0.0/src/cellitac/__init__.py +41 -0
- cellitac-1.0.0/src/cellitac/_version.py +1 -0
- cellitac-1.0.0/src/cellitac/cli.py +147 -0
- cellitac-1.0.0/src/cellitac/config.py +229 -0
- cellitac-1.0.0/src/cellitac/mainModel.py +1155 -0
- cellitac-1.0.0/src/cellitac/pipeline.py +104 -0
- cellitac-1.0.0/src/cellitac/preprocessing.py +191 -0
- cellitac-1.0.0/src/cellitac/rscripts/__init__.py +4 -0
- cellitac-1.0.0/src/cellitac/rscripts/integration.R +157 -0
- cellitac-1.0.0/src/cellitac/rscripts/team1_rna.R +119 -0
- cellitac-1.0.0/src/cellitac/rscripts/team2_atac.R +145 -0
- cellitac-1.0.0/src/cellitac.egg-info/PKG-INFO +218 -0
- cellitac-1.0.0/src/cellitac.egg-info/SOURCES.txt +21 -0
- cellitac-1.0.0/src/cellitac.egg-info/dependency_links.txt +1 -0
- cellitac-1.0.0/src/cellitac.egg-info/entry_points.txt +4 -0
- cellitac-1.0.0/src/cellitac.egg-info/requires.txt +18 -0
- cellitac-1.0.0/src/cellitac.egg-info/top_level.txt +1 -0
- cellitac-1.0.0/tests/test_model.py +158 -0
cellitac-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Rana Hamed, Syrus, Emmanuel
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
cellitac-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cellitac
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Cell type identification using Transcription factor Analysis and Chromatin accessibility
|
|
5
|
+
Author-email: Rana H Abuzeid <ranahamed2111@gmail.com>, Olaitan Awe <laitanawe@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/omicscodeathon/cellitac/
|
|
8
|
+
Keywords: single-cell,scATAC-seq,scRNA-seq,multiome,cell-type-identification,transcription-factor,chromatin-accessibility,machine-learning
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: numpy>=1.24
|
|
21
|
+
Requires-Dist: pandas>=2.0
|
|
22
|
+
Requires-Dist: openpyxl>=3.1
|
|
23
|
+
Requires-Dist: rpy2>=3.5
|
|
24
|
+
Requires-Dist: scikit-learn>=1.3
|
|
25
|
+
Requires-Dist: xgboost>=2.0
|
|
26
|
+
Requires-Dist: imbalanced-learn>=0.11
|
|
27
|
+
Requires-Dist: sklearn-compat>=0.1.5
|
|
28
|
+
Requires-Dist: matplotlib>=3.7
|
|
29
|
+
Requires-Dist: seaborn>=0.12
|
|
30
|
+
Requires-Dist: plotly>=5.18
|
|
31
|
+
Requires-Dist: networkx>=3.1
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
35
|
+
Requires-Dist: black; extra == "dev"
|
|
36
|
+
Requires-Dist: ruff; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# cellitac
|
|
40
|
+
|
|
41
|
+
Single-Cell ATAC + RNA Multiome Processing & ML Classification Pipeline
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## What It Does
|
|
46
|
+
|
|
47
|
+
| Stage | Steps | Tools |
|
|
48
|
+
|-------|-------|-------|
|
|
49
|
+
| **Preprocessing** | RNA QC → normalization → cell-type annotation | Seurat + SingleR (R via rpy2) |
|
|
50
|
+
| **Preprocessing** | ATAC QC → TF-IDF → LSI | Signac (R via rpy2) |
|
|
51
|
+
| **Preprocessing** | RNA + ATAC integration → ML-ready CSVs | Pure Python |
|
|
52
|
+
| **ML** | Imbalance analysis → SMOTE → feature selection | scikit-learn, imbalanced-learn |
|
|
53
|
+
| **ML** | RF + XGBoost + SVM training & evaluation | scikit-learn, xgboost |
|
|
54
|
+
| **ML** | 19 plots + JSON report + XLSX | matplotlib, seaborn, networkx |
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
### Option A – Local / Team (pip install -e)
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
git clone https://github.com/omicscodeathon/cellitac.git
|
|
64
|
+
cd cellitac
|
|
65
|
+
|
|
66
|
+
# Install R packages (run once inside R)
|
|
67
|
+
Rscript -e "
|
|
68
|
+
install.packages('BiocManager')
|
|
69
|
+
BiocManager::install(c(
|
|
70
|
+
'Seurat', 'Signac', 'SingleR', 'celldex',
|
|
71
|
+
'SingleCellExperiment', 'GenomicRanges',
|
|
72
|
+
'EnsDb.Hsapiens.v75', 'biovizBase', 'hdf5r'
|
|
73
|
+
))
|
|
74
|
+
"
|
|
75
|
+
|
|
76
|
+
# Install Python package
|
|
77
|
+
pip install -e ".[dev]"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Option B – PyPI
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install cellitac
|
|
84
|
+
# R must be installed separately
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Option C – Docker (recommended for full reproducibility)
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
docker build -t scatactf:1.0.0 -f docker/Dockerfile .
|
|
91
|
+
|
|
92
|
+
docker run --rm \
|
|
93
|
+
-v /your/data:/data \
|
|
94
|
+
-v $(pwd)/results:/results \
|
|
95
|
+
scatactf:1.0.0 \
|
|
96
|
+
--input /data --output /results
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Data Download
|
|
102
|
+
|
|
103
|
+
https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-no-cell-sorting-10-k-1-standard-1-0-0
|
|
104
|
+
|
|
105
|
+
Required files (place in your `--input` directory):
|
|
106
|
+
```
|
|
107
|
+
pbmc_unsorted_10k_filtered_feature_bc_matrix.h5
|
|
108
|
+
pbmc_unsorted_10k_per_barcode_metrics.csv
|
|
109
|
+
pbmc_unsorted_10k_atac_fragments.tsv.gz
|
|
110
|
+
pbmc_unsorted_10k_atac_fragments.tsv.gz.tbi
|
|
111
|
+
pbmc_unsorted_10k_atac_peaks.bed
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Usage
|
|
117
|
+
|
|
118
|
+
### Command Line
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
# Full pipeline (preprocessing + ML)
|
|
122
|
+
cellitac --input ~/singlecell/ATAC --output my_results
|
|
123
|
+
|
|
124
|
+
# Preprocessing only (generates python_ready_data/)
|
|
125
|
+
cellitac-preprocess --input ~/singlecell/ATAC --output my_results
|
|
126
|
+
|
|
127
|
+
# ML only (if you already have python_ready_data/)
|
|
128
|
+
cellitac-model --data my_results/python_ready_data --output my_results/ml
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Python API
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from cellitac import run_full_pipeline, run_preprocessing, run_model
|
|
135
|
+
|
|
136
|
+
# Full pipeline
|
|
137
|
+
run_full_pipeline(input_dir="~/singlecell/ATAC", output_dir="my_results")
|
|
138
|
+
|
|
139
|
+
# Preprocessing only
|
|
140
|
+
run_preprocessing(input_dir="~/singlecell/ATAC", output_dir_python="python_ready_data")
|
|
141
|
+
|
|
142
|
+
# ML only
|
|
143
|
+
run_model(data_dir="python_ready_data", output_dir="ml_results")
|
|
144
|
+
|
|
145
|
+
# Use the ML class directly for more control
|
|
146
|
+
from cellitac.mainModel import scATACMLPipeline
|
|
147
|
+
pipeline = scATACMLPipeline(data_dir="python_ready_data", output_dir="ml_results")
|
|
148
|
+
pipeline.run_complete_pipeline()
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Environment Variables
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
export SCATAC_INPUT_DIR=~/singlecell/ATAC
|
|
155
|
+
export SCATAC_OUT_ML=ml_results
|
|
156
|
+
cellitac
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Output Files
|
|
162
|
+
|
|
163
|
+
### ml_results/
|
|
164
|
+
| File | Description |
|
|
165
|
+
|------|-------------|
|
|
166
|
+
| `ml_pipeline_report.json` | Full JSON report |
|
|
167
|
+
| `model_performance_summary.csv` | Accuracy/F1/AUC per model |
|
|
168
|
+
| `detailed_model_results.xlsx` | Per-class metrics, CV results |
|
|
169
|
+
| `model_performance_comparison.png` | Bar chart comparison |
|
|
170
|
+
| `confusion_matrices.png` | Confusion matrices |
|
|
171
|
+
| `class_distribution_analysis.png` | Cell type distribution |
|
|
172
|
+
| `class_balancing_comparison.png` | Before/after SMOTE |
|
|
173
|
+
| `feature_importance.png` | RF + XGBoost top 20 features |
|
|
174
|
+
| `simple_feature_heatmap.png` | Feature importance heatmap |
|
|
175
|
+
| `overfitting_analysis.png` | CV train vs validation |
|
|
176
|
+
| `learning_curves.png` | Learning curves per model |
|
|
177
|
+
| `performance_radar.png` | Radar chart |
|
|
178
|
+
| `feature_distributions.png` | Violin plots |
|
|
179
|
+
| `class_separation_pca.png` | PCA scatter |
|
|
180
|
+
| `basic_tf_network.png` | Feature–cell-type network |
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Package Structure
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
cellitac/
|
|
188
|
+
├── src/cellitac/
|
|
189
|
+
│ ├── __init__.py # Public API
|
|
190
|
+
│ ├── _version.py
|
|
191
|
+
│ ├── config.py # All parameters (paths, QC thresholds, ML hyperparams)
|
|
192
|
+
│ ├── pipeline.py # run_preprocessing, run_model, run_full_pipeline
|
|
193
|
+
│ ├── preprocessing.py # R preprocessing via rpy2
|
|
194
|
+
│ ├── mainModel.py # scATACMLPipeline class (19-step ML pipeline)
|
|
195
|
+
│ ├── cli.py # cellitac / cellitac-preprocess / cellitac-model
|
|
196
|
+
│ └── rscripts/
|
|
197
|
+
│ ├── team1_rna.R # Exact Seurat + SingleR code
|
|
198
|
+
│ └── team2_atac.R # Exact Signac code
|
|
199
|
+
├── tests/
|
|
200
|
+
│ └── test_model.py
|
|
201
|
+
├── pyproject.toml
|
|
202
|
+
└── README.md
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Tests
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
pip install -e ".[dev]"
|
|
211
|
+
pytest tests/ -v
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## License
|
|
217
|
+
|
|
218
|
+
MIT
|
cellitac-1.0.0/README.md
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# cellitac
|
|
2
|
+
|
|
3
|
+
Single-Cell ATAC + RNA Multiome Processing & ML Classification Pipeline
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## What It Does
|
|
8
|
+
|
|
9
|
+
| Stage | Steps | Tools |
|
|
10
|
+
|-------|-------|-------|
|
|
11
|
+
| **Preprocessing** | RNA QC → normalization → cell-type annotation | Seurat + SingleR (R via rpy2) |
|
|
12
|
+
| **Preprocessing** | ATAC QC → TF-IDF → LSI | Signac (R via rpy2) |
|
|
13
|
+
| **Preprocessing** | RNA + ATAC integration → ML-ready CSVs | Pure Python |
|
|
14
|
+
| **ML** | Imbalance analysis → SMOTE → feature selection | scikit-learn, imbalanced-learn |
|
|
15
|
+
| **ML** | RF + XGBoost + SVM training & evaluation | scikit-learn, xgboost |
|
|
16
|
+
| **ML** | 19 plots + JSON report + XLSX | matplotlib, seaborn, networkx |
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
### Option A – Local / Team (pip install -e)
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
git clone https://github.com/omicscodeathon/cellitac.git
|
|
26
|
+
cd cellitac
|
|
27
|
+
|
|
28
|
+
# Install R packages (run once inside R)
|
|
29
|
+
Rscript -e "
|
|
30
|
+
install.packages('BiocManager')
|
|
31
|
+
BiocManager::install(c(
|
|
32
|
+
'Seurat', 'Signac', 'SingleR', 'celldex',
|
|
33
|
+
'SingleCellExperiment', 'GenomicRanges',
|
|
34
|
+
'EnsDb.Hsapiens.v75', 'biovizBase', 'hdf5r'
|
|
35
|
+
))
|
|
36
|
+
"
|
|
37
|
+
|
|
38
|
+
# Install Python package
|
|
39
|
+
pip install -e ".[dev]"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Option B – PyPI
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install cellitac
|
|
46
|
+
# R must be installed separately
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Option C – Docker (recommended for full reproducibility)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
docker build -t scatactf:1.0.0 -f docker/Dockerfile .
|
|
53
|
+
|
|
54
|
+
docker run --rm \
|
|
55
|
+
-v /your/data:/data \
|
|
56
|
+
-v $(pwd)/results:/results \
|
|
57
|
+
scatactf:1.0.0 \
|
|
58
|
+
--input /data --output /results
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Data Download
|
|
64
|
+
|
|
65
|
+
https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-no-cell-sorting-10-k-1-standard-1-0-0
|
|
66
|
+
|
|
67
|
+
Required files (place in your `--input` directory):
|
|
68
|
+
```
|
|
69
|
+
pbmc_unsorted_10k_filtered_feature_bc_matrix.h5
|
|
70
|
+
pbmc_unsorted_10k_per_barcode_metrics.csv
|
|
71
|
+
pbmc_unsorted_10k_atac_fragments.tsv.gz
|
|
72
|
+
pbmc_unsorted_10k_atac_fragments.tsv.gz.tbi
|
|
73
|
+
pbmc_unsorted_10k_atac_peaks.bed
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Usage
|
|
79
|
+
|
|
80
|
+
### Command Line
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# Full pipeline (preprocessing + ML)
|
|
84
|
+
cellitac --input ~/singlecell/ATAC --output my_results
|
|
85
|
+
|
|
86
|
+
# Preprocessing only (generates python_ready_data/)
|
|
87
|
+
cellitac-preprocess --input ~/singlecell/ATAC --output my_results
|
|
88
|
+
|
|
89
|
+
# ML only (if you already have python_ready_data/)
|
|
90
|
+
cellitac-model --data my_results/python_ready_data --output my_results/ml
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Python API
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from cellitac import run_full_pipeline, run_preprocessing, run_model
|
|
97
|
+
|
|
98
|
+
# Full pipeline
|
|
99
|
+
run_full_pipeline(input_dir="~/singlecell/ATAC", output_dir="my_results")
|
|
100
|
+
|
|
101
|
+
# Preprocessing only
|
|
102
|
+
run_preprocessing(input_dir="~/singlecell/ATAC", output_dir_python="python_ready_data")
|
|
103
|
+
|
|
104
|
+
# ML only
|
|
105
|
+
run_model(data_dir="python_ready_data", output_dir="ml_results")
|
|
106
|
+
|
|
107
|
+
# Use the ML class directly for more control
|
|
108
|
+
from cellitac.mainModel import scATACMLPipeline
|
|
109
|
+
pipeline = scATACMLPipeline(data_dir="python_ready_data", output_dir="ml_results")
|
|
110
|
+
pipeline.run_complete_pipeline()
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Environment Variables
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
export SCATAC_INPUT_DIR=~/singlecell/ATAC
|
|
117
|
+
export SCATAC_OUT_ML=ml_results
|
|
118
|
+
cellitac
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Output Files
|
|
124
|
+
|
|
125
|
+
### ml_results/
|
|
126
|
+
| File | Description |
|
|
127
|
+
|------|-------------|
|
|
128
|
+
| `ml_pipeline_report.json` | Full JSON report |
|
|
129
|
+
| `model_performance_summary.csv` | Accuracy/F1/AUC per model |
|
|
130
|
+
| `detailed_model_results.xlsx` | Per-class metrics, CV results |
|
|
131
|
+
| `model_performance_comparison.png` | Bar chart comparison |
|
|
132
|
+
| `confusion_matrices.png` | Confusion matrices |
|
|
133
|
+
| `class_distribution_analysis.png` | Cell type distribution |
|
|
134
|
+
| `class_balancing_comparison.png` | Before/after SMOTE |
|
|
135
|
+
| `feature_importance.png` | RF + XGBoost top 20 features |
|
|
136
|
+
| `simple_feature_heatmap.png` | Feature importance heatmap |
|
|
137
|
+
| `overfitting_analysis.png` | CV train vs validation |
|
|
138
|
+
| `learning_curves.png` | Learning curves per model |
|
|
139
|
+
| `performance_radar.png` | Radar chart |
|
|
140
|
+
| `feature_distributions.png` | Violin plots |
|
|
141
|
+
| `class_separation_pca.png` | PCA scatter |
|
|
142
|
+
| `basic_tf_network.png` | Feature–cell-type network |
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## Package Structure
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
cellitac/
|
|
150
|
+
├── src/cellitac/
|
|
151
|
+
│ ├── __init__.py # Public API
|
|
152
|
+
│ ├── _version.py
|
|
153
|
+
│ ├── config.py # All parameters (paths, QC thresholds, ML hyperparams)
|
|
154
|
+
│ ├── pipeline.py # run_preprocessing, run_model, run_full_pipeline
|
|
155
|
+
│ ├── preprocessing.py # R preprocessing via rpy2
|
|
156
|
+
│ ├── mainModel.py # scATACMLPipeline class (19-step ML pipeline)
|
|
157
|
+
│ ├── cli.py # cellitac / cellitac-preprocess / cellitac-model
|
|
158
|
+
│ └── rscripts/
|
|
159
|
+
│ ├── team1_rna.R # Exact Seurat + SingleR code
|
|
160
|
+
│ └── team2_atac.R # Exact Signac code
|
|
161
|
+
├── tests/
|
|
162
|
+
│ └── test_model.py
|
|
163
|
+
├── pyproject.toml
|
|
164
|
+
└── README.md
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Tests
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
pip install -e ".[dev]"
|
|
173
|
+
pytest tests/ -v
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## License
|
|
179
|
+
|
|
180
|
+
MIT
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "cellitac"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Cell type identification using Transcription factor Analysis and Chromatin accessibility"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Rana H Abuzeid", email = "ranahamed2111@gmail.com"},
|
|
14
|
+
{name = "Olaitan Awe", email = "laitanawe@gmail.com"}
|
|
15
|
+
]
|
|
16
|
+
keywords = [
|
|
17
|
+
"single-cell", "scATAC-seq", "scRNA-seq", "multiome",
|
|
18
|
+
"cell-type-identification", "transcription-factor",
|
|
19
|
+
"chromatin-accessibility", "machine-learning"
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Programming Language :: Python :: 3",
|
|
24
|
+
"Programming Language :: Python :: 3.9",
|
|
25
|
+
"Programming Language :: Python :: 3.10",
|
|
26
|
+
"Programming Language :: Python :: 3.11",
|
|
27
|
+
"License :: OSI Approved :: MIT License",
|
|
28
|
+
"Operating System :: OS Independent",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
30
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
31
|
+
]
|
|
32
|
+
requires-python = ">=3.9"
|
|
33
|
+
|
|
34
|
+
dependencies = [
|
|
35
|
+
# Data
|
|
36
|
+
"numpy>=1.24",
|
|
37
|
+
"pandas>=2.0",
|
|
38
|
+
"openpyxl>=3.1",
|
|
39
|
+
# rpy2 bridge
|
|
40
|
+
"rpy2>=3.5",
|
|
41
|
+
# ML
|
|
42
|
+
"scikit-learn>=1.3",
|
|
43
|
+
"xgboost>=2.0",
|
|
44
|
+
"imbalanced-learn>=0.11",
|
|
45
|
+
"sklearn-compat>=0.1.5",
|
|
46
|
+
# Visualization
|
|
47
|
+
"matplotlib>=3.7",
|
|
48
|
+
"seaborn>=0.12",
|
|
49
|
+
"plotly>=5.18",
|
|
50
|
+
"networkx>=3.1",
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
[project.optional-dependencies]
|
|
54
|
+
dev = [
|
|
55
|
+
"pytest>=7.0",
|
|
56
|
+
"pytest-cov",
|
|
57
|
+
"black",
|
|
58
|
+
"ruff",
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
[project.scripts]
|
|
62
|
+
cellitac = "cellitac.cli:main"
|
|
63
|
+
cellitac-preprocess = "cellitac.cli:run_preprocess"
|
|
64
|
+
cellitac-model = "cellitac.cli:run_model_cli"
|
|
65
|
+
|
|
66
|
+
[project.urls]
|
|
67
|
+
Homepage = "https://github.com/omicscodeathon/cellitac/"
|
|
68
|
+
|
|
69
|
+
[tool.setuptools.packages.find]
|
|
70
|
+
where = ["src"]
|
|
71
|
+
|
|
72
|
+
[tool.setuptools.package-data]
|
|
73
|
+
cellitac = ["rscripts/*.R"]
|
cellitac-1.0.0/setup.cfg
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cellitac
|
|
3
|
+
========
|
|
4
|
+
Single-Cell ATAC + RNA Multiome Processing & ML Classification Pipeline.
|
|
5
|
+
|
|
6
|
+
This package runs two stages end-to-end:
|
|
7
|
+
|
|
8
|
+
Stage 1 – Preprocessing (R via rpy2)
|
|
9
|
+
• Team 1: RNA processing, QC, normalization, cell-type annotation
|
|
10
|
+
(Seurat + SingleR + celldex)
|
|
11
|
+
• Team 2: ATAC processing, QC, TF-IDF, LSI
|
|
12
|
+
(Signac + EnsDb)
|
|
13
|
+
• Integration: combine RNA + ATAC → ML-ready CSVs
|
|
14
|
+
|
|
15
|
+
Stage 2 – ML Classification (pure Python)
|
|
16
|
+
• Class imbalance analysis & SMOTE balancing
|
|
17
|
+
• Feature engineering & selection
|
|
18
|
+
• Random Forest, XGBoost, SVM training
|
|
19
|
+
• Evaluation, visualizations, reports
|
|
20
|
+
|
|
21
|
+
Quick start
|
|
22
|
+
-----------
|
|
23
|
+
>>> from cellitac import run_preprocessing, run_model, run_full_pipeline
|
|
24
|
+
>>> run_full_pipeline(input_dir="~/singlecell/ATAC", output_dir="results")
|
|
25
|
+
|
|
26
|
+
Or from the command line::
|
|
27
|
+
|
|
28
|
+
cellitac --input ~/singlecell/ATAC --output results
|
|
29
|
+
cellitac-preprocess --input ~/singlecell/ATAC
|
|
30
|
+
cellitac-model --data python_ready_data --output ml_results
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from cellitac._version import __version__
|
|
34
|
+
from cellitac.pipeline import run_preprocessing, run_model, run_full_pipeline
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"run_preprocessing",
|
|
38
|
+
"run_model",
|
|
39
|
+
"run_full_pipeline",
|
|
40
|
+
"__version__",
|
|
41
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.0"
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cellitac.cli
|
|
3
|
+
============
|
|
4
|
+
Command-line interface for the cellitac package.
|
|
5
|
+
|
|
6
|
+
Installed entry points (pyproject.toml):
|
|
7
|
+
cellitac → run full pipeline (preprocessing + ML)
|
|
8
|
+
cellitac-preprocess → run preprocessing only (R steps)
|
|
9
|
+
cellitac-model → run ML model only (on existing python_ready_data)
|
|
10
|
+
|
|
11
|
+
Examples
|
|
12
|
+
--------
|
|
13
|
+
# Full pipeline
|
|
14
|
+
cellitac --input ~/singlecell/ATAC --output results/
|
|
15
|
+
|
|
16
|
+
# Preprocessing only
|
|
17
|
+
cellitac-preprocess --input ~/singlecell/ATAC --output results/
|
|
18
|
+
|
|
19
|
+
# ML only (after preprocessing)
|
|
20
|
+
cellitac-model --data results/python_ready_data --output results/ml_results
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import sys
|
|
25
|
+
|
|
26
|
+
from cellitac import config as cfg
|
|
27
|
+
from cellitac.pipeline import run_full_pipeline, run_preprocessing, run_model
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ============================================================================
|
|
31
|
+
# Full pipeline CLI
|
|
32
|
+
# ============================================================================
|
|
33
|
+
def main(argv=None):
    """Entry point for the ``cellitac`` command: preprocessing followed by ML.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. ``None`` (the default, and what the installed
        console script uses) makes argparse read ``sys.argv[1:]``.

    Notes
    -----
    This function never returns normally; it always terminates through
    ``sys.exit``. The exit status is 0 when ``run_full_pipeline`` returns a
    truthy value and 1 when it returns a falsy value, is interrupted with
    Ctrl-C, or raises any other exception.
    """
    # Local import so the reported version always tracks the package version
    # instead of a second hard-coded copy of the string.
    from cellitac._version import __version__

    parser = argparse.ArgumentParser(
        prog="cellitac",
        description="scATAC + RNA Multiome Processing & ML Classification Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  cellitac --input ~/singlecell/ATAC --output my_results
  cellitac --input /data/pbmc10k
        """,
    )
    parser.add_argument(
        "--input", "-i",
        default=cfg.INPUT_DIR,
        metavar="DIR",
        help=f"Raw 10x data directory (default: {cfg.INPUT_DIR})",
    )
    parser.add_argument(
        "--output", "-o",
        default="cellitac_results",
        metavar="DIR",
        help="Base output directory (default: cellitac_results)",
    )
    parser.add_argument(
        "--version", "-v",
        action="version",
        version=f"cellitac {__version__}",
    )

    args = parser.parse_args(argv)
    try:
        success = run_full_pipeline(input_dir=args.input, output_dir=args.output)
        # SystemExit is not an Exception subclass, so it passes through the
        # handlers below untouched.
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Top-level CLI boundary: report the failure and exit non-zero
        # rather than dumping a traceback on the user.
        print(f"[ERROR] {exc}", file=sys.stderr)
        sys.exit(1)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ============================================================================
|
|
75
|
+
# Preprocessing-only CLI
|
|
76
|
+
# ============================================================================
|
|
77
|
+
def run_preprocess(argv=None):
    """Entry point for ``cellitac-preprocess``: run the preprocessing stage only.

    The three stage output directories (``team1_rna_output``,
    ``team2_atac_output`` and ``python_ready_data``) are created as
    sub-paths of the ``--output`` base directory and handed to
    ``run_preprocessing``.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. ``None`` (the default, and what the installed
        console script uses) makes argparse read ``sys.argv[1:]``.

    Notes
    -----
    This function never returns normally; it always terminates through
    ``sys.exit``: status 0 when ``run_preprocessing`` finishes without
    raising, 1 on Ctrl-C or any other exception.
    """
    import os

    # Local import so the reported version always tracks the package version
    # instead of a second hard-coded copy of the string.
    from cellitac._version import __version__

    parser = argparse.ArgumentParser(
        prog="cellitac-preprocess",
        description="Run preprocessing only: RNA (Seurat) + ATAC (Signac) + Integration",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  cellitac-preprocess --input ~/singlecell/ATAC
  cellitac-preprocess --input /data --output custom_results/
        """,
    )
    parser.add_argument("--input", "-i", default=cfg.INPUT_DIR,
                        metavar="DIR", help="Raw data directory")
    parser.add_argument("--output", "-o", default="cellitac_results",
                        metavar="DIR", help="Base output directory")
    # "-v" added so the short flag matches the main ``cellitac`` command.
    parser.add_argument("--version", "-v", action="version",
                        version=f"cellitac {__version__}")

    args = parser.parse_args(argv)

    # Every stage writes beneath the single user-supplied base directory.
    team1_dir = os.path.join(args.output, "team1_rna_output")
    team2_dir = os.path.join(args.output, "team2_atac_output")
    python_dir = os.path.join(args.output, "python_ready_data")

    try:
        # NOTE(review): the return value is not inspected here, unlike the
        # other two entry points — confirm whether run_preprocessing signals
        # failure by return value or only by raising.
        run_preprocessing(
            input_dir=args.input,
            output_dir_team1=team1_dir,
            output_dir_team2=team2_dir,
            output_dir_python=python_dir,
        )
        print(f"\nPreprocessing complete. ML-ready data in: {python_dir}")
        sys.exit(0)
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Top-level CLI boundary: report the failure and exit non-zero.
        print(f"[ERROR] {exc}", file=sys.stderr)
        sys.exit(1)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ============================================================================
|
|
119
|
+
# ML-only CLI
|
|
120
|
+
# ============================================================================
|
|
121
|
+
def run_model_cli(argv=None):
    """Entry point for ``cellitac-model``: run ML on existing preprocessed data.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. ``None`` (the default, and what the installed
        console script uses) makes argparse read ``sys.argv[1:]``.

    Notes
    -----
    This function never returns normally; it always terminates through
    ``sys.exit``. The exit status is 0 when ``run_model`` returns a truthy
    value and 1 when it returns a falsy value, is interrupted with Ctrl-C,
    or raises any other exception.
    """
    # Local import so the reported version always tracks the package version
    # instead of a second hard-coded copy of the string.
    from cellitac._version import __version__

    parser = argparse.ArgumentParser(
        prog="cellitac-model",
        description="Run ML classification on existing python_ready_data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  cellitac-model --data python_ready_data --output ml_results
  cellitac-model --data cellitac_results/python_ready_data
        """,
    )
    parser.add_argument("--data", "-d", default=cfg.OUTPUT_DIR_PYTHON,
                        metavar="DIR", help="python_ready_data directory")
    parser.add_argument("--output", "-o", default=cfg.OUTPUT_DIR_ML,
                        metavar="DIR", help="ML results output directory")
    # "-v" added so the short flag matches the main ``cellitac`` command.
    parser.add_argument("--version", "-v", action="version",
                        version=f"cellitac {__version__}")

    args = parser.parse_args(argv)
    try:
        success = run_model(data_dir=args.data, output_dir=args.output)
        # SystemExit is not an Exception subclass, so it passes through the
        # handlers below untouched.
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\nInterrupted.", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        # Top-level CLI boundary: report the failure and exit non-zero.
        print(f"[ERROR] {exc}", file=sys.stderr)
        sys.exit(1)
|