M3Drop 0.4.42__py3-none-any.whl → 0.4.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- m3Drop/ControlDeviceCPU.py +218 -0
- m3Drop/ControlDeviceGPU.py +236 -0
- m3Drop/CoreCPU.py +508 -0
- m3Drop/CoreGPU.py +506 -0
- m3Drop/DiagnosticsCPU.py +401 -0
- m3Drop/DiagnosticsGPU.py +420 -0
- m3Drop/NormalizationCPU.py +199 -0
- m3Drop/{normalizationGPU.py → NormalizationGPU.py} +3 -5
- m3Drop/__init__.py +42 -51
- {m3drop-0.4.42.dist-info → m3drop-0.4.45.dist-info}/METADATA +4 -2
- m3drop-0.4.45.dist-info/RECORD +14 -0
- {m3drop-0.4.42.dist-info → m3drop-0.4.45.dist-info}/WHEEL +1 -1
- m3Drop/coreCPU.py +0 -477
- m3Drop/coreGPU.py +0 -591
- m3Drop/diagnosticsCPU.py +0 -391
- m3Drop/diagnosticsGPU.py +0 -481
- m3Drop/normalizationCPU.py +0 -146
- m3drop-0.4.42.dist-info/RECORD +0 -12
- {m3drop-0.4.42.dist-info → m3drop-0.4.45.dist-info}/licenses/LICENSE +0 -0
- {m3drop-0.4.42.dist-info → m3drop-0.4.45.dist-info}/top_level.txt +0 -0
m3Drop/__init__.py
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
|
-
# M3Drop
|
|
2
|
-
# This file imports all CPU and GPU functions to make them
|
|
3
|
-
# directly accessible from the main package.
|
|
1
|
+
# M3Drop/__init__.py
|
|
4
2
|
|
|
5
3
|
# --- CPU Functions ---
|
|
6
4
|
|
|
7
|
-
# From coreCPU.py
|
|
8
|
-
from .coreCPU import (
|
|
5
|
+
# From CoreCPU.py
|
|
6
|
+
from .CoreCPU import (
|
|
9
7
|
ConvertDataSparseCPU,
|
|
10
8
|
hidden_calc_valsCPU,
|
|
11
9
|
NBumiFitModelCPU,
|
|
@@ -15,56 +13,57 @@ from .coreCPU import (
|
|
|
15
13
|
NBumiCombinedDropVolcanoCPU,
|
|
16
14
|
)
|
|
17
15
|
|
|
18
|
-
# From diagnosticsCPU.py
|
|
19
|
-
from .diagnosticsCPU import (
|
|
16
|
+
# From DiagnosticsCPU.py
|
|
17
|
+
from .DiagnosticsCPU import (
|
|
20
18
|
NBumiFitBasicModelCPU,
|
|
21
19
|
NBumiCheckFitFSCPU,
|
|
22
20
|
NBumiCompareModelsCPU,
|
|
23
21
|
NBumiPlotDispVsMeanCPU,
|
|
24
22
|
)
|
|
25
23
|
|
|
26
|
-
# From normalizationCPU.py
|
|
27
|
-
from .normalizationCPU import (
|
|
28
|
-
|
|
29
|
-
NBumiPearsonResidualsApproxCPU,
|
|
24
|
+
# From NormalizationCPU.py
|
|
25
|
+
from .NormalizationCPU import (
|
|
26
|
+
NBumiPearsonResidualsCombinedCPU,
|
|
30
27
|
)
|
|
31
28
|
|
|
29
|
+
# --- GPU Functions (Placeholders based on your request) ---
|
|
32
30
|
|
|
33
|
-
#
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
# From diagnosticsGPU.py
|
|
48
|
-
from .diagnosticsGPU import (
|
|
49
|
-
NBumiFitBasicModelGPU,
|
|
50
|
-
NBumiCheckFitFSGPU,
|
|
51
|
-
NBumiCompareModelsGPU,
|
|
52
|
-
NBumiPlotDispVsMeanGPU,
|
|
53
|
-
)
|
|
54
|
-
|
|
55
|
-
# From normalizationGPU.py
|
|
56
|
-
from .normalizationGPU import (
|
|
57
|
-
NBumiPearsonResidualsGPU,
|
|
58
|
-
NBumiPearsonResidualsApproxGPU,
|
|
59
|
-
)
|
|
31
|
+
# From CoreGPU.py
|
|
32
|
+
try:
|
|
33
|
+
from .CoreGPU import (
|
|
34
|
+
ConvertDataSparseGPU,
|
|
35
|
+
hidden_calc_valsGPU,
|
|
36
|
+
NBumiFitModelGPU,
|
|
37
|
+
NBumiFitDispVsMeanGPU,
|
|
38
|
+
NBumiFeatureSelectionHighVarGPU,
|
|
39
|
+
NBumiFeatureSelectionCombinedDropGPU,
|
|
40
|
+
NBumiCombinedDropVolcanoGPU,
|
|
41
|
+
)
|
|
42
|
+
except ImportError:
|
|
43
|
+
pass # Handle case where GPU modules might not be present on CPU node
|
|
60
44
|
|
|
45
|
+
# From DiagnosticsGPU.py
|
|
46
|
+
try:
|
|
47
|
+
from .DiagnosticsGPU import (
|
|
48
|
+
NBumiFitBasicModelGPU,
|
|
49
|
+
NBumiCheckFitFSGPU,
|
|
50
|
+
NBumiCompareModelsGPU,
|
|
51
|
+
NBumiPlotDispVsMeanGPU,
|
|
52
|
+
)
|
|
53
|
+
except ImportError:
|
|
54
|
+
pass
|
|
61
55
|
|
|
62
|
-
#
|
|
63
|
-
|
|
56
|
+
# From NormalizationGPU.py
|
|
57
|
+
try:
|
|
58
|
+
from .NormalizationGPU import (
|
|
59
|
+
NBumiPearsonResidualsCombinedGPU,
|
|
60
|
+
)
|
|
61
|
+
except ImportError:
|
|
62
|
+
pass
|
|
64
63
|
|
|
64
|
+
# --- Public API ---
|
|
65
65
|
__all__ = [
|
|
66
66
|
# --- CPU ---
|
|
67
|
-
# coreCPU
|
|
68
67
|
'ConvertDataSparseCPU',
|
|
69
68
|
'hidden_calc_valsCPU',
|
|
70
69
|
'NBumiFitModelCPU',
|
|
@@ -73,18 +72,14 @@ __all__ = [
|
|
|
73
72
|
'NBumiFeatureSelectionCombinedDropCPU',
|
|
74
73
|
'NBumiCombinedDropVolcanoCPU',
|
|
75
74
|
|
|
76
|
-
# diagnosticsCPU
|
|
77
75
|
'NBumiFitBasicModelCPU',
|
|
78
76
|
'NBumiCheckFitFSCPU',
|
|
79
77
|
'NBumiCompareModelsCPU',
|
|
80
78
|
'NBumiPlotDispVsMeanCPU',
|
|
81
79
|
|
|
82
|
-
|
|
83
|
-
'NBumiPearsonResidualsCPU',
|
|
84
|
-
'NBumiPearsonResidualsApproxCPU',
|
|
80
|
+
'NBumiPearsonResidualsCombinedCPU',
|
|
85
81
|
|
|
86
82
|
# --- GPU ---
|
|
87
|
-
# coreGPU
|
|
88
83
|
'ConvertDataSparseGPU',
|
|
89
84
|
'hidden_calc_valsGPU',
|
|
90
85
|
'NBumiFitModelGPU',
|
|
@@ -92,15 +87,11 @@ __all__ = [
|
|
|
92
87
|
'NBumiFeatureSelectionHighVarGPU',
|
|
93
88
|
'NBumiFeatureSelectionCombinedDropGPU',
|
|
94
89
|
'NBumiCombinedDropVolcanoGPU',
|
|
95
|
-
'get_optimal_chunk_size',
|
|
96
90
|
|
|
97
|
-
# diagnosticsGPU
|
|
98
91
|
'NBumiFitBasicModelGPU',
|
|
99
92
|
'NBumiCheckFitFSGPU',
|
|
100
93
|
'NBumiCompareModelsGPU',
|
|
101
94
|
'NBumiPlotDispVsMeanGPU',
|
|
102
95
|
|
|
103
|
-
|
|
104
|
-
'NBumiPearsonResidualsGPU',
|
|
105
|
-
'NBumiPearsonResidualsApproxGPU',
|
|
96
|
+
'NBumiPearsonResidualsCombinedGPU',
|
|
106
97
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: M3Drop
|
|
3
|
-
Version: 0.4.42
|
|
3
|
+
Version: 0.4.45
|
|
4
4
|
Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
|
|
5
5
|
Home-page: https://github.com/PragalvhaSharma/m3DropNew
|
|
6
6
|
Author: Tallulah Andrews
|
|
@@ -21,12 +21,14 @@ Requires-Dist: matplotlib-venn>=0.11
|
|
|
21
21
|
Requires-Dist: memory_profiler>=0.60.0
|
|
22
22
|
Requires-Dist: numpy>=1.21.0
|
|
23
23
|
Requires-Dist: pandas>=1.5.0
|
|
24
|
-
Requires-Dist: py-cpuinfo
|
|
25
24
|
Requires-Dist: scanpy>=1.9.0
|
|
26
25
|
Requires-Dist: scikit-learn>=1.0.0
|
|
27
26
|
Requires-Dist: scipy>=1.8.0
|
|
28
27
|
Requires-Dist: seaborn>=0.11.0
|
|
29
28
|
Requires-Dist: statsmodels>=0.13.0
|
|
29
|
+
Requires-Dist: numba>=0.57.0
|
|
30
|
+
Requires-Dist: psutil>=5.9.0
|
|
31
|
+
Requires-Dist: py-cpuinfo
|
|
30
32
|
Provides-Extra: gpu
|
|
31
33
|
Requires-Dist: cupy-cuda12x; extra == "gpu"
|
|
32
34
|
Dynamic: author
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
m3Drop/ControlDeviceCPU.py,sha256=8P-hxd4thc2wSeon73b9rz3clIGkE3x0cEE82RiGFds,8880
|
|
2
|
+
m3Drop/ControlDeviceGPU.py,sha256=4nzPtgyV0RsEOeezwCVJ7oyDOsp9-dRVLczlduUocpU,9143
|
|
3
|
+
m3Drop/CoreCPU.py,sha256=csRg5TLQx1Sup7k3lDJm9OO5Oe5-1aC3u_6ldE_GIX8,18679
|
|
4
|
+
m3Drop/CoreGPU.py,sha256=6LToLuWyHxX_7sC2z0Xnvy_qqgmpew5DmnCV0PxmTZQ,19785
|
|
5
|
+
m3Drop/DiagnosticsCPU.py,sha256=l0Imkh3F3zo4ovihUjx7cYWYgzPdztWCN1hcBFO43nY,12943
|
|
6
|
+
m3Drop/DiagnosticsGPU.py,sha256=z5BMOZNo_ruMBaDWJIE6zWhMUtf2ItY5Vcgu4N9lbok,14321
|
|
7
|
+
m3Drop/NormalizationCPU.py,sha256=Mm8VzWDu-NONbp-ngAt4PLjCKAGc7gJZKf-Yd-U95r0,7255
|
|
8
|
+
m3Drop/NormalizationGPU.py,sha256=3gRO82_6hSzB4rxmTRGocRUO2hy--i-szGCAY6FBnAI,8462
|
|
9
|
+
m3Drop/__init__.py,sha256=_J5p4bb_RAD6k_bnJUqj0DfA_akZMjd-AXzcVQpkW_g,2240
|
|
10
|
+
m3drop-0.4.45.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
|
|
11
|
+
m3drop-0.4.45.dist-info/METADATA,sha256=kFQ74ZykcHo4U-NKp-fISDj1UnF2VSsghT3rzgCiUxw,5248
|
|
12
|
+
m3drop-0.4.45.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
13
|
+
m3drop-0.4.45.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
|
|
14
|
+
m3drop-0.4.45.dist-info/RECORD,,
|
m3Drop/coreCPU.py
DELETED
|
@@ -1,477 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import anndata
|
|
3
|
-
import h5py
|
|
4
|
-
import pandas as pd
|
|
5
|
-
import time
|
|
6
|
-
|
|
7
|
-
from scipy.sparse import csr_matrix as sp_csr_matrix
|
|
8
|
-
|
|
9
|
-
import statsmodels.api as sm
|
|
10
|
-
import matplotlib.pyplot as plt
|
|
11
|
-
from scipy.stats import norm
|
|
12
|
-
from statsmodels.stats.multitest import multipletests
|
|
13
|
-
def ConvertDataSparseCPU(
|
|
14
|
-
input_filename: str,
|
|
15
|
-
output_filename: str,
|
|
16
|
-
row_chunk_size: int = 5000
|
|
17
|
-
):
|
|
18
|
-
"""
|
|
19
|
-
Performs out-of-core data cleaning on a standard (cell, gene) sparse
|
|
20
|
-
.h5ad file. It correctly identifies and removes genes with zero counts
|
|
21
|
-
across all cells. CPU-only implementation.
|
|
22
|
-
"""
|
|
23
|
-
start_time = time.perf_counter()
|
|
24
|
-
print(f"FUNCTION: ConvertDataSparseCPU() | FILE: {input_filename}")
|
|
25
|
-
|
|
26
|
-
with h5py.File(input_filename, 'r') as f_in:
|
|
27
|
-
x_group_in = f_in['X']
|
|
28
|
-
n_cells, n_genes = x_group_in.attrs['shape']
|
|
29
|
-
|
|
30
|
-
print("Phase [1/2]: Identifying genes with non-zero counts...")
|
|
31
|
-
genes_to_keep_mask = np.zeros(n_genes, dtype=bool)
|
|
32
|
-
|
|
33
|
-
h5_indptr = x_group_in['indptr']
|
|
34
|
-
h5_indices = x_group_in['indices']
|
|
35
|
-
|
|
36
|
-
for i in range(0, n_cells, row_chunk_size):
|
|
37
|
-
end_row = min(i + row_chunk_size, n_cells)
|
|
38
|
-
print(f"Phase [1/2]: Processing: {end_row} of {n_cells} cells.", end='\r')
|
|
39
|
-
|
|
40
|
-
start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
|
|
41
|
-
if start_idx == end_idx:
|
|
42
|
-
continue
|
|
43
|
-
|
|
44
|
-
indices_slice = np.array(h5_indices[start_idx:end_idx])
|
|
45
|
-
unique_in_chunk = np.unique(indices_slice)
|
|
46
|
-
genes_to_keep_mask[unique_in_chunk] = True
|
|
47
|
-
|
|
48
|
-
n_genes_to_keep = np.sum(genes_to_keep_mask)
|
|
49
|
-
print(f"\nPhase [1/2]: COMPLETE | Result: {n_genes_to_keep} / {n_genes} genes retained.")
|
|
50
|
-
|
|
51
|
-
print("Phase [2/2]: Rounding up decimals and saving filtered output to disk...")
|
|
52
|
-
adata_meta = anndata.read_h5ad(input_filename, backed='r')
|
|
53
|
-
filtered_var_df = adata_meta.var[genes_to_keep_mask]
|
|
54
|
-
|
|
55
|
-
adata_out_template = anndata.AnnData(obs=adata_meta.obs, var=filtered_var_df, uns=adata_meta.uns)
|
|
56
|
-
adata_out_template.write_h5ad(output_filename, compression="gzip")
|
|
57
|
-
|
|
58
|
-
with h5py.File(output_filename, 'a') as f_out:
|
|
59
|
-
if 'X' in f_out:
|
|
60
|
-
del f_out['X']
|
|
61
|
-
x_group_out = f_out.create_group('X')
|
|
62
|
-
|
|
63
|
-
out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
|
|
64
|
-
out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
|
|
65
|
-
out_indptr = x_group_out.create_dataset('indptr', shape=(n_cells + 1,), dtype='int64')
|
|
66
|
-
out_indptr[0] = 0
|
|
67
|
-
current_nnz = 0
|
|
68
|
-
|
|
69
|
-
h5_data = x_group_in['data']
|
|
70
|
-
|
|
71
|
-
for i in range(0, n_cells, row_chunk_size):
|
|
72
|
-
end_row = min(i + row_chunk_size, n_cells)
|
|
73
|
-
print(f"Phase [2/2]: Processing: {end_row} of {n_cells} cells.", end='\r')
|
|
74
|
-
|
|
75
|
-
start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
|
|
76
|
-
data_slice = np.array(h5_data[start_idx:end_idx])
|
|
77
|
-
indices_slice = np.array(h5_indices[start_idx:end_idx])
|
|
78
|
-
indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
|
|
79
|
-
|
|
80
|
-
chunk = sp_csr_matrix((data_slice, indices_slice, indptr_slice), shape=(end_row-i, n_genes))
|
|
81
|
-
filtered_chunk = chunk[:, genes_to_keep_mask]
|
|
82
|
-
filtered_chunk.data = np.ceil(filtered_chunk.data).astype('float32')
|
|
83
|
-
|
|
84
|
-
out_data.resize(current_nnz + filtered_chunk.nnz, axis=0)
|
|
85
|
-
out_data[current_nnz:] = filtered_chunk.data
|
|
86
|
-
|
|
87
|
-
out_indices.resize(current_nnz + filtered_chunk.nnz, axis=0)
|
|
88
|
-
out_indices[current_nnz:] = filtered_chunk.indices
|
|
89
|
-
|
|
90
|
-
new_indptr_list = filtered_chunk.indptr[1:].astype(np.int64) + current_nnz
|
|
91
|
-
out_indptr[i + 1 : end_row + 1] = new_indptr_list
|
|
92
|
-
|
|
93
|
-
current_nnz += filtered_chunk.nnz
|
|
94
|
-
|
|
95
|
-
x_group_out.attrs['encoding-type'] = 'csr_matrix'
|
|
96
|
-
x_group_out.attrs['encoding-version'] = '0.1.0'
|
|
97
|
-
x_group_out.attrs['shape'] = np.array([n_cells, n_genes_to_keep], dtype='int64')
|
|
98
|
-
print(f"\nPhase [2/2]: COMPLETE | Output: {output_filename} {' ' * 50}")
|
|
99
|
-
|
|
100
|
-
end_time = time.perf_counter()
|
|
101
|
-
print(f"Total time: {end_time - start_time:.2f} seconds.\n")
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def hidden_calc_valsCPU(
|
|
105
|
-
filename: str,
|
|
106
|
-
chunk_size: int = 5000
|
|
107
|
-
) -> dict:
|
|
108
|
-
"""
|
|
109
|
-
Calculates key statistics from a large, sparse (cell, gene) .h5ad file
|
|
110
|
-
using a memory-safe, CPU-only, single-pass algorithm.
|
|
111
|
-
"""
|
|
112
|
-
start_time = time.perf_counter()
|
|
113
|
-
print(f"FUNCTION: hidden_calc_valsCPU() | FILE: {filename}")
|
|
114
|
-
|
|
115
|
-
adata_meta = anndata.read_h5ad(filename, backed='r')
|
|
116
|
-
print("Phase [1/3]: Finding nc and ng...")
|
|
117
|
-
nc, ng = adata_meta.shape
|
|
118
|
-
print("Phase [1/3]: COMPLETE")
|
|
119
|
-
|
|
120
|
-
tis = np.zeros(nc, dtype='float64')
|
|
121
|
-
cell_non_zeros = np.zeros(nc, dtype='int64')
|
|
122
|
-
tjs = np.zeros(ng, dtype='float64')
|
|
123
|
-
gene_non_zeros = np.zeros(ng, dtype='int64')
|
|
124
|
-
|
|
125
|
-
print("Phase [2/3]: Calculating tis and tjs...")
|
|
126
|
-
with h5py.File(filename, 'r') as f_in:
|
|
127
|
-
x_group = f_in['X']
|
|
128
|
-
h5_indptr = x_group['indptr']
|
|
129
|
-
h5_data = x_group['data']
|
|
130
|
-
h5_indices = x_group['indices']
|
|
131
|
-
|
|
132
|
-
for i in range(0, nc, chunk_size):
|
|
133
|
-
end_row = min(i + chunk_size, nc)
|
|
134
|
-
print(f"Phase [2/3]: Processing: {end_row} of {nc} cells.", end='\r')
|
|
135
|
-
|
|
136
|
-
start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
|
|
137
|
-
if start_idx == end_idx:
|
|
138
|
-
continue
|
|
139
|
-
|
|
140
|
-
data_slice = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
|
|
141
|
-
indices_slice = np.array(h5_indices[start_idx:end_idx], dtype=np.int64)
|
|
142
|
-
indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
|
|
143
|
-
|
|
144
|
-
chunk = sp_csr_matrix((data_slice, indices_slice, indptr_slice), shape=(end_row-i, ng))
|
|
145
|
-
|
|
146
|
-
tis[i:end_row] = np.asarray(chunk.sum(axis=1)).ravel()
|
|
147
|
-
cell_non_zeros[i:end_row] = np.diff(indptr_slice)
|
|
148
|
-
|
|
149
|
-
np.add.at(tjs, indices_slice, data_slice)
|
|
150
|
-
unique_indices, counts = np.unique(indices_slice, return_counts=True)
|
|
151
|
-
gene_non_zeros[unique_indices] += counts
|
|
152
|
-
|
|
153
|
-
tjs_series = pd.Series(tjs, index=adata_meta.var.index)
|
|
154
|
-
tis_series = pd.Series(tis, index=adata_meta.obs.index)
|
|
155
|
-
print(f"Phase [2/3]: COMPLETE{' ' * 50}")
|
|
156
|
-
|
|
157
|
-
print("Phase [3/3]: Calculating dis, djs, and total...")
|
|
158
|
-
dis = ng - cell_non_zeros
|
|
159
|
-
djs = nc - gene_non_zeros
|
|
160
|
-
total = tjs.sum()
|
|
161
|
-
print("Phase [3/3]: COMPLETE")
|
|
162
|
-
|
|
163
|
-
end_time = time.perf_counter()
|
|
164
|
-
print(f"Total time: {end_time - start_time:.2f} seconds.\n")
|
|
165
|
-
|
|
166
|
-
return {
|
|
167
|
-
"tis": tis_series,
|
|
168
|
-
"tjs": tjs_series,
|
|
169
|
-
"dis": pd.Series(dis, index=adata_meta.obs.index),
|
|
170
|
-
"djs": pd.Series(djs, index=adata_meta.var.index),
|
|
171
|
-
"total": total,
|
|
172
|
-
"nc": nc,
|
|
173
|
-
"ng": ng
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
def NBumiFitModelCPU(
|
|
178
|
-
cleaned_filename: str,
|
|
179
|
-
stats: dict,
|
|
180
|
-
chunk_size: int = 5000
|
|
181
|
-
) -> dict:
|
|
182
|
-
start_time = time.perf_counter()
|
|
183
|
-
print(f"FUNCTION: NBumiFitModelCPU() | FILE: {cleaned_filename}")
|
|
184
|
-
|
|
185
|
-
tjs_series = stats['tjs']
|
|
186
|
-
tis_series = stats['tis']
|
|
187
|
-
tjs = tjs_series.values.astype(np.float64)
|
|
188
|
-
tis = tis_series.values.astype(np.float64)
|
|
189
|
-
nc, ng = stats['nc'], stats['ng']
|
|
190
|
-
total = stats['total']
|
|
191
|
-
|
|
192
|
-
sum_x_sq = np.zeros(ng, dtype=np.float64)
|
|
193
|
-
sum_2xmu = np.zeros(ng, dtype=np.float64)
|
|
194
|
-
|
|
195
|
-
print("Phase [1/3]: Pre-calculating sum of squared expectations...")
|
|
196
|
-
sum_tis_sq = np.sum(tis**2)
|
|
197
|
-
sum_mu_sq = (tjs**2 / total**2) * sum_tis_sq
|
|
198
|
-
print("Phase [1/3]: COMPLETE")
|
|
199
|
-
|
|
200
|
-
print("Phase [2/3]: Calculating variance components from data chunks...")
|
|
201
|
-
with h5py.File(cleaned_filename, 'r') as f_in:
|
|
202
|
-
x_group = f_in['X']
|
|
203
|
-
h5_indptr = x_group['indptr']
|
|
204
|
-
h5_data = x_group['data']
|
|
205
|
-
h5_indices = x_group['indices']
|
|
206
|
-
|
|
207
|
-
for i in range(0, nc, chunk_size):
|
|
208
|
-
end_row = min(i + chunk_size, nc)
|
|
209
|
-
print(f"Phase [2/3]: Processing: {end_row} of {nc} cells.", end='\r')
|
|
210
|
-
|
|
211
|
-
start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
|
|
212
|
-
if start_idx == end_idx:
|
|
213
|
-
continue
|
|
214
|
-
|
|
215
|
-
data_slice = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
|
|
216
|
-
indices_slice = np.array(h5_indices[start_idx:end_idx], dtype=np.int64)
|
|
217
|
-
indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
|
|
218
|
-
|
|
219
|
-
np.add.at(sum_x_sq, indices_slice, data_slice**2)
|
|
220
|
-
|
|
221
|
-
row_lengths = np.diff(indptr_slice)
|
|
222
|
-
if row_lengths.sum() == 0:
|
|
223
|
-
continue
|
|
224
|
-
cell_indices = np.repeat(np.arange(i, end_row), row_lengths)
|
|
225
|
-
|
|
226
|
-
tis_per_nz = tis[cell_indices]
|
|
227
|
-
tjs_per_nz = tjs[indices_slice]
|
|
228
|
-
term_vals = 2 * data_slice * tjs_per_nz * tis_per_nz / total
|
|
229
|
-
np.add.at(sum_2xmu, indices_slice, term_vals)
|
|
230
|
-
|
|
231
|
-
print(f"Phase [2/3]: COMPLETE {' ' * 50}")
|
|
232
|
-
|
|
233
|
-
print("Phase [3/3]: Finalizing dispersion and variance calculations...")
|
|
234
|
-
sum_sq_dev = sum_x_sq - sum_2xmu + sum_mu_sq
|
|
235
|
-
var_obs = sum_sq_dev / max(nc - 1, 1)
|
|
236
|
-
|
|
237
|
-
sizes = np.full(ng, 10000.0, dtype=np.float64)
|
|
238
|
-
numerator = (tjs**2 / total**2) * sum_tis_sq
|
|
239
|
-
denominator = sum_sq_dev - tjs
|
|
240
|
-
stable_mask = denominator > 1e-6
|
|
241
|
-
sizes[stable_mask] = numerator[stable_mask] / denominator[stable_mask]
|
|
242
|
-
sizes[np.isnan(sizes) | (sizes <= 0)] = 10000.0
|
|
243
|
-
|
|
244
|
-
print("Phase [3/3]: COMPLETE")
|
|
245
|
-
|
|
246
|
-
end_time = time.perf_counter()
|
|
247
|
-
print(f"Total time: {end_time - start_time:.2f} seconds.\n")
|
|
248
|
-
|
|
249
|
-
return {
|
|
250
|
-
'var_obs': pd.Series(var_obs, index=tjs_series.index),
|
|
251
|
-
'sizes': pd.Series(sizes, index=tjs_series.index),
|
|
252
|
-
'vals': stats
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
def NBumiFitDispVsMeanCPU(fit, suppress_plot=True):
|
|
257
|
-
"""
|
|
258
|
-
Fits a linear model to the log-dispersion vs log-mean of gene expression.
|
|
259
|
-
"""
|
|
260
|
-
vals = fit['vals']
|
|
261
|
-
size_g = fit['sizes'].values
|
|
262
|
-
tjs = vals['tjs'].values
|
|
263
|
-
|
|
264
|
-
mean_expression = tjs / vals['nc']
|
|
265
|
-
forfit = (np.isfinite(size_g)) & (size_g < 1e6) & (mean_expression > 1e-3) & (size_g > 0)
|
|
266
|
-
|
|
267
|
-
log2_mean_expr = np.log2(mean_expression, where=(mean_expression > 0))
|
|
268
|
-
higher = log2_mean_expr > 4
|
|
269
|
-
if np.sum(higher & forfit) > 2000:
|
|
270
|
-
forfit = higher & forfit
|
|
271
|
-
|
|
272
|
-
y = np.log(size_g[forfit])
|
|
273
|
-
x = np.log(mean_expression[forfit])
|
|
274
|
-
|
|
275
|
-
X = sm.add_constant(x)
|
|
276
|
-
model = sm.OLS(y, X).fit()
|
|
277
|
-
|
|
278
|
-
if not suppress_plot:
|
|
279
|
-
plt.figure(figsize=(7, 6))
|
|
280
|
-
plt.scatter(x, y, alpha=0.5, label="Data Points")
|
|
281
|
-
plt.plot(x, model.fittedvalues, color='red', label='Regression Fit')
|
|
282
|
-
plt.title('Dispersion vs. Mean Expression')
|
|
283
|
-
plt.xlabel("Log Mean Expression")
|
|
284
|
-
plt.ylabel("Log Size (Dispersion)")
|
|
285
|
-
plt.legend()
|
|
286
|
-
plt.grid(True)
|
|
287
|
-
plt.show()
|
|
288
|
-
|
|
289
|
-
return model.params
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
def NBumiFeatureSelectionHighVarCPU(fit: dict) -> pd.DataFrame:
|
|
293
|
-
"""
|
|
294
|
-
Selects features (genes) with higher variance than expected.
|
|
295
|
-
"""
|
|
296
|
-
start_time = time.perf_counter()
|
|
297
|
-
print(f"FUNCTION: NBumiFeatureSelectionHighVarCPU()")
|
|
298
|
-
|
|
299
|
-
print("Phase [1/1]: Calculating residuals for high variance selection...")
|
|
300
|
-
vals = fit['vals']
|
|
301
|
-
coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
|
|
302
|
-
|
|
303
|
-
mean_expression = vals['tjs'].values / vals['nc']
|
|
304
|
-
|
|
305
|
-
with np.errstate(divide='ignore', invalid='ignore'):
|
|
306
|
-
log_mean_expression = np.log(mean_expression)
|
|
307
|
-
log_mean_expression[np.isneginf(log_mean_expression)] = 0
|
|
308
|
-
exp_size = np.exp(coeffs[0] + coeffs[1] * log_mean_expression)
|
|
309
|
-
|
|
310
|
-
with np.errstate(divide='ignore', invalid='ignore'):
|
|
311
|
-
res = np.log(fit['sizes'].values) - np.log(exp_size)
|
|
312
|
-
|
|
313
|
-
results_df = pd.DataFrame({
|
|
314
|
-
'Gene': fit['sizes'].index,
|
|
315
|
-
'Residual': res
|
|
316
|
-
})
|
|
317
|
-
|
|
318
|
-
final_table = results_df.sort_values(by='Residual', ascending=True)
|
|
319
|
-
print("Phase [1/1]: COMPLETE")
|
|
320
|
-
|
|
321
|
-
end_time = time.perf_counter()
|
|
322
|
-
print(f"Total time: {end_time - start_time:.4f} seconds.\n")
|
|
323
|
-
|
|
324
|
-
return final_table
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
def NBumiFeatureSelectionCombinedDropCPU(
|
|
328
|
-
fit: dict,
|
|
329
|
-
cleaned_filename: str,
|
|
330
|
-
chunk_size: int = 5000,
|
|
331
|
-
method="fdr_bh",
|
|
332
|
-
qval_thresh=0.05
|
|
333
|
-
) -> pd.DataFrame:
|
|
334
|
-
"""
|
|
335
|
-
Selects features with a significantly higher dropout rate than expected,
|
|
336
|
-
using an out-of-core, CPU-only approach.
|
|
337
|
-
"""
|
|
338
|
-
start_time = time.perf_counter()
|
|
339
|
-
print(f"FUNCTION: NBumiFeatureSelectionCombinedDropCPU() | FILE: {cleaned_filename}")
|
|
340
|
-
|
|
341
|
-
print("Phase [1/3]: Initializing arrays and calculating expected dispersion...")
|
|
342
|
-
vals = fit['vals']
|
|
343
|
-
coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
|
|
344
|
-
|
|
345
|
-
tjs = vals['tjs'].values.astype(np.float64)
|
|
346
|
-
tis = vals['tis'].values.astype(np.float64)
|
|
347
|
-
total = vals['total']
|
|
348
|
-
nc = vals['nc']
|
|
349
|
-
ng = vals['ng']
|
|
350
|
-
|
|
351
|
-
mean_expression = tjs / nc
|
|
352
|
-
with np.errstate(divide='ignore'):
|
|
353
|
-
exp_size = np.exp(coeffs[0] + coeffs[1] * np.log(mean_expression, where=(mean_expression > 0)))
|
|
354
|
-
exp_size = np.nan_to_num(exp_size, nan=1.0, posinf=1e6, neginf=1.0)
|
|
355
|
-
|
|
356
|
-
p_sum = np.zeros(ng, dtype=np.float64)
|
|
357
|
-
p_var_sum = np.zeros(ng, dtype=np.float64)
|
|
358
|
-
print("Phase [1/3]: COMPLETE")
|
|
359
|
-
|
|
360
|
-
print("Phase [2/3]: Calculating expected dropout sums from data chunks...")
|
|
361
|
-
for i in range(0, nc, chunk_size):
|
|
362
|
-
end_col = min(i + chunk_size, nc)
|
|
363
|
-
print(f"Phase [2/3]: Processing: {end_col} of {nc} cells.", end='\r')
|
|
364
|
-
|
|
365
|
-
tis_chunk = tis[i:end_col]
|
|
366
|
-
if tis_chunk.size == 0:
|
|
367
|
-
continue
|
|
368
|
-
|
|
369
|
-
mu_chunk = tjs[:, np.newaxis] * tis_chunk[np.newaxis, :] / total
|
|
370
|
-
base = 1 + mu_chunk / exp_size[:, np.newaxis]
|
|
371
|
-
base = np.maximum(base, 1e-12)
|
|
372
|
-
p_is_chunk = np.power(base, -exp_size[:, np.newaxis])
|
|
373
|
-
p_is_chunk = np.nan_to_num(p_is_chunk, nan=0.0, posinf=1.0, neginf=0.0)
|
|
374
|
-
|
|
375
|
-
p_var_is_chunk = p_is_chunk * (1 - p_is_chunk)
|
|
376
|
-
|
|
377
|
-
p_sum += np.sum(p_is_chunk, axis=1)
|
|
378
|
-
p_var_sum += np.sum(p_var_is_chunk, axis=1)
|
|
379
|
-
|
|
380
|
-
print(f"Phase [2/3]: COMPLETE {' ' * 50}")
|
|
381
|
-
|
|
382
|
-
print("Phase [3/3]: Performing statistical test and adjusting p-values...")
|
|
383
|
-
|
|
384
|
-
droprate_exp = p_sum / nc
|
|
385
|
-
droprate_exp_err = np.sqrt(p_var_sum / (nc**2))
|
|
386
|
-
|
|
387
|
-
droprate_obs = vals['djs'].values / nc
|
|
388
|
-
|
|
389
|
-
diff = droprate_obs - droprate_exp
|
|
390
|
-
combined_err = np.sqrt(droprate_exp_err**2 + (droprate_obs * (1 - droprate_obs) / nc))
|
|
391
|
-
|
|
392
|
-
with np.errstate(divide='ignore', invalid='ignore'):
|
|
393
|
-
Zed = diff / combined_err
|
|
394
|
-
|
|
395
|
-
pvalue = norm.sf(Zed)
|
|
396
|
-
|
|
397
|
-
results_df = pd.DataFrame({
|
|
398
|
-
'Gene': vals['tjs'].index,
|
|
399
|
-
'p.value': pvalue,
|
|
400
|
-
'effect_size': diff
|
|
401
|
-
})
|
|
402
|
-
results_df = results_df.sort_values(by='p.value')
|
|
403
|
-
|
|
404
|
-
qval = multipletests(results_df['p.value'].fillna(1), method=method)[1]
|
|
405
|
-
results_df['q.value'] = qval
|
|
406
|
-
final_table = results_df[results_df['q.value'] < qval_thresh]
|
|
407
|
-
print("Phase [3/3]: COMPLETE")
|
|
408
|
-
|
|
409
|
-
end_time = time.perf_counter()
|
|
410
|
-
print(f"Total time: {end_time - start_time:.2f} seconds.\n")
|
|
411
|
-
|
|
412
|
-
return final_table[['Gene', 'effect_size', 'p.value', 'q.value']]
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
def NBumiCombinedDropVolcanoCPU(
|
|
416
|
-
results_df: pd.DataFrame,
|
|
417
|
-
qval_thresh: float = 0.05,
|
|
418
|
-
effect_size_thresh: float = 0.25,
|
|
419
|
-
top_n_genes: int = 10,
|
|
420
|
-
suppress_plot: bool = False,
|
|
421
|
-
plot_filename: str = None
|
|
422
|
-
):
|
|
423
|
-
"""
|
|
424
|
-
Generates a volcano plot from the results of feature selection (CPU version).
|
|
425
|
-
"""
|
|
426
|
-
start_time = time.perf_counter()
|
|
427
|
-
print(f"FUNCTION: NBumiCombinedDropVolcanoCPU()")
|
|
428
|
-
|
|
429
|
-
print("Phase [1/1]: Preparing data for visualization...")
|
|
430
|
-
df = results_df.copy()
|
|
431
|
-
|
|
432
|
-
non_zero_min = df[df['q.value'] > 0]['q.value'].min()
|
|
433
|
-
df['q.value'] = df['q.value'].replace(0, non_zero_min)
|
|
434
|
-
df['-log10_qval'] = -np.log10(df['q.value'])
|
|
435
|
-
|
|
436
|
-
df['color'] = 'grey'
|
|
437
|
-
sig_up = (df['q.value'] < qval_thresh) & (df['effect_size'] > effect_size_thresh)
|
|
438
|
-
sig_down = (df['q.value'] < qval_thresh) & (df['effect_size'] < -effect_size_thresh)
|
|
439
|
-
df.loc[sig_up, 'color'] = 'red'
|
|
440
|
-
df.loc[sig_down, 'color'] = 'blue'
|
|
441
|
-
|
|
442
|
-
print("Phase [1/1]: COMPLETE")
|
|
443
|
-
print("Phase [2/2]: Generating plot...")
|
|
444
|
-
|
|
445
|
-
plt.figure(figsize=(10, 8))
|
|
446
|
-
plt.scatter(df['effect_size'], df['-log10_qval'], c=df['color'], s=10, alpha=0.6)
|
|
447
|
-
plt.axvline(x=effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
|
|
448
|
-
plt.axvline(x=-effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
|
|
449
|
-
plt.axhline(y=-np.log10(qval_thresh), linestyle='--', color='grey', linewidth=0.8)
|
|
450
|
-
|
|
451
|
-
top_genes = df.nsmallest(top_n_genes, 'q.value')
|
|
452
|
-
for _, row in top_genes.iterrows():
|
|
453
|
-
plt.text(row['effect_size'], row['-log10_qval'], row['Gene'],
|
|
454
|
-
fontsize=9, ha='left', va='bottom', alpha=0.8)
|
|
455
|
-
|
|
456
|
-
plt.title('Volcano Plot of Dropout Feature Selection')
|
|
457
|
-
plt.xlabel('Effect Size (Observed - Expected Dropout Rate)')
|
|
458
|
-
plt.ylabel('-log10 (Adjusted p-value)')
|
|
459
|
-
plt.grid(True, linestyle='--', alpha=0.3)
|
|
460
|
-
|
|
461
|
-
ax = plt.gca()
|
|
462
|
-
|
|
463
|
-
if plot_filename:
|
|
464
|
-
plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
|
|
465
|
-
print(f"STATUS: Volcano plot saved to '{plot_filename}'")
|
|
466
|
-
|
|
467
|
-
if not suppress_plot:
|
|
468
|
-
plt.show()
|
|
469
|
-
|
|
470
|
-
plt.close()
|
|
471
|
-
|
|
472
|
-
print("Phase [2/2]: COMPLETE")
|
|
473
|
-
|
|
474
|
-
end_time = time.perf_counter()
|
|
475
|
-
print(f"Total time: {end_time - start_time:.2f} seconds.\n")
|
|
476
|
-
|
|
477
|
-
return ax
|