knn-normalization 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knn_normalization/__init__.py +7 -0
- knn_normalization/pl/__init__.py +0 -0
- knn_normalization/pl/plotting.py +0 -0
- knn_normalization/pp/__init__.py +1 -0
- knn_normalization/pp/preprocessing.py +60 -0
- knn_normalization/tl/__init__.py +1 -0
- knn_normalization/tl/tools.py +505 -0
- knn_normalization-0.1.0.dist-info/METADATA +122 -0
- knn_normalization-0.1.0.dist-info/RECORD +11 -0
- knn_normalization-0.1.0.dist-info/WHEEL +4 -0
- knn_normalization-0.1.0.dist-info/licenses/LICENSE +21 -0
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .preprocessing import calculate_neighbors_from_protein, retrieve_neighbors
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import scanpy as sc
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def retrieve_neighbors(neighbors):
|
|
5
|
+
"""
|
|
6
|
+
Convert the KNN graph into a dictionary mapping each cell to its neighbors.
|
|
7
|
+
|
|
8
|
+
Parameters
|
|
9
|
+
----------
|
|
10
|
+
neighbors
|
|
11
|
+
KNN graph in the format of ``.obsp["connectivities"]``.
|
|
12
|
+
|
|
13
|
+
Returns
|
|
14
|
+
-------
|
|
15
|
+
dict
|
|
16
|
+
Dictionary mapping each cell index to a list of its neighbor indices.
|
|
17
|
+
"""
|
|
18
|
+
row_indices, col_indices = neighbors.nonzero()
|
|
19
|
+
neighbors = {}
|
|
20
|
+
for src, tgt in zip(row_indices, col_indices, strict=False):
|
|
21
|
+
neighbors.setdefault(src, []).append(tgt)
|
|
22
|
+
|
|
23
|
+
return neighbors
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def calculate_neighbors_from_protein(protein_data, n_neighbors, log_transform=True):
    """
    Compute a KNN graph from protein expression data.

    Panels with fewer than 70 proteins are used directly as the neighbor
    representation; panels with 70 or more proteins are reduced with PCA first.
    The input object is never modified: all work happens on a copy.

    Parameters
    ----------
    protein_data
        AnnData object with raw protein expression counts.
    n_neighbors
        Number of neighbors to compute.
    log_transform
        Whether to log-transform (``log1p``) the copy before computing neighbors.

    Returns
    -------
    The ``.obsp["connectivities"]`` graph computed by ``sc.pp.neighbors`` on
    the working copy.
    """
    working_copy = protein_data.copy()
    if log_transform:
        sc.pp.log1p(working_copy)

    if working_copy.n_vars >= 70:
        # Large panel: reduce with PCA and let scanpy use the PCA representation.
        sc.pp.pca(working_copy)
        sc.pp.neighbors(working_copy, metric="cosine", n_neighbors=n_neighbors)
    else:
        # Small panel: compute neighbors straight on the protein matrix.
        sc.pp.neighbors(working_copy, use_rep="X", metric="cosine", n_neighbors=n_neighbors)

    return working_copy.obsp["connectivities"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .tools import _normalize_with_neighbors, knn_normalize
|
|
@@ -0,0 +1,505 @@
|
|
|
1
|
+
from numbers import Integral
|
|
2
|
+
from typing import Literal
|
|
3
|
+
from warnings import warn
|
|
4
|
+
|
|
5
|
+
import mudata
|
|
6
|
+
import numpy as np
|
|
7
|
+
import scanpy as sc
|
|
8
|
+
from anndata import AnnData
|
|
9
|
+
from mudata import MuData
|
|
10
|
+
from scipy import stats
|
|
11
|
+
|
|
12
|
+
from knn_normalization.pp.preprocessing import calculate_neighbors_from_protein, retrieve_neighbors
|
|
13
|
+
|
|
14
|
+
# Import-time side effect: sets the mudata option ``pull_on_update`` to False
# through ``mudata.set_options``.
# NOTE(review): presumably this applies process-wide and stops ``MuData.update()``
# from copying per-modality annotations into the global tables — confirm both
# points against the mudata options documentation.
mudata.set_options(pull_on_update=False)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def knn_normalize(
    data: AnnData | MuData,
    calculate_neighbors_from: Literal["prot", "rna", "use_existing_neighbors"] = "prot",
    preprocess_rna: bool = True,
    log_transform: bool = True,
    n_neighbors: Integral | None = None,
    pseudocount: Integral = 5,
    max_iterations: Integral = 25,
    mean: Literal["average", "geom_mean", "trimmed_mean"] = "average",
    inplace: bool = True,
    save_size_factors: bool = False,
    verbose: bool = True,
    preserve_total_counts=True,
):
    """
    Normalize protein expression with KNN normalization.

    Parameters
    ----------
    data
        AnnData object with protein expression counts or MuData object with
        a ``prot`` modality (and a ``rna`` modality when neighbors are
        calculated from RNA).
    calculate_neighbors_from
        Whether to use the ``prot`` or the ``rna`` modality to calculate neighbor
        cells. If ``use_existing_neighbors``, the graph already stored in
        ``.obsp["connectivities"]`` of the protein data is used.
    preprocess_rna
        If using RNA to calculate neighbors, whether to preprocess the RNA data
        with library size normalization and log-transformation first.
    log_transform
        Whether to log-transform the protein data.
    n_neighbors
        Number of neighbors. If ``None``, calculated automatically as
        ``max(15, min(round(n_cells / 20), 300))``.
    pseudocount
        Pseudocount to add before normalization to avoid zero-division errors.
    max_iterations
        Maximum number of iterations.
    mean
        Type of mean to use: ``'average'``, ``'geom_mean'``, or ``'trimmed_mean'``.
    inplace
        Whether to update the object in place or return a copy.
    save_size_factors
        If ``True``, saves the final size factors to ``.obs['size_factor']``
        and the size factor history to ``.obsm['size_factor_history']`` of the
        protein data.
    verbose
        Whether to print progress messages.
    preserve_total_counts
        Whether to preserve total counts across iterations.

    Returns
    -------
    ``None`` if ``inplace`` is true; the normalized matrix is then written to
    ``data.X`` (AnnData input) or ``data.mod['prot'].X`` (MuData input).
    Otherwise a new AnnData object (AnnData input) or a new MuData object
    (MuData input) holding the normalized protein data. The new MuData keeps
    every other modality of the input (shared, not copied) and places the
    normalized ``prot`` modality last.

    Raises
    ------
    TypeError
        If ``data`` is neither an AnnData nor a MuData object.
    AssertionError
        If ``calculate_neighbors_from`` is not one of the accepted values, if
        ``rna`` is requested for an AnnData input, or if a required modality is
        missing from the MuData input.
    """
    assert calculate_neighbors_from in ["prot", "rna", "use_existing_neighbors"], (
        "the argument calculate_neighbors_from must be prot, rna or use_existing_neighbors."
    )

    # Resolve the AnnData that holds the protein counts for either input type.
    if isinstance(data, AnnData):
        is_mudata = False
        protein = data
    elif isinstance(data, MuData):
        is_mudata = True
        assert "prot" in data.mod, (
            "The MuData object does not have a modality called ``prot``, please add a modality called ``prot`` with the protein data."
        )
        protein = data["prot"]
    else:
        # Previously an unsupported input fell through and returned None
        # without normalizing anything.
        raise TypeError(f"data must be an AnnData or MuData object, got {type(data).__name__}.")

    if ("log1p" in protein.uns) and log_transform:
        warn("The protein data might already be log-transformed.", stacklevel=2)

    if n_neighbors is None:
        n_cells = data.n_obs
        n_neighbors = max(15, min(round(n_cells / 20), 300))

    if calculate_neighbors_from == "prot":
        neighbors = calculate_neighbors_from_protein(protein, n_neighbors=n_neighbors, log_transform=log_transform)
    elif calculate_neighbors_from == "use_existing_neighbors":
        neighbors = protein.obsp["connectivities"]
    else:
        # Only "rna" remains after the validation above; it needs a MuData input.
        assert is_mudata, (
            "If an AnnData object with the protein data is provided, ``calculate_neighbors_from`` cannot be ``rna``. If calculate_neighbors_from = rna is desired, please provide a MuData object with the protein and the rna data."
        )
        assert "rna" in data.mod, (
            "The MuData object does not have a modality called ``rna``, please add a modality called ``rna`` in order to calculate neighbors from the RNA data."
        )
        # Work on a copy so the caller's RNA modality is left untouched.
        data_for_neighbors = data["rna"].copy()
        if preprocess_rna:
            sc.pp.normalize_total(data_for_neighbors)
            sc.pp.log1p(data_for_neighbors)
        sc.pp.pca(data_for_neighbors)
        sc.pp.neighbors(data_for_neighbors, n_neighbors=n_neighbors)
        neighbors = data_for_neighbors.obsp["connectivities"]

    # Returns None when inplace is true, otherwise a normalized copy.
    knn_normalized_protein = _normalize_with_neighbors(
        protein,
        neighbors,
        log_transform=log_transform,
        pseudocount=pseudocount,
        max_iterations=max_iterations,
        verbose=verbose,
        save_size_factors=save_size_factors,
        inplace=bool(inplace),
        mean=mean,
        preserve_total_counts=preserve_total_counts,
    )

    if inplace:
        return None
    if not is_mudata:
        return knn_normalized_protein

    # Rebuild the container from the modalities that actually exist. The former
    # hard-coded ``data["rna"]`` lookup raised KeyError for protein-only MuData
    # objects and dropped any additional modality.
    modalities = {name: modality for name, modality in data.mod.items() if name != "prot"}
    modalities["prot"] = knn_normalized_protein
    return MuData(modalities)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _normalize_with_neighbors(
    protein_anndata,
    connectivities,
    log_transform=True,
    log_transform_before=False,
    save_size_factors=False,
    pseudocount=5,
    max_iterations=25,
    change_for_stop=0.0005,
    verbose=True,
    inplace: bool = True,
    mean="average",
    preserve_total_counts=True,
):
    """
    Apply KNN normalization given precomputed neighbors.

    In every iteration each cell gets a size factor: for each of its neighbors
    the median (across proteins) of the neighbor/cell expression ratio is
    taken, and those per-neighbor medians are averaged with the chosen mean.
    Each cell is then multiplied by its size factor. Iteration stops after
    ``max_iterations`` rounds or once the largest size-factor change between
    two consecutive rounds drops below ``change_for_stop``.

    Parameters
    ----------
    protein_anndata
        AnnData object with protein expression data. A sparse ``X`` is
        converted to a dense array and a non-floating ``X`` is cast to
        ``float64`` before normalization.
    connectivities
        KNN graph containing neighbor cells, in the format of
        ``.obsp["connectivities"]``.
    log_transform
        If ``True``, log-transforms the data after normalization.
    log_transform_before
        If ``True``, log-transforms the data before normalization.
        Cannot be ``True`` at the same time as ``log_transform``.
    save_size_factors
        If ``True``, saves the final size factors to
        ``protein_anndata.obs["size_factor"]``, the size factor history
        to ``protein_anndata.obsm["size_factor_history"]`` and the graph to
        ``protein_anndata.obsp["connectivities_KNN_normalization"]``.
    pseudocount
        Pseudocount added to avoid zero-division errors.
    max_iterations
        Maximum number of iterations.
    change_for_stop
        Convergence criterion. The algorithm stops when the change in size
        factor between iterations is smaller than this value.
    verbose
        Whether to print progress messages.
    inplace
        Whether to update the AnnData object in place or return a copy.
    mean
        Type of mean to use: ``'average'`` or ``'trimmed_mean'``; any other
        value (e.g. ``'geom_mean'``) selects the geometric mean.
    preserve_total_counts
        Whether to rescale after every iteration so that the total of the
        matrix stays equal to its total before the first iteration.

    Returns
    -------
    ``None`` if ``inplace`` is true (the result is written to
    ``protein_anndata.X``); otherwise a new AnnData object with the
    normalized data.

    Raises
    ------
    AssertionError
        If ``log_transform`` and ``log_transform_before`` are both true.
    """
    from scipy.sparse import issparse  # local import: only needed in this function

    # Validate before touching the data, so a bad flag combination cannot leave
    # the caller's matrix half-modified.
    assert not (log_transform_before and log_transform), "log_transform and log_transform_before cannot be both True"

    # Converts the format of the KNN graph into a dictionary mapping each cell to its neighbor cells.
    neighbors = retrieve_neighbors(connectivities)
    # The graph is fixed, so build the index arrays once instead of once per iteration.
    neighbor_arrays = [(target_cell, np.asarray(neighbor_list)) for target_cell, neighbor_list in neighbors.items()]

    if not inplace:
        protein_anndata = protein_anndata.copy()

    x = protein_anndata.X
    if issparse(x):
        # Adding a pseudocount makes every entry non-zero, so a sparse layout
        # cannot be kept anyway.
        x = x.toarray()
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer counts cannot hold the float size-factor products computed below.
        x = x.astype(np.float64)

    x += pseudocount  # To avoid zero-division, we add a pseudocount.

    if log_transform_before:
        x = np.log(x)
    total_sums_before = x.sum()

    num_cells = x.shape[0]
    size_factor_history = []

    # KNN normalization.
    for iteration in range(max_iterations):
        # Start from 1 so that a cell without any neighbor is left unscaled
        # instead of being multiplied by zero.
        size_factors = np.ones(num_cells)

        for target_cell, neighbor_indices in neighbor_arrays:
            # Broadcasting divides every neighbor row by the target cell's row.
            ratios = x[neighbor_indices] / x[target_cell]
            proto_size_factors = np.median(ratios, axis=1)

            # Average the per-neighbor ratios; the result is the cell-specific size factor.
            if mean == "average":
                size_factor = np.mean(proto_size_factors)
            elif mean == "trimmed_mean":
                size_factor = stats.trim_mean(proto_size_factors, 0.1)
            else:
                size_factor = stats.gmean(proto_size_factors)

            size_factors[target_cell] = size_factor

        # Now, we multiply the protein expression of each cell by its cell-specific factor.
        x *= size_factors[:, None]

        if preserve_total_counts:
            total_sums_after_iteration = x.sum()
            ratio_of_total_counts = total_sums_after_iteration / total_sums_before
            x = x / ratio_of_total_counts
            size_factors = size_factors / ratio_of_total_counts

        # Save this iteration's size factors; they are compared with the
        # previous iteration for the stopping criterion.
        size_factor_history.append(size_factors)
        if verbose:
            print("Iteration ", iteration + 1)

        # From the second iteration on, stop once the largest size-factor
        # change with respect to the previous iteration is below the threshold.
        if iteration > 0:
            biggest_size_factor_change = np.max(np.abs(size_factor_history[-1] - size_factor_history[-2]))
            if verbose:
                print("Change wrt previous iteration:", biggest_size_factor_change)
            if biggest_size_factor_change < change_for_stop:
                break

    if log_transform:
        x = np.log(x)

    if save_size_factors:
        # Multiplication of the size factors across all iterations.
        total_size_factors = np.prod(np.array(size_factor_history), axis=0)
        protein_anndata.obs["size_factor"] = total_size_factors
        # Transposed so that each row of the history belongs to one cell.
        size_factor_history = np.array(size_factor_history).T
        protein_anndata.obsm["size_factor_history"] = size_factor_history
        protein_anndata.obsp["connectivities_KNN_normalization"] = connectivities

    protein_anndata.X = x

    return None if inplace else protein_anndata
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
# def _normalize_with_neighbors(
|
|
324
|
+
# protein_anndata,
|
|
325
|
+
# connectivities,
|
|
326
|
+
# log_transform=True,
|
|
327
|
+
# log_transform_before=False,
|
|
328
|
+
# save_size_factors=False,
|
|
329
|
+
# pseudocount=5,
|
|
330
|
+
# max_iterations=25,
|
|
331
|
+
# change_for_stop=0.0005,
|
|
332
|
+
# verbose=True,
|
|
333
|
+
# inplace: bool = True,
|
|
334
|
+
# mean="average",
|
|
335
|
+
# ):
|
|
336
|
+
# """
|
|
337
|
+
# Applies KNN normalization given precomputed neighbors.
|
|
338
|
+
|
|
339
|
+
# protein_data: an AnnData object with the protein data in CITE-seq.
|
|
340
|
+
# connectivities: The KNN graph containing neighbor cells. It expects the format from .obsp["connectivities"].
|
|
341
|
+
# log_transform: if True, takes the logarithm of the data.
|
|
342
|
+
# save_size_factors: if True, the final size factors are saved to protein_anndata.obs["size_factor"] and the size factor history (all size factors across iterations) is saved to protein_anndata.obsm["size_factor_history"].
|
|
343
|
+
# pseudocount: adds pseudocounts to the data to avoid ZeroDivision errors. This argument also determines the value of the pseudocount (5 by default).
|
|
344
|
+
# max_iteration: maximum number of iterations.
|
|
345
|
+
# change_for_stop: the algorithm stops when the change in size factor is smaller than this value (convergence criterion).
|
|
346
|
+
# verbose: whether you want to print guidance information when running the function.
|
|
347
|
+
# """
|
|
348
|
+
# neighbors = retrieve_neighbors(
|
|
349
|
+
# connectivities
|
|
350
|
+
# ) # Converts the format of the KNN graph into a dictionary mapping each cell to its neighbor cells.
|
|
351
|
+
|
|
352
|
+
# if not inplace:
|
|
353
|
+
# protein_anndata = protein_anndata.copy()
|
|
354
|
+
|
|
355
|
+
# # TODO: FUNCTIONS IN CASE THE DATA IS SPARSE.
|
|
356
|
+
|
|
357
|
+
# x = protein_anndata.X
|
|
358
|
+
# x += pseudocount # To avoid zero-division, we add a pseudocount.
|
|
359
|
+
|
|
360
|
+
# assert not (log_transform_before and log_transform), "log_transform and log_transform_before cannot be both True"
|
|
361
|
+
# if log_transform_before:
|
|
362
|
+
# x = np.log(x)
|
|
363
|
+
|
|
364
|
+
# num_cells = x.shape[0]
|
|
365
|
+
# size_factor_history = []
|
|
366
|
+
|
|
367
|
+
# # KNN normalization.
|
|
368
|
+
# for iteration in range(max_iterations):
|
|
369
|
+
# size_factors = np.zeros(num_cells)
|
|
370
|
+
|
|
371
|
+
# for target_cell, neighbor_list in neighbors.items():
|
|
372
|
+
# neighbor_indices = np.array(neighbor_list)
|
|
373
|
+
# target_indices = np.full(len(neighbor_list), target_cell)
|
|
374
|
+
# ratios = x[neighbor_indices] / x[target_indices]
|
|
375
|
+
# proto_size_factors = np.median(ratios, axis=1)
|
|
376
|
+
|
|
377
|
+
# # After having collected the ratios for between the neighbor cells and the target cell, we calculate the average of those ratios. That will be the cell-specific size factor.
|
|
378
|
+
# if mean == "average":
|
|
379
|
+
# size_factor = np.mean(proto_size_factors)
|
|
380
|
+
# else:
|
|
381
|
+
# size_factor = stats.gmean(proto_size_factors)
|
|
382
|
+
# size_factors[target_cell] = size_factor
|
|
383
|
+
|
|
384
|
+
# # Now, we multiply the protein expression of each cell by its cell-specific factor.
|
|
385
|
+
# x *= size_factors[:, None]
|
|
386
|
+
|
|
387
|
+
# # Save this iteration's size factors. This is done mainly to compare with the previous iteration for the stopping criterion.
|
|
388
|
+
# size_factor_history.append(size_factors)
|
|
389
|
+
# if verbose:
|
|
390
|
+
# print("Iteration ", iteration + 1)
|
|
391
|
+
|
|
392
|
+
# # Unless it's the first iteration, check the algorithm stopping criterion: if all changes of size_factors are smaller than the "change_for_stop" value with respect to the previous iteration.
|
|
393
|
+
|
|
394
|
+
# if iteration > 0:
|
|
395
|
+
# biggest_size_factor_change = np.max(np.abs(size_factor_history[-1] - size_factor_history[-2]))
|
|
396
|
+
# if verbose:
|
|
397
|
+
# print("Change wrt previous iteration:", biggest_size_factor_change)
|
|
398
|
+
# if biggest_size_factor_change < change_for_stop:
|
|
399
|
+
# break
|
|
400
|
+
|
|
401
|
+
# if log_transform:
|
|
402
|
+
# x = np.log(x)
|
|
403
|
+
|
|
404
|
+
# if save_size_factors:
|
|
405
|
+
# total_size_factors = np.prod(
|
|
406
|
+
# np.array(size_factor_history), axis=0
|
|
407
|
+
# ) # Multiplication of the size factors across all iterations.
|
|
408
|
+
# protein_anndata.obs["size_factor"] = total_size_factors
|
|
409
|
+
# size_factor_history = np.array(size_factor_history).T
|
|
410
|
+
# protein_anndata.obsm["size_factor_history"] = size_factor_history
|
|
411
|
+
# protein_anndata.obsp["connectivities_KNN_normalization"] = connectivities
|
|
412
|
+
|
|
413
|
+
# protein_anndata.X = x
|
|
414
|
+
|
|
415
|
+
# return None if inplace else protein_anndata
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
# def _normalize_with_neighbors(
|
|
419
|
+
# protein_anndata,
|
|
420
|
+
# neighbors,
|
|
421
|
+
# log_transform=True,
|
|
422
|
+
# log_transform_before=False,
|
|
423
|
+
# save_size_factors=False,
|
|
424
|
+
# pseudocount=5,
|
|
425
|
+
# max_iterations=25,
|
|
426
|
+
# change_for_stop=0.0005,
|
|
427
|
+
# verbose=True,
|
|
428
|
+
# inplace: bool = True,
|
|
429
|
+
# mean="average",
|
|
430
|
+
# ):
|
|
431
|
+
# """
|
|
432
|
+
# Applies KNN normalization given precomputed neighbors.
|
|
433
|
+
|
|
434
|
+
# protein_data: an AnnData object with the protein data in CITE-seq.
|
|
435
|
+
# neighbors: Neighbor cells. These neighbors are retrieved with the "retrieve_neighbors" function. The expected format a is dictionary of lists indicating which cells are neighbors.
|
|
436
|
+
# log_transform: if True, takes the logarithm of the data.
|
|
437
|
+
# save_size_factors: if True, the final size factors are saved to protein_anndata.obs["size_factor"] and the size factor history (all size factors across iterations) is saved to protein_anndata.obsm["size_factor_history"].
|
|
438
|
+
# pseudocount: adds pseudocounts to the data to avoid ZeroDivision errors. This argument also determines the value of the pseudocount (5 by default).
|
|
439
|
+
# max_iteration: maximum number of iterations.
|
|
440
|
+
# change_for_stop: the algorithm stops when the change in size factor is smaller than this value (convergence criterion).
|
|
441
|
+
# verbose: whether you want to print guidance information when running the function.
|
|
442
|
+
# """
|
|
443
|
+
# if not inplace:
|
|
444
|
+
# protein_anndata = protein_anndata.copy()
|
|
445
|
+
|
|
446
|
+
# # TODO: FUNCTIONS IN CASE THE DATA IS SPARSE.
|
|
447
|
+
|
|
448
|
+
# x = protein_anndata.X
|
|
449
|
+
# x += pseudocount # To avoid zero-division, we add a pseudocount.
|
|
450
|
+
|
|
451
|
+
# assert not (log_transform_before and log_transform), "log_transform and log_transform_before cannot be both True"
|
|
452
|
+
# if log_transform_before:
|
|
453
|
+
# x = np.log(x)
|
|
454
|
+
|
|
455
|
+
# num_cells = x.shape[0]
|
|
456
|
+
# size_factor_history = []
|
|
457
|
+
|
|
458
|
+
# # KNN normalization.
|
|
459
|
+
# for iteration in range(max_iterations):
|
|
460
|
+
# size_factors = np.zeros(num_cells)
|
|
461
|
+
|
|
462
|
+
# for target_cell, neighbor_list in neighbors.items():
|
|
463
|
+
# neighbor_indices = np.array(neighbor_list)
|
|
464
|
+
# target_indices = np.full(len(neighbor_list), target_cell)
|
|
465
|
+
# ratios = x[neighbor_indices] / x[target_indices]
|
|
466
|
+
# proto_size_factors = np.median(ratios, axis=1)
|
|
467
|
+
|
|
468
|
+
# # After having collected the ratios for between the neighbor cells and the target cell, we calculate the average of those ratios. That will be the cell-specific size factor.
|
|
469
|
+
# if mean == "average":
|
|
470
|
+
# size_factor = np.mean(proto_size_factors)
|
|
471
|
+
# else:
|
|
472
|
+
# size_factor = stats.gmean(proto_size_factors)
|
|
473
|
+
# size_factors[target_cell] = size_factor
|
|
474
|
+
|
|
475
|
+
# # Now, we multiply the protein expression of each cell by its cell-specific factor.
|
|
476
|
+
# x *= size_factors[:, None]
|
|
477
|
+
|
|
478
|
+
# # Save this iteration's size factors. This is done mainly to compare with the previous iteration for the stopping criterion.
|
|
479
|
+
# size_factor_history.append(size_factors)
|
|
480
|
+
# if verbose:
|
|
481
|
+
# print("Iteration ", iteration + 1)
|
|
482
|
+
|
|
483
|
+
# # Unless it's the first iteration, check the algorithm stopping criterion: if all changes of size_factors are smaller than the "change_for_stop" value with respect to the previous iteration.
|
|
484
|
+
|
|
485
|
+
# if iteration > 0:
|
|
486
|
+
# biggest_size_factor_change = np.max(np.abs(size_factor_history[-1] - size_factor_history[-2]))
|
|
487
|
+
# if verbose:
|
|
488
|
+
# print("Change wrt previous iteration:", biggest_size_factor_change)
|
|
489
|
+
# if biggest_size_factor_change < change_for_stop:
|
|
490
|
+
# break
|
|
491
|
+
|
|
492
|
+
# if log_transform:
|
|
493
|
+
# x = np.log(x)
|
|
494
|
+
|
|
495
|
+
# if save_size_factors:
|
|
496
|
+
# total_size_factors = np.prod(
|
|
497
|
+
# np.array(size_factor_history), axis=0
|
|
498
|
+
# ) # Multiplication of the size factors across all iterations.
|
|
499
|
+
# protein_anndata.obs["size_factor"] = total_size_factors
|
|
500
|
+
# size_factor_history = np.array(size_factor_history).T
|
|
501
|
+
# protein_anndata.obsm["size_factor_history"] = size_factor_history
|
|
502
|
+
|
|
503
|
+
# protein_anndata.X = x
|
|
504
|
+
|
|
505
|
+
# return None if inplace else protein_anndata
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: knn-normalization
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Neighbor-based normalization of CITE-seq data
|
|
5
|
+
Project-URL: Documentation, https://knn-normalization.readthedocs.io/
|
|
6
|
+
Project-URL: Homepage, https://github.com/javier-marchena-hurtado/KNN_normalization
|
|
7
|
+
Project-URL: Source, https://github.com/javier-marchena-hurtado/KNN_normalization
|
|
8
|
+
Author: Javier Marchena Hurtado
|
|
9
|
+
Maintainer-email: Javier Marchena Hurtado <javier.marchena.hurtado@gmail.com>
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2025, Javier Marchena Hurtado
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
38
|
+
Requires-Python: >=3.10
|
|
39
|
+
Requires-Dist: anndata
|
|
40
|
+
Requires-Dist: mudata
|
|
41
|
+
Requires-Dist: numpy
|
|
42
|
+
Requires-Dist: scanpy
|
|
43
|
+
Requires-Dist: scipy
|
|
44
|
+
Requires-Dist: session-info2
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: build; extra == 'dev'
|
|
47
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
48
|
+
Requires-Dist: twine>=4.0.2; extra == 'dev'
|
|
49
|
+
Provides-Extra: doc
|
|
50
|
+
Requires-Dist: docutils!=0.18.*,!=0.19.*,>=0.8; extra == 'doc'
|
|
51
|
+
Requires-Dist: ipykernel; extra == 'doc'
|
|
52
|
+
Requires-Dist: ipython; extra == 'doc'
|
|
53
|
+
Requires-Dist: myst-nb>=1.1; extra == 'doc'
|
|
54
|
+
Requires-Dist: pandas; extra == 'doc'
|
|
55
|
+
Requires-Dist: setuptools; extra == 'doc'
|
|
56
|
+
Requires-Dist: sphinx-autodoc-typehints; extra == 'doc'
|
|
57
|
+
Requires-Dist: sphinx-book-theme>=1; extra == 'doc'
|
|
58
|
+
Requires-Dist: sphinx-copybutton; extra == 'doc'
|
|
59
|
+
Requires-Dist: sphinx-tabs; extra == 'doc'
|
|
60
|
+
Requires-Dist: sphinx>=4; extra == 'doc'
|
|
61
|
+
Requires-Dist: sphinxcontrib-bibtex>=1; extra == 'doc'
|
|
62
|
+
Requires-Dist: sphinxext-opengraph; extra == 'doc'
|
|
63
|
+
Provides-Extra: test
|
|
64
|
+
Requires-Dist: coverage; extra == 'test'
|
|
65
|
+
Requires-Dist: pytest; extra == 'test'
|
|
66
|
+
Description-Content-Type: text/markdown
|
|
67
|
+
|
|
68
|
+
# KNN normalization
|
|
69
|
+
|
|
70
|
+
[![Tests][badge-tests]][tests]
|
|
71
|
+
[![Documentation][badge-docs]][documentation]
|
|
72
|
+
|
|
73
|
+
[badge-tests]: https://img.shields.io/github/actions/workflow/status/javier-marchena-hurtado/KNN_normalization/test.yaml?branch=main
|
|
74
|
+
[badge-docs]: https://img.shields.io/readthedocs/KNN_normalization
|
|
75
|
+
|
|
76
|
+
<img src="https://raw.githubusercontent.com/javier-marchena-hurtado/KNN_normalization/main/images/KNN_normalization_logo.png" width="250">
|
|
77
|
+
|
|
78
|
+
## Background and motivation
|
|
79
|
+
|
|
80
|
+
KNN normalization is a normalization method for protein counts in CITE-seq data. KNN normalization learns from neighbor cells in a KNN graph in order to estimate the appropriate total protein counts in each cell. KNN normalization accurately estimates total protein counts while preserving biological information.
|
|
81
|
+
|
|
82
|
+
## Getting started
|
|
83
|
+
|
|
84
|
+
Please refer to the [documentation][],
|
|
85
|
+
in particular, the [API documentation][].
|
|
86
|
+
|
|
87
|
+
## Installation
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
<!--
|
|
91
|
+
1) Install the latest release of `KNN_normalization` from [PyPI][]:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install KNN_normalization
|
|
95
|
+
```
|
|
96
|
+
-->
|
|
97
|
+
Install the latest development version:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
pip install git+https://github.com/javier-marchena-hurtado/KNN_normalization.git@main
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Release notes
|
|
104
|
+
|
|
105
|
+
See the [changelog][].
|
|
106
|
+
|
|
107
|
+
## Contact
|
|
108
|
+
|
|
109
|
+
For questions and help requests, please open a [discussion][] on GitHub.
|
|
110
|
+
If you found a bug, please use the [issue tracker][].
|
|
111
|
+
|
|
112
|
+
## Citation
|
|
113
|
+
|
|
114
|
+
> t.b.a
|
|
115
|
+
|
|
116
|
+
[issue tracker]: https://github.com/javier-marchena-hurtado/KNN_normalization/issues
|
|
117
|
+
[tests]: https://github.com/javier-marchena-hurtado/KNN_normalization/actions/workflows/test.yaml
|
|
118
|
+
[documentation]: https://knn-normalization.readthedocs.io
|
|
119
|
+
[changelog]: https://knn-normalization.readthedocs.io/en/latest/changelog.html
|
|
120
|
+
[api documentation]: https://knn-normalization.readthedocs.io/en/latest/api.html
|
|
121
|
+
[pypi]: https://pypi.org/project/KNN_normalization
|
|
122
|
+
[discussion]: https://github.com/javier-marchena-hurtado/KNN_normalization/discussions
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
knn_normalization/__init__.py,sha256=91NobikqQ6ujn6mjyirBBBKGYosH-X2w4u2dtiTWsEQ,139
|
|
2
|
+
knn_normalization/pl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
knn_normalization/pl/plotting.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
knn_normalization/pp/__init__.py,sha256=8revjs6LifS_m7smQvlIS3EU3BJtG4VJedZ0MXGqJLY,80
|
|
5
|
+
knn_normalization/pp/preprocessing.py,sha256=xBrLokt7Nv5xbGjer_nQkUS-pjRBOuFF1STDKzbBmn8,1945
|
|
6
|
+
knn_normalization/tl/__init__.py,sha256=Sk8ckeXIjHWcuGowidNJduZPEc_UKRwtyeFwAtTzwyg,60
|
|
7
|
+
knn_normalization/tl/tools.py,sha256=tpXZqIiXskTpCaOlmnuvppwewyvmU-Z0ggdGxhwFp2s,20992
|
|
8
|
+
knn_normalization-0.1.0.dist-info/METADATA,sha256=NH25-tkPV-pmTJx16obt3XV-c-anzp2TpnRqHTYO5Eo,5035
|
|
9
|
+
knn_normalization-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
10
|
+
knn_normalization-0.1.0.dist-info/licenses/LICENSE,sha256=zL2KEEuPAgaYdVcUNXrCmgdBG_abnDl7zmgSJqcBoQw,1081
|
|
11
|
+
knn_normalization-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025, Javier Marchena Hurtado
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|