sclab 0.1.7__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sclab/__init__.py +3 -1
- sclab/_io.py +83 -12
- sclab/_methods_registry.py +65 -0
- sclab/_sclab.py +241 -21
- sclab/dataset/_dataset.py +4 -6
- sclab/dataset/processor/_processor.py +41 -19
- sclab/dataset/processor/_results_panel.py +94 -0
- sclab/dataset/processor/step/_processor_step_base.py +12 -6
- sclab/examples/processor_steps/__init__.py +8 -0
- sclab/examples/processor_steps/_cluster.py +2 -2
- sclab/examples/processor_steps/_differential_expression.py +329 -0
- sclab/examples/processor_steps/_doublet_detection.py +68 -0
- sclab/examples/processor_steps/_gene_expression.py +125 -0
- sclab/examples/processor_steps/_integration.py +116 -0
- sclab/examples/processor_steps/_neighbors.py +26 -6
- sclab/examples/processor_steps/_pca.py +13 -8
- sclab/examples/processor_steps/_preprocess.py +52 -25
- sclab/examples/processor_steps/_qc.py +24 -8
- sclab/examples/processor_steps/_umap.py +2 -2
- sclab/gui/__init__.py +0 -0
- sclab/gui/components/__init__.py +7 -0
- sclab/gui/components/_guided_pseudotime.py +482 -0
- sclab/gui/components/_transfer_metadata.py +186 -0
- sclab/methods/__init__.py +50 -0
- sclab/preprocess/__init__.py +26 -0
- sclab/preprocess/_cca.py +176 -0
- sclab/preprocess/_cca_integrate.py +109 -0
- sclab/preprocess/_filter_obs.py +42 -0
- sclab/preprocess/_harmony.py +421 -0
- sclab/preprocess/_harmony_integrate.py +53 -0
- sclab/preprocess/_normalize_weighted.py +65 -0
- sclab/preprocess/_pca.py +51 -0
- sclab/preprocess/_preprocess.py +155 -0
- sclab/preprocess/_qc.py +38 -0
- sclab/preprocess/_rpca.py +116 -0
- sclab/preprocess/_subset.py +208 -0
- sclab/preprocess/_transfer_metadata.py +196 -0
- sclab/preprocess/_transform.py +82 -0
- sclab/preprocess/_utils.py +96 -0
- sclab/scanpy/__init__.py +0 -0
- sclab/scanpy/_compat.py +92 -0
- sclab/scanpy/_settings.py +526 -0
- sclab/scanpy/logging.py +290 -0
- sclab/scanpy/plotting/__init__.py +0 -0
- sclab/scanpy/plotting/_rcmod.py +73 -0
- sclab/scanpy/plotting/palettes.py +221 -0
- sclab/scanpy/readwrite.py +1108 -0
- sclab/tools/__init__.py +0 -0
- sclab/tools/cellflow/__init__.py +0 -0
- sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
- sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
- sclab/tools/cellflow/pseudotime/__init__.py +0 -0
- sclab/tools/cellflow/pseudotime/_pseudotime.py +336 -0
- sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
- sclab/tools/cellflow/utils/__init__.py +0 -0
- sclab/tools/cellflow/utils/density_nd.py +215 -0
- sclab/tools/cellflow/utils/interpolate.py +334 -0
- sclab/tools/cellflow/utils/periodic_genes.py +106 -0
- sclab/tools/cellflow/utils/smoothen.py +124 -0
- sclab/tools/cellflow/utils/times.py +55 -0
- sclab/tools/differential_expression/__init__.py +7 -0
- sclab/tools/differential_expression/_pseudobulk_edger.py +309 -0
- sclab/tools/differential_expression/_pseudobulk_helpers.py +290 -0
- sclab/tools/differential_expression/_pseudobulk_limma.py +257 -0
- sclab/tools/doublet_detection/__init__.py +5 -0
- sclab/tools/doublet_detection/_scrublet.py +64 -0
- sclab/tools/embedding/__init__.py +0 -0
- sclab/tools/imputation/__init__.py +0 -0
- sclab/tools/imputation/_alra.py +135 -0
- sclab/tools/labeling/__init__.py +6 -0
- sclab/tools/labeling/sctype.py +233 -0
- sclab/tools/utils/__init__.py +5 -0
- sclab/tools/utils/_aggregate_and_filter.py +290 -0
- sclab/utils/__init__.py +5 -0
- sclab/utils/_write_excel.py +510 -0
- {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/METADATA +29 -12
- sclab-0.3.4.dist-info/RECORD +93 -0
- {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/WHEEL +1 -1
- sclab-0.3.4.dist-info/licenses/LICENSE +29 -0
- sclab-0.1.7.dist-info/RECORD +0 -30
sclab/scanpy/readwrite.py
@@ -0,0 +1,1108 @@
+"""Reading and Writing"""
+
+from __future__ import annotations
+
+import json
+from enum import Enum
+from pathlib import Path, PurePath
+from typing import TYPE_CHECKING
+
+import anndata.utils
+import h5py
+import numpy as np
+import pandas as pd
+from packaging.version import Version
+
+if Version(anndata.__version__) >= Version("0.11.0rc2"):
+    from anndata.io import (
+        read_csv,
+        read_excel,
+        read_h5ad,
+        read_hdf,
+        read_loom,
+        read_mtx,
+        read_text,
+    )
+else:
+    from anndata import (
+        read_csv,
+        read_excel,
+        read_h5ad,
+        read_hdf,
+        read_loom,
+        read_mtx,
+        read_text,
+    )
+from anndata import AnnData
+from matplotlib.image import imread
+
+from . import logging as logg
+from ._compat import old_positionals
+from ._settings import settings
+
+
+class Empty(Enum):
+    token = 0
+
+    def __repr__(self) -> str:
+        return "_empty"
+
+
+_empty = Empty.token
+
+
+if TYPE_CHECKING:
+    from typing import BinaryIO, Literal
+
+# from ._utils import Empty
+
+# .gz and .bz2 suffixes are also allowed for text formats
+text_exts = {
+    "csv",
+    "tsv",
+    "tab",
+    "data",
+    "txt",  # these four are all equivalent
+}
+avail_exts = {
+    "anndata",
+    "xlsx",
+    "h5",
+    "h5ad",
+    "mtx",
+    "mtx.gz",
+    "soft.gz",
+    "loom",
+} | text_exts
+"""Available file formats for reading data."""
+
+
+# --------------------------------------------------------------------------------
+# Reading and Writing data files and AnnData objects
+# --------------------------------------------------------------------------------
+
+
+@old_positionals(
+    "sheet",
+    "ext",
+    "delimiter",
+    "first_column_names",
+    "backup_url",
+    "cache",
+    "cache_compression",
+)
+def read(
+    filename: Path | str,
+    backed: Literal["r", "r+"] | None = None,
+    *,
+    sheet: str | None = None,
+    ext: str | None = None,
+    delimiter: str | None = None,
+    first_column_names: bool = False,
+    backup_url: str | None = None,
+    cache: bool = False,
+    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
+    **kwargs,
+) -> AnnData:
+    """\
+    Read file and return :class:`~anndata.AnnData` object.
+
+    To speed up reading, consider passing ``cache=True``, which creates an hdf5
+    cache file.
+
+    Parameters
+    ----------
+    filename
+        If the filename has no file extension, it is interpreted as a key for
+        generating a filename via ``sc.settings.writedir / (filename +
+        sc.settings.file_format_data)``. This is the same behavior as in
+        ``sc.read(filename, ...)``.
+    backed
+        If ``'r'``, load :class:`~anndata.AnnData` in ``backed`` mode instead
+        of fully loading it into memory (`memory` mode). If you want to modify
+        backed attributes of the AnnData object, you need to choose ``'r+'``.
+    sheet
+        Name of sheet/table in hdf5 or Excel file.
+    ext
+        Extension that indicates the file type. If ``None``, uses extension of
+        filename.
+    delimiter
+        Delimiter that separates data within text file. If ``None``, will split at
+        arbitrary number of white spaces, which is different from enforcing
+        splitting at any single white space ``' '``.
+    first_column_names
+        Assume the first column stores row names. This is only necessary if
+        these are not strings: strings in the first column are automatically
+        assumed to be row names.
+    backup_url
+        Retrieve the file from an URL if not present on disk.
+    cache
+        If `False`, read from source, if `True`, read from fast 'h5ad' cache.
+    cache_compression
+        See the h5py :ref:`dataset_compression`.
+        (Default: `settings.cache_compression`)
+    kwargs
+        Parameters passed to :func:`~anndata.io.read_loom`.
+
+    Returns
+    -------
+    An :class:`~anndata.AnnData` object
+    """
+    filename = Path(filename)  # allow passing strings
+    if is_valid_filename(filename):
+        return _read(
+            filename,
+            backed=backed,
+            sheet=sheet,
+            ext=ext,
+            delimiter=delimiter,
+            first_column_names=first_column_names,
+            backup_url=backup_url,
+            cache=cache,
+            cache_compression=cache_compression,
+            **kwargs,
+        )
+    # generate filename and read to dict
+    filekey = str(filename)
+    filename = settings.writedir / (filekey + "." + settings.file_format_data)
+    if not filename.exists():
+        raise ValueError(
+            f"Reading with filekey {filekey!r} failed, "
+            f"the inferred filename {filename!r} does not exist. "
+            "If you intended to provide a filename, either use a filename "
+            f"ending on one of the available extensions {avail_exts} "
+            "or pass the parameter `ext`."
+        )
+    return read_h5ad(filename, backed=backed)
+
+
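For orientation, a minimal usage sketch of the `read` entry point above (an editor's illustration, not part of the diff; the file name `counts.csv` is hypothetical, and the module path follows the wheel layout `sclab/scanpy/readwrite.py`):

```python
from sclab.scanpy.readwrite import read

# The first call parses the text file and, with cache=True, writes an .h5ad
# cache under settings.cachedir; the second call reads that cache instead.
adata = read("counts.csv", first_column_names=True, cache=True)
adata = read("counts.csv", cache=True)
```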
+@old_positionals("genome", "gex_only", "backup_url")
+def read_10x_h5(
+    filename: Path | str,
+    *,
+    genome: str | None = None,
+    gex_only: bool = True,
+    backup_url: str | None = None,
+) -> AnnData:
+    """\
+    Read 10x-Genomics-formatted hdf5 file.
+
+    Parameters
+    ----------
+    filename
+        Path to a 10x hdf5 file.
+    genome
+        Filter expression to genes within this genome. For legacy 10x h5
+        files, this must be provided if the data contains more than one genome.
+    gex_only
+        Only keep 'Gene Expression' data and ignore other feature types,
+        e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'
+    backup_url
+        Retrieve the file from an URL if not present on disk.
+
+    Returns
+    -------
+    Annotated data matrix, where observations/cells are named by their
+    barcode and variables/genes by gene name. Stores the following information:
+
+    :attr:`~anndata.AnnData.X`
+        The data matrix is stored
+    :attr:`~anndata.AnnData.obs_names`
+        Cell names
+    :attr:`~anndata.AnnData.var_names`
+        Gene names for a feature barcode matrix, probe names for a probe bc matrix
+    :attr:`~anndata.AnnData.var`\\ `['gene_ids']`
+        Gene IDs
+    :attr:`~anndata.AnnData.var`\\ `['feature_types']`
+        Feature types
+    :attr:`~anndata.AnnData.obs`\\ `[filtered_barcodes]`
+        filtered barcodes if present in the matrix
+    :attr:`~anndata.AnnData.var`
+        Any additional metadata present in /matrix/features is read in.
+    """
+    start = logg.info(f"reading {filename}")
+    is_present = _check_datafile_present_and_download(filename, backup_url=backup_url)
+    if not is_present:
+        logg.debug(f"... did not find original file {filename}")
+    with h5py.File(str(filename), "r") as f:
+        v3 = "/matrix" in f
+    if v3:
+        adata = _read_v3_10x_h5(filename, start=start)
+        if genome:
+            if genome not in adata.var["genome"].values:
+                raise ValueError(
+                    f"Could not find data corresponding to genome '{genome}' in '{filename}'. "
+                    f"Available genomes are: {list(adata.var['genome'].unique())}."
+                )
+            adata = adata[:, adata.var["genome"] == genome]
+        if gex_only:
+            adata = adata[:, adata.var["feature_types"] == "Gene Expression"]
+        if adata.is_view:
+            adata = adata.copy()
+    else:
+        adata = _read_legacy_10x_h5(filename, genome=genome, start=start)
+    return adata
+
+
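A short sketch of calling `read_10x_h5` (illustrative only; the file name is hypothetical and the `feature_types` check assumes a Cell Ranger v3+ file):

```python
from sclab.scanpy.readwrite import read_10x_h5

# gex_only=True drops 'Antibody Capture' and other non-expression features.
adata = read_10x_h5("filtered_feature_bc_matrix.h5", gex_only=True)
assert set(adata.var["feature_types"]) == {"Gene Expression"}
```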
+def _read_legacy_10x_h5(filename, *, genome=None, start=None):
+    """
+    Read hdf5 file from Cell Ranger v2 or earlier versions.
+    """
+    with h5py.File(str(filename), "r") as f:
+        try:
+            children = list(f.keys())
+            if not genome:
+                if len(children) > 1:
+                    raise ValueError(
+                        f"'{filename}' contains more than one genome. For legacy 10x h5 "
+                        "files you must specify the genome if more than one is present. "
+                        f"Available genomes are: {children}"
+                    )
+                genome = children[0]
+            elif genome not in children:
+                raise ValueError(
+                    f"Could not find genome '{genome}' in '{filename}'. "
+                    f"Available genomes are: {children}"
+                )
+
+            dsets = {}
+            _collect_datasets(dsets, f[genome])
+
+            # AnnData works with csr matrices
+            # 10x stores the transposed data, so we do the transposition right away
+            from scipy.sparse import csr_matrix
+
+            M, N = dsets["shape"]
+            data = dsets["data"]
+            if dsets["data"].dtype == np.dtype("int32"):
+                data = dsets["data"].view("float32")
+                data[:] = dsets["data"]
+            matrix = csr_matrix(
+                (data, dsets["indices"], dsets["indptr"]),
+                shape=(N, M),
+            )
+            # the csc matrix is automatically the transposed csr matrix
+            # as scanpy expects it, so, no need for a further transposition
+            adata = AnnData(
+                matrix,
+                obs=dict(obs_names=dsets["barcodes"].astype(str)),
+                var=dict(
+                    var_names=dsets["gene_names"].astype(str),
+                    gene_ids=dsets["genes"].astype(str),
+                ),
+            )
+            logg.info("", time=start)
+            return adata
+        except KeyError:
+            raise Exception("File is missing one or more required datasets.")
+
+
+def _collect_datasets(dsets: dict, group: h5py.Group):
+    for k, v in group.items():
+        if isinstance(v, h5py.Dataset):
+            dsets[k] = v[()]
+        else:
+            _collect_datasets(dsets, v)
+
+
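The `view("float32")` / in-place assignment pair in `_read_legacy_10x_h5` above (and again in `_read_v3_10x_h5` below) converts the int32 counts to float32 without allocating a second array: both dtypes are 4 bytes wide, so the float view reuses the same buffer and the elementwise assignment overwrites it. A standalone sketch of that trick, illustrative only:

```python
import numpy as np

counts = np.array([1, 2, 3], dtype=np.int32)
floats = counts.view(np.float32)  # reinterpret the same 4-byte buffer (values are garbage at this point)
floats[:] = counts                # elementwise cast writes float32 values in place; clobbers `counts`
assert floats.dtype == np.float32 and floats.tolist() == [1.0, 2.0, 3.0]
```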
+def _read_v3_10x_h5(filename, *, start=None):
+    """
+    Read hdf5 file from Cell Ranger v3 or later versions.
+    """
+    with h5py.File(str(filename), "r") as f:
+        try:
+            dsets = {}
+            _collect_datasets(dsets, f["matrix"])
+
+            from scipy.sparse import csr_matrix
+
+            M, N = dsets["shape"]
+            data = dsets["data"]
+            if dsets["data"].dtype == np.dtype("int32"):
+                data = dsets["data"].view("float32")
+                data[:] = dsets["data"]
+            matrix = csr_matrix(
+                (data, dsets["indices"], dsets["indptr"]),
+                shape=(N, M),
+            )
+            obs_dict = {"obs_names": dsets["barcodes"].astype(str)}
+            var_dict = {"var_names": dsets["name"].astype(str)}
+
+            if "gene_id" not in dsets:
+                # Read metadata specific to a feature-barcode matrix
+                var_dict["gene_ids"] = dsets["id"].astype(str)
+            else:
+                # Read metadata specific to a probe-barcode matrix
+                var_dict.update(
+                    {
+                        "gene_ids": dsets["gene_id"].astype(str),
+                        "probe_ids": dsets["id"].astype(str),
+                    }
+                )
+            var_dict["feature_types"] = dsets["feature_type"].astype(str)
+            if "filtered_barcodes" in f["matrix"]:
+                obs_dict["filtered_barcodes"] = dsets["filtered_barcodes"].astype(bool)
+
+            if "features" in f["matrix"]:
+                var_dict.update(
+                    (
+                        feature_metadata_name,
+                        dsets[feature_metadata_name].astype(
+                            bool if feature_metadata_item.dtype.kind == "b" else str
+                        ),
+                    )
+                    for feature_metadata_name, feature_metadata_item in f["matrix"][
+                        "features"
+                    ].items()
+                    if isinstance(feature_metadata_item, h5py.Dataset)
+                    and feature_metadata_name
+                    not in [
+                        "name",
+                        "feature_type",
+                        "id",
+                        "gene_id",
+                        "_all_tag_keys",
+                    ]
+                )
+            else:
+                raise ValueError("10x h5 has no features group")
+            adata = AnnData(
+                matrix,
+                obs=obs_dict,
+                var=var_dict,
+            )
+            logg.info("", time=start)
+            return adata
+        except KeyError:
+            raise Exception("File is missing one or more required datasets.")
+
+
+def read_visium(
+    path: Path | str,
+    genome: str | None = None,
+    *,
+    count_file: str = "filtered_feature_bc_matrix.h5",
+    library_id: str | None = None,
+    load_images: bool | None = True,
+    source_image_path: Path | str | None = None,
+) -> AnnData:
+    """\
+    Read 10x-Genomics-formatted visium dataset.
+
+    In addition to reading regular 10x output,
+    this looks for the `spatial` folder and loads images,
+    coordinates and scale factors.
+    Based on the `Space Ranger output docs`_.
+
+    See :func:`~scanpy.pl.spatial` for a compatible plotting function.
+
+    .. _Space Ranger output docs: https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/overview
+
+    Parameters
+    ----------
+    path
+        Path to directory for visium datafiles.
+    genome
+        Filter expression to genes within this genome.
+    count_file
+        Which file in the passed directory to use as the count file. Typically would be one of:
+        'filtered_feature_bc_matrix.h5' or 'raw_feature_bc_matrix.h5'.
+    library_id
+        Identifier for the visium library. Can be modified when concatenating multiple adata objects.
+    source_image_path
+        Path to the high-resolution tissue image. Path will be included in
+        `.uns["spatial"][library_id]["metadata"]["source_image_path"]`.
+
+    Returns
+    -------
+    Annotated data matrix, where observations/cells are named by their
+    barcode and variables/genes by gene name. Stores the following information:
+
+    :attr:`~anndata.AnnData.X`
+        The data matrix is stored
+    :attr:`~anndata.AnnData.obs_names`
+        Cell names
+    :attr:`~anndata.AnnData.var_names`
+        Gene names for a feature barcode matrix, probe names for a probe bc matrix
+    :attr:`~anndata.AnnData.var`\\ `['gene_ids']`
+        Gene IDs
+    :attr:`~anndata.AnnData.var`\\ `['feature_types']`
+        Feature types
+    :attr:`~anndata.AnnData.obs`\\ `[filtered_barcodes]`
+        filtered barcodes if present in the matrix
+    :attr:`~anndata.AnnData.var`
+        Any additional metadata present in /matrix/features is read in.
+    :attr:`~anndata.AnnData.uns`\\ `['spatial']`
+        Dict of spaceranger output files with 'library_id' as key
+    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['images']`
+        Dict of images (`'hires'` and `'lowres'`)
+    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['scalefactors']`
+        Scale factors for the spots
+    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['metadata']`
+        Files metadata: 'chemistry_description', 'software_version', 'source_image_path'
+    :attr:`~anndata.AnnData.obsm`\\ `['spatial']`
+        Spatial spot coordinates, usable as `basis` by :func:`~scanpy.pl.embedding`.
+    """
+    path = Path(path)
+    adata = read_10x_h5(path / count_file, genome=genome)
+
+    adata.uns["spatial"] = dict()
+
+    from h5py import File
+
+    with File(path / count_file, mode="r") as f:
+        attrs = dict(f.attrs)
+    if library_id is None:
+        library_id = str(attrs.pop("library_ids")[0], "utf-8")
+
+    adata.uns["spatial"][library_id] = dict()
+
+    if load_images:
+        tissue_positions_file = (
+            path / "spatial/tissue_positions.csv"
+            if (path / "spatial/tissue_positions.csv").exists()
+            else path / "spatial/tissue_positions_list.csv"
+        )
+        files = dict(
+            tissue_positions_file=tissue_positions_file,
+            scalefactors_json_file=path / "spatial/scalefactors_json.json",
+            hires_image=path / "spatial/tissue_hires_image.png",
+            lowres_image=path / "spatial/tissue_lowres_image.png",
+        )
+
+        # check if files exist, continue if images are missing
+        for f in files.values():
+            if not f.exists():
+                if any(x in str(f) for x in ["hires_image", "lowres_image"]):
+                    logg.warning(
+                        f"You seem to be missing an image file.\nCould not find '{f}'."
+                    )
+                else:
+                    raise OSError(f"Could not find '{f}'")
+
+        adata.uns["spatial"][library_id]["images"] = dict()
+        for res in ["hires", "lowres"]:
+            try:
+                adata.uns["spatial"][library_id]["images"][res] = imread(
+                    str(files[f"{res}_image"])
+                )
+            except Exception:
+                raise OSError(f"Could not find '{res}_image'")
+
+        # read json scalefactors
+        adata.uns["spatial"][library_id]["scalefactors"] = json.loads(
+            files["scalefactors_json_file"].read_bytes()
+        )
+
+        adata.uns["spatial"][library_id]["metadata"] = {
+            k: (str(attrs[k], "utf-8") if isinstance(attrs[k], bytes) else attrs[k])
+            for k in ("chemistry_description", "software_version")
+            if k in attrs
+        }
+
+        # read coordinates
+        positions = pd.read_csv(
+            files["tissue_positions_file"],
+            header=0 if tissue_positions_file.name == "tissue_positions.csv" else None,
+            index_col=0,
+        )
+        positions.columns = [
+            "in_tissue",
+            "array_row",
+            "array_col",
+            "pxl_col_in_fullres",
+            "pxl_row_in_fullres",
+        ]
+
+        adata.obs = adata.obs.join(positions, how="left")
+
+        adata.obsm["spatial"] = adata.obs[
+            ["pxl_row_in_fullres", "pxl_col_in_fullres"]
+        ].to_numpy()
+        adata.obs.drop(
+            columns=["pxl_row_in_fullres", "pxl_col_in_fullres"],
+            inplace=True,
+        )
+
+        # put image path in uns
+        if source_image_path is not None:
+            # get an absolute path
+            source_image_path = str(Path(source_image_path).resolve())
+            adata.uns["spatial"][library_id]["metadata"]["source_image_path"] = str(
+                source_image_path
+            )
+
+    return adata
+
+
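A usage sketch for `read_visium` (illustrative; `outs/` is a hypothetical Space Ranger output directory containing `filtered_feature_bc_matrix.h5` and a `spatial/` folder):

```python
from sclab.scanpy.readwrite import read_visium

adata = read_visium("outs/")
library_id = next(iter(adata.uns["spatial"]))
print(adata.obsm["spatial"].shape)                       # (n_spots, 2) pixel coordinates
print(adata.uns["spatial"][library_id]["scalefactors"])  # spot/image scale factors
```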
+@old_positionals("var_names", "make_unique", "cache", "cache_compression", "gex_only")
+def read_10x_mtx(
+    path: Path | str,
+    *,
+    var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols",
+    make_unique: bool = True,
+    cache: bool = False,
+    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
+    gex_only: bool = True,
+    prefix: str | None = None,
+) -> AnnData:
+    """\
+    Read 10x-Genomics-formatted mtx directory.
+
+    Parameters
+    ----------
+    path
+        Path to directory for `.mtx` and `.tsv` files,
+        e.g. './filtered_gene_bc_matrices/hg19/'.
+    var_names
+        The variables index.
+    make_unique
+        Whether to make the variables index unique by appending '-1',
+        '-2' etc. or not.
+    cache
+        If `False`, read from source, if `True`, read from fast 'h5ad' cache.
+    cache_compression
+        See the h5py :ref:`dataset_compression`.
+        (Default: `settings.cache_compression`)
+    gex_only
+        Only keep 'Gene Expression' data and ignore other feature types,
+        e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'
+    prefix
+        Any prefix before `matrix.mtx`, `genes.tsv` and `barcodes.tsv`. For instance,
+        if the files are named `patientA_matrix.mtx`, `patientA_genes.tsv` and
+        `patientA_barcodes.tsv` the prefix is `patientA_`.
+        (Default: no prefix)
+
+    Returns
+    -------
+    An :class:`~anndata.AnnData` object
+    """
+    path = Path(path)
+    prefix = "" if prefix is None else prefix
+    is_legacy = (path / f"{prefix}genes.tsv").is_file()
+    adata = _read_10x_mtx(
+        path,
+        var_names=var_names,
+        make_unique=make_unique,
+        cache=cache,
+        cache_compression=cache_compression,
+        prefix=prefix,
+        is_legacy=is_legacy,
+    )
+    if is_legacy or not gex_only:
+        return adata
+    gex_rows = adata.var["feature_types"] == "Gene Expression"
+    return adata[:, gex_rows].copy()
+
+
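A usage sketch for `read_10x_mtx` (illustrative; the directory name is hypothetical):

```python
from sclab.scanpy.readwrite import read_10x_mtx

# The legacy (genes.tsv) vs. v3 (features.tsv.gz) layout is detected
# automatically from the presence of a genes.tsv file.
adata = read_10x_mtx("filtered_feature_bc_matrix/", var_names="gene_ids")
print(adata.var["gene_symbols"].head())
```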
+def _read_10x_mtx(
+    path: Path,
+    *,
+    var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols",
+    make_unique: bool = True,
+    cache: bool = False,
+    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
+    prefix: str = "",
+    is_legacy: bool,
+) -> AnnData:
+    """
+    Read mex output from Cell Ranger v2- or v3+.
+    """
+    suffix = "" if is_legacy else ".gz"
+    adata = read(
+        path / f"{prefix}matrix.mtx{suffix}",
+        cache=cache,
+        cache_compression=cache_compression,
+    ).T  # transpose the data
+    genes = pd.read_csv(
+        path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
+        header=None,
+        sep="\t",
+    )
+    if var_names == "gene_symbols":
+        var_names_idx = pd.Index(genes[1].values)
+        if make_unique:
+            var_names_idx = anndata.utils.make_index_unique(var_names_idx)
+        adata.var_names = var_names_idx
+        adata.var["gene_ids"] = genes[0].values
+    elif var_names == "gene_ids":
+        adata.var_names = genes[0].values
+        adata.var["gene_symbols"] = genes[1].values
+    else:
+        raise ValueError("`var_names` needs to be 'gene_symbols' or 'gene_ids'")
+    if not is_legacy:
+        adata.var["feature_types"] = genes[2].values
+    barcodes = pd.read_csv(path / f"{prefix}barcodes.tsv{suffix}", header=None)
+    adata.obs_names = barcodes[0].values
+    return adata
+
+
+@old_positionals("ext", "compression", "compression_opts")
+def write(
+    filename: Path | str,
+    adata: AnnData,
+    *,
+    ext: Literal["h5", "csv", "txt", "npz"] | None = None,
+    compression: Literal["gzip", "lzf"] | None = "gzip",
+    compression_opts: int | None = None,
+):
+    """\
+    Write :class:`~anndata.AnnData` objects to file.
+
+    Parameters
+    ----------
+    filename
+        If the filename has no file extension, it is interpreted as a key for
+        generating a filename via `sc.settings.writedir / (filename +
+        sc.settings.file_format_data)`. This is the same behavior as in
+        :func:`~scanpy.read`.
+    adata
+        Annotated data matrix.
+    ext
+        File extension from which to infer file format. If `None`, defaults to
+        `sc.settings.file_format_data`.
+    compression
+        See https://docs.h5py.org/en/latest/high/dataset.html.
+    compression_opts
+        See https://docs.h5py.org/en/latest/high/dataset.html.
+    """
+    filename = Path(filename)  # allow passing strings
+    if is_valid_filename(filename):
+        filename = filename
+        ext_ = is_valid_filename(filename, return_ext=True)
+        if ext is None:
+            ext = ext_
+        elif ext != ext_:
+            raise ValueError(
+                "It suffices to provide the file type by "
+                "providing a proper extension to the filename. "
+                'One of "txt", "csv", "h5" or "npz".'
+            )
+    else:
+        key = filename
+        ext = settings.file_format_data if ext is None else ext
+        filename = _get_filename_from_key(key, ext)
+    if ext == "csv":
+        adata.write_csvs(filename)
+    else:
+        adata.write(
+            filename, compression=compression, compression_opts=compression_opts
+        )
+
+
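A minimal sketch of the two `write` code paths above (illustrative; the file names are hypothetical and the toy AnnData stands in for a real dataset):

```python
import numpy as np
from anndata import AnnData
from sclab.scanpy.readwrite import write

adata = AnnData(np.ones((3, 2), dtype=np.float32))  # toy object
write("pbmc.h5ad", adata)       # extension present: format inferred from it
write("pbmc_processed", adata)  # no extension: key resolved under settings.writedir
```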
+# -------------------------------------------------------------------------------
+# Reading and writing parameter files
+# -------------------------------------------------------------------------------
+
+
+@old_positionals("as_header")
+def read_params(
+    filename: Path | str, *, as_header: bool = False
+) -> dict[str, int | float | bool | str | None]:
+    """\
+    Read parameter dictionary from text file.
+
+    Assumes that parameters are specified in the format::
+
+        par1 = value1
+        par2 = value2
+
+    Comments that start with '#' are allowed.
+
+    Parameters
+    ----------
+    filename
+        Filename of data file.
+    as_header
+        Read the dictionary from the header (comment section) of a file.
+
+    Returns
+    -------
+    Dictionary that stores parameters.
+    """
+    filename = Path(filename)  # allow passing str objects
+    from collections import OrderedDict
+
+    params = OrderedDict([])
+    for line in filename.open():
+        if "=" in line and (not as_header or line.startswith("#")):
+            line = line[1:] if line.startswith("#") else line
+            key, val = line.split("=")
+            key = key.strip()
+            val = val.strip()
+            params[key] = convert_string(val)
+    return params
+
+
+def write_params(path: Path | str, *args, **maps):
+    """\
+    Write parameters to file, so that it's readable by read_params.
+
+    Uses INI file format.
+    """
+    path = Path(path)
+    if not path.parent.is_dir():
+        path.parent.mkdir(parents=True)
+    if len(args) == 1:
+        maps[None] = args[0]
+    with path.open("w") as f:
+        for header, map in maps.items():
+            if header is not None:
+                f.write(f"[{header}]\n")
+            for key, val in map.items():
+                f.write(f"{key} = {val}\n")
+
+
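A round-trip sketch for the parameter-file pair above (illustrative; the file name is hypothetical). Values are written as `key = value` lines and parsed back through `convert_string`, so ints, floats, and booleans survive the round trip:

```python
from pathlib import Path
from sclab.scanpy.readwrite import read_params, write_params

write_params(Path("run.params"), {"n_pcs": 50, "resolution": 1.0, "use_hvg": True})
params = read_params("run.params")
assert params == {"n_pcs": 50, "resolution": 1.0, "use_hvg": True}
```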
+# -------------------------------------------------------------------------------
+# Reading and Writing data files
+# -------------------------------------------------------------------------------
+
+
+def _read(
+    filename: Path,
+    *,
+    backed=None,
+    sheet=None,
+    ext=None,
+    delimiter=None,
+    first_column_names=None,
+    backup_url=None,
+    cache=False,
+    cache_compression=None,
+    suppress_cache_warning=False,
+    **kwargs,
+):
+    if ext is not None and ext not in avail_exts:
+        raise ValueError(
+            f"Please provide one of the available extensions.\n{avail_exts}"
+        )
+    else:
+        ext = is_valid_filename(filename, return_ext=True)
+    is_present = _check_datafile_present_and_download(filename, backup_url=backup_url)
+    if not is_present:
+        logg.debug(f"... did not find original file {filename}")
+    # read hdf5 files
+    if ext in {"h5", "h5ad"}:
+        if sheet is None:
+            return read_h5ad(filename, backed=backed)
+        else:
+            logg.debug(f"reading sheet {sheet} from file {filename}")
+            return read_hdf(filename, sheet)
+    # read other file types
+    path_cache: Path = settings.cachedir / _slugify(filename).replace(
+        f".{ext}", ".h5ad"
+    )
+    if path_cache.suffix in {".gz", ".bz2"}:
+        path_cache = path_cache.with_suffix("")
+    if cache and path_cache.is_file():
+        logg.info(f"... reading from cache file {path_cache}")
+        return read_h5ad(path_cache)
+
+    if not is_present:
+        raise FileNotFoundError(f"Did not find file {filename}.")
+    logg.debug(f"reading {filename}")
+    if not cache and not suppress_cache_warning:
+        logg.hint(
+            "This might be very slow. Consider passing `cache=True`, "
+            "which enables much faster reading from a cache file."
+        )
+    # do the actual reading
+    if ext == "xlsx" or ext == "xls":
+        if sheet is None:
+            raise ValueError("Provide `sheet` parameter when reading '.xlsx' files.")
+        else:
+            adata = read_excel(filename, sheet)
+    elif ext in {"mtx", "mtx.gz"}:
+        adata = read_mtx(filename)
+    elif ext == "csv":
+        if delimiter is None:
+            delimiter = ","
+        adata = read_csv(
+            filename, first_column_names=first_column_names, delimiter=delimiter
+        )
+    elif ext in {"txt", "tab", "data", "tsv"}:
+        if ext == "data":
+            logg.hint(
+                "... assuming '.data' means tab or white-space separated text file",
+            )
+            logg.hint("change this by passing `ext` to sc.read")
+        adata = read_text(filename, delimiter, first_column_names)
+    elif ext == "soft.gz":
+        adata = _read_softgz(filename)
+    elif ext == "loom":
+        adata = read_loom(filename=filename, **kwargs)
+    else:
+        raise ValueError(f"Unknown extension {ext}.")
+    if cache:
+        logg.info(
+            f"... writing an {settings.file_format_data} "
+            "cache file to speedup reading next time"
+        )
+        if cache_compression is _empty:
+            cache_compression = settings.cache_compression
+        if not path_cache.parent.is_dir():
+            path_cache.parent.mkdir(parents=True)
+        # write for faster reading when calling the next time
+        adata.write(path_cache, compression=cache_compression)
+    return adata
+
+
+def _slugify(path: str | PurePath) -> str:
+    """Make a path into a filename."""
+    if not isinstance(path, PurePath):
+        path = PurePath(path)
+    parts = list(path.parts)
+    if parts[0] == "/":
+        parts.pop(0)
+    elif len(parts[0]) == 3 and parts[0][1:] == ":\\":
+        parts[0] = parts[0][0]  # C:\ → C
+    filename = "-".join(parts)
+    assert "/" not in filename, filename
+    assert not filename[1:].startswith(":"), filename
+    return filename
+
+
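A quick check of what `_slugify` produces (illustrative; assumes a POSIX system, where the leading `/` is dropped and separators become dashes; this is how `_read` derives its cache-file name under `settings.cachedir`):

```python
from sclab.scanpy.readwrite import _slugify

assert _slugify("/tmp/data/counts.csv") == "tmp-data-counts.csv"
```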
+def _read_softgz(filename: str | bytes | Path | BinaryIO) -> AnnData:
+    """\
+    Read a SOFT format data file.
+
+    The SOFT format is documented here
+    https://www.ncbi.nlm.nih.gov/geo/info/soft.html.
+
+    Notes
+    -----
+    The function is based on a script by Kerby Shedden.
+    https://dept.stat.lsa.umich.edu/~kshedden/Python-Workshop/gene_expression_comparison.html
+    """
+    import gzip
+
+    with gzip.open(filename, mode="rt") as file:
+        # The header part of the file contains information about the
+        # samples. Read that information first.
+        samples_info = {}
+        for line in file:
+            if line.startswith("!dataset_table_begin"):
+                break
+            elif line.startswith("!subset_description"):
+                subset_description = line.split("=")[1].strip()
+            elif line.startswith("!subset_sample_id"):
+                subset_ids = line.split("=")[1].split(",")
+                subset_ids = [x.strip() for x in subset_ids]
+                for k in subset_ids:
+                    samples_info[k] = subset_description
+        # Next line is the column headers (sample id's)
+        sample_names = file.readline().strip().split("\t")
+        # The column indices that contain gene expression data
+        indices = [i for i, x in enumerate(sample_names) if x.startswith("GSM")]
+        # Restrict the column headers to those that we keep
+        sample_names = [sample_names[i] for i in indices]
+        # Get a list of sample labels
+        groups = [samples_info[k] for k in sample_names]
+        # Read the gene expression data as a list of lists, also get the gene
+        # identifiers
+        gene_names, X = [], []
+        for line in file:
+            # This is what signals the end of the gene expression data
+            # section in the file
+            if line.startswith("!dataset_table_end"):
+                break
+            V = line.split("\t")
+            # Extract the values that correspond to gene expression measures
+            # and convert the strings to numbers
+            x = [float(V[i]) for i in indices]
+            X.append(x)
+            gene_names.append(V[1])
+    # Convert the Python list of lists to a Numpy array and transpose to match
+    # the Scanpy convention of storing samples in rows and variables in columns.
+    X = np.array(X).T
+    obs = pd.DataFrame({"groups": groups}, index=sample_names)
+    var = pd.DataFrame(index=gene_names)
+    return AnnData(X=X, obs=obs, var=var)
+
+
+# -------------------------------------------------------------------------------
+# Type conversion
+# -------------------------------------------------------------------------------
+
+
+def is_float(string: str) -> bool:
+    """Check whether string is float.
+
+    See also
+    --------
+    https://stackoverflow.com/questions/736043/checking-if-a-string-can-be-converted-to-float-in-python
+    """
+    try:
+        float(string)
+        return True
+    except ValueError:
+        return False
+
+
+def is_int(string: str) -> bool:
+    """Check whether string is integer."""
+    try:
+        int(string)
+        return True
+    except ValueError:
+        return False
+
+
+def convert_bool(string: str) -> tuple[bool, bool]:
+    """Check whether string is boolean."""
+    if string == "True":
+        return True, True
+    elif string == "False":
+        return True, False
+    else:
+        return False, False
+
+
+def convert_string(string: str) -> int | float | bool | str | None:
+    """Convert string to int, float or bool."""
+    if is_int(string):
+        return int(string)
+    elif is_float(string):
+        return float(string)
+    elif convert_bool(string)[0]:
+        return convert_bool(string)[1]
+    elif string == "None":
+        return None
+    else:
+        return string
+
+
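The precedence in `convert_string` is int, then float, then bool, then `None`, with anything else passed through unchanged. A few illustrative cases:

```python
from sclab.scanpy.readwrite import convert_string

assert convert_string("42") == 42
assert convert_string("3.14") == 3.14
assert convert_string("True") is True
assert convert_string("None") is None
assert convert_string("leiden") == "leiden"  # falls through unchanged
```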
+# -------------------------------------------------------------------------------
+# Helper functions for reading and writing
+# -------------------------------------------------------------------------------
+
+
+def get_used_files():
+    """Get files used by processes with name scanpy."""
+    import psutil
+
+    loop_over_scanpy_processes = (
+        proc for proc in psutil.process_iter() if proc.name() == "scanpy"
+    )
+    filenames = []
+    for proc in loop_over_scanpy_processes:
+        try:
+            flist = proc.open_files()
+            for nt in flist:
+                filenames.append(nt.path)
+        # This catches a race condition where a process ends
+        # before we can examine its files
+        except psutil.NoSuchProcess:
+            pass
+    return set(filenames)
+
+
+def _get_filename_from_key(key, ext=None) -> Path:
+    ext = settings.file_format_data if ext is None else ext
+    return settings.writedir / f"{key}.{ext}"
+
+
+def _download(url: str, path: Path):
+    try:
+        import ipywidgets  # noqa: F401
+        from tqdm.auto import tqdm
+    except ImportError:
+        from tqdm import tqdm
+
+    from urllib.error import URLError
+    from urllib.request import Request, urlopen
+
+    blocksize = 1024 * 8
+    blocknum = 0
+
+    try:
+        req = Request(url, headers={"User-agent": "scanpy-user"})
+
+        try:
+            open_url = urlopen(req)
+        except URLError:
+            logg.warning(
+                "Failed to open the url with default certificates, trying with certifi."
+            )
+
+            from ssl import create_default_context
+
+            from certifi import where
+
+            open_url = urlopen(req, context=create_default_context(cafile=where()))
+
+        with open_url as resp:
+            total = resp.info().get("content-length", None)
+            with (
+                tqdm(
+                    unit="B",
+                    unit_scale=True,
+                    miniters=1,
+                    unit_divisor=1024,
+                    total=total if total is None else int(total),
+                ) as t,
+                path.open("wb") as f,
+            ):
+                block = resp.read(blocksize)
+                while block:
+                    f.write(block)
+                    blocknum += 1
+                    t.update(len(block))
+                    block = resp.read(blocksize)
+
+    except (KeyboardInterrupt, Exception):
+        # Make sure file doesn't exist half-downloaded
+        if path.is_file():
+            path.unlink()
+        raise
+
+
+def _check_datafile_present_and_download(path, backup_url=None):
+    """Check whether the file is present, otherwise download."""
+    path = Path(path)
+    if path.is_file():
+        return True
+    if backup_url is None:
+        return False
+    logg.info(
+        f"try downloading from url\n{backup_url}\n"
+        "... this may take a while but only happens once"
+    )
+    if not path.parent.is_dir():
+        logg.info(f"creating directory {path.parent}/ for saving data")
+        path.parent.mkdir(parents=True)
+
+    _download(backup_url, path)
+    return True
+
+
+def is_valid_filename(filename: Path, *, return_ext: bool = False):
+    """Check whether the argument is a filename."""
+    ext = filename.suffixes
+
+    if len(ext) > 2:
+        logg.warning(
+            f"Your filename has more than two extensions: {ext}.\n"
+            f"Only considering the two last: {ext[-2:]}."
+        )
+        ext = ext[-2:]
+
+    # cases for gzipped/bzipped text files
+    if len(ext) == 2 and ext[0][1:] in text_exts and ext[1][1:] in ("gz", "bz2"):
+        return ext[0][1:] if return_ext else True
+    elif ext and ext[-1][1:] in avail_exts:
+        return ext[-1][1:] if return_ext else True
+    elif "".join(ext) == ".soft.gz":
+        return "soft.gz" if return_ext else True
+    elif "".join(ext) == ".mtx.gz":
+        return "mtx.gz" if return_ext else True
+    elif not return_ext:
+        return False
+    raise ValueError(
+        f"""\
+{filename!r} does not end on a valid extension.
+Please, provide one of the available extensions.
+{avail_exts}
+Text files with .gz and .bz2 extensions are also supported.\
+"""
+    )
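Finally, a few illustrative cases for `is_valid_filename`, which drives the extension dispatch in `read`/`_read` above; compressed text files report their inner extension, while `.mtx.gz` and `.soft.gz` are treated as formats of their own:

```python
from pathlib import Path
from sclab.scanpy.readwrite import is_valid_filename

assert is_valid_filename(Path("counts.csv.gz"), return_ext=True) == "csv"
assert is_valid_filename(Path("matrix.mtx.gz"), return_ext=True) == "mtx.gz"
assert is_valid_filename(Path("data.h5ad"))
assert not is_valid_filename(Path("notes.docx"))
```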