sclab 0.1.8__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sclab might be problematic.

@@ -0,0 +1,1108 @@
+"""Reading and Writing"""
+
+from __future__ import annotations
+
+import json
+from enum import Enum
+from pathlib import Path, PurePath
+from typing import TYPE_CHECKING
+
+import anndata.utils
+import h5py
+import numpy as np
+import pandas as pd
+from packaging.version import Version
+
+if Version(anndata.__version__) >= Version("0.11.0rc2"):
+    from anndata.io import (
+        read_csv,
+        read_excel,
+        read_h5ad,
+        read_hdf,
+        read_loom,
+        read_mtx,
+        read_text,
+    )
+else:
+    from anndata import (
+        read_csv,
+        read_excel,
+        read_h5ad,
+        read_hdf,
+        read_loom,
+        read_mtx,
+        read_text,
+    )
+from anndata import AnnData
+from matplotlib.image import imread
+
+from . import logging as logg
+from ._compat import old_positionals
+from ._settings import settings
+
+
+class Empty(Enum):
+    token = 0
+
+    def __repr__(self) -> str:
+        return "_empty"
+
+
+_empty = Empty.token
+
+
+if TYPE_CHECKING:
+    from typing import BinaryIO, Literal
+
+# from ._utils import Empty
+
+# .gz and .bz2 suffixes are also allowed for text formats
+text_exts = {
+    "csv",
+    "tsv",
+    "tab",
+    "data",
+    "txt",  # tsv, tab, data, and txt are all equivalent
+}
+avail_exts = {
+    "anndata",
+    "xlsx",
+    "h5",
+    "h5ad",
+    "mtx",
+    "mtx.gz",
+    "soft.gz",
+    "loom",
+} | text_exts
+"""Available file formats for reading data."""
+
+
+# --------------------------------------------------------------------------------
+# Reading and Writing data files and AnnData objects
+# --------------------------------------------------------------------------------
+
+
+@old_positionals(
+    "sheet",
+    "ext",
+    "delimiter",
+    "first_column_names",
+    "backup_url",
+    "cache",
+    "cache_compression",
+)
+def read(
+    filename: Path | str,
+    backed: Literal["r", "r+"] | None = None,
+    *,
+    sheet: str | None = None,
+    ext: str | None = None,
+    delimiter: str | None = None,
+    first_column_names: bool = False,
+    backup_url: str | None = None,
+    cache: bool = False,
+    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
+    **kwargs,
+) -> AnnData:
+    """\
+    Read file and return :class:`~anndata.AnnData` object.
+
+    To speed up reading, consider passing ``cache=True``, which creates an hdf5
+    cache file.
+
+    Parameters
+    ----------
+    filename
+        If the filename has no file extension, it is interpreted as a key for
+        generating a filename via ``sc.settings.writedir / (filename +
+        sc.settings.file_format_data)``. This is the same behavior as in
+        ``sc.read(filename, ...)``.
+    backed
+        If ``'r'``, load :class:`~anndata.AnnData` in ``backed`` mode instead
+        of fully loading it into memory (`memory` mode). If you want to modify
+        backed attributes of the AnnData object, you need to choose ``'r+'``.
+    sheet
+        Name of sheet/table in hdf5 or Excel file.
+    ext
+        Extension that indicates the file type. If ``None``, uses extension of
+        filename.
+    delimiter
+        Delimiter that separates data within text file. If ``None``, will split at
+        arbitrary number of white spaces, which is different from enforcing
+        splitting at any single white space ``' '``.
+    first_column_names
+        Assume the first column stores row names. This is only necessary if
+        these are not strings: strings in the first column are automatically
+        assumed to be row names.
+    backup_url
+        Retrieve the file from a URL if not present on disk.
+    cache
+        If `False`, read from source; if `True`, read from fast 'h5ad' cache.
+    cache_compression
+        See the h5py :ref:`dataset_compression`.
+        (Default: `settings.cache_compression`)
+    kwargs
+        Parameters passed to :func:`~anndata.io.read_loom`.
+
+    Returns
+    -------
+    An :class:`~anndata.AnnData` object
+    """
+    filename = Path(filename)  # allow passing strings
+    if is_valid_filename(filename):
+        return _read(
+            filename,
+            backed=backed,
+            sheet=sheet,
+            ext=ext,
+            delimiter=delimiter,
+            first_column_names=first_column_names,
+            backup_url=backup_url,
+            cache=cache,
+            cache_compression=cache_compression,
+            **kwargs,
+        )
+    # generate filename and read to dict
+    filekey = str(filename)
+    filename = settings.writedir / (filekey + "." + settings.file_format_data)
+    if not filename.exists():
+        raise ValueError(
+            f"Reading with filekey {filekey!r} failed, "
+            f"the inferred filename {filename!r} does not exist. "
+            "If you intended to provide a filename, either use a filename "
+            f"ending on one of the available extensions {avail_exts} "
+            "or pass the parameter `ext`."
+        )
+    return read_h5ad(filename, backed=backed)
+
+
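For orientation, a minimal usage sketch of `read`. This assumes the module is importable as `sclab.readwrite` and that `settings.file_format_data` keeps scanpy's default of `"h5ad"`; the file paths are hypothetical:

    from sclab.readwrite import read

    # A filename with a recognized extension is read directly; cache=True
    # additionally writes an .h5ad cache under settings.cachedir.
    adata = read("counts.csv", first_column_names=True, cache=True)

    # A bare key is resolved to settings.writedir / "mydata.h5ad" and must
    # already exist there, otherwise a ValueError is raised.
    adata = read("mydata")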
+@old_positionals("genome", "gex_only", "backup_url")
+def read_10x_h5(
+    filename: Path | str,
+    *,
+    genome: str | None = None,
+    gex_only: bool = True,
+    backup_url: str | None = None,
+) -> AnnData:
+    """\
+    Read 10x-Genomics-formatted hdf5 file.
+
+    Parameters
+    ----------
+    filename
+        Path to a 10x hdf5 file.
+    genome
+        Filter expression to genes within this genome. For legacy 10x h5
+        files, this must be provided if the data contains more than one genome.
+    gex_only
+        Only keep 'Gene Expression' data and ignore other feature types,
+        e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'.
+    backup_url
+        Retrieve the file from a URL if not present on disk.
+
+    Returns
+    -------
+    Annotated data matrix, where observations/cells are named by their
+    barcode and variables/genes by gene name. Stores the following information:
+
+    :attr:`~anndata.AnnData.X`
+        The data matrix is stored
+    :attr:`~anndata.AnnData.obs_names`
+        Cell names
+    :attr:`~anndata.AnnData.var_names`
+        Gene names for a feature barcode matrix, probe names for a probe barcode matrix
+    :attr:`~anndata.AnnData.var`\\ `['gene_ids']`
+        Gene IDs
+    :attr:`~anndata.AnnData.var`\\ `['feature_types']`
+        Feature types
+    :attr:`~anndata.AnnData.obs`\\ `['filtered_barcodes']`
+        Filtered barcodes if present in the matrix
+    :attr:`~anndata.AnnData.var`
+        Any additional metadata present in /matrix/features is read in.
+    """
+    start = logg.info(f"reading {filename}")
+    is_present = _check_datafile_present_and_download(filename, backup_url=backup_url)
+    if not is_present:
+        logg.debug(f"... did not find original file {filename}")
+    with h5py.File(str(filename), "r") as f:
+        v3 = "/matrix" in f
+    if v3:
+        adata = _read_v3_10x_h5(filename, start=start)
+        if genome:
+            if genome not in adata.var["genome"].values:
+                raise ValueError(
+                    f"Could not find data corresponding to genome '{genome}' in '{filename}'. "
+                    f"Available genomes are: {list(adata.var['genome'].unique())}."
+                )
+            adata = adata[:, adata.var["genome"] == genome]
+        if gex_only:
+            adata = adata[:, adata.var["feature_types"] == "Gene Expression"]
+        if adata.is_view:
+            adata = adata.copy()
+    else:
+        adata = _read_legacy_10x_h5(filename, genome=genome, start=start)
+    return adata
+
+
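A short usage sketch for `read_10x_h5` (the import path, file names, and genome label are hypothetical):

    from sclab.readwrite import read_10x_h5

    # Cell Ranger v3+ file: keeps only 'Gene Expression' features by default.
    adata = read_10x_h5("filtered_feature_bc_matrix.h5")

    # Legacy (v2) file containing several genomes: the genome must be named.
    adata = read_10x_h5("legacy_matrix.h5", genome="GRCh38")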
+def _read_legacy_10x_h5(filename, *, genome=None, start=None):
+    """
+    Read hdf5 file from Cell Ranger v2 or earlier versions.
+    """
+    with h5py.File(str(filename), "r") as f:
+        try:
+            children = list(f.keys())
+            if not genome:
+                if len(children) > 1:
+                    raise ValueError(
+                        f"'{filename}' contains more than one genome. For legacy 10x h5 "
+                        "files you must specify the genome if more than one is present. "
+                        f"Available genomes are: {children}"
+                    )
+                genome = children[0]
+            elif genome not in children:
+                raise ValueError(
+                    f"Could not find genome '{genome}' in '{filename}'. "
+                    f"Available genomes are: {children}"
+                )
+
+            dsets = {}
+            _collect_datasets(dsets, f[genome])
+
+            # AnnData works with csr matrices
+            # 10x stores the transposed data, so we do the transposition right away
+            from scipy.sparse import csr_matrix
+
+            M, N = dsets["shape"]
+            data = dsets["data"]
+            if dsets["data"].dtype == np.dtype("int32"):
+                data = dsets["data"].view("float32")
+                data[:] = dsets["data"]
+            matrix = csr_matrix(
+                (data, dsets["indices"], dsets["indptr"]),
+                shape=(N, M),
+            )
+            # reading the CSC-ordered arrays as CSR already yields the transposed
+            # matrix scanpy expects, so no further transposition is needed
+            adata = AnnData(
+                matrix,
+                obs=dict(obs_names=dsets["barcodes"].astype(str)),
+                var=dict(
+                    var_names=dsets["gene_names"].astype(str),
+                    gene_ids=dsets["genes"].astype(str),
+                ),
+            )
+            logg.info("", time=start)
+            return adata
+        except KeyError:
+            raise Exception("File is missing one or more required datasets.")
+
+
+def _collect_datasets(dsets: dict, group: h5py.Group):
+    for k, v in group.items():
+        if isinstance(v, h5py.Dataset):
+            dsets[k] = v[()]
+        else:
+            _collect_datasets(dsets, v)
+
+
+def _read_v3_10x_h5(filename, *, start=None):
+    """
+    Read hdf5 file from Cell Ranger v3 or later versions.
+    """
+    with h5py.File(str(filename), "r") as f:
+        try:
+            dsets = {}
+            _collect_datasets(dsets, f["matrix"])
+
+            from scipy.sparse import csr_matrix
+
+            M, N = dsets["shape"]
+            data = dsets["data"]
+            if dsets["data"].dtype == np.dtype("int32"):
+                data = dsets["data"].view("float32")
+                data[:] = dsets["data"]
+            matrix = csr_matrix(
+                (data, dsets["indices"], dsets["indptr"]),
+                shape=(N, M),
+            )
+            obs_dict = {"obs_names": dsets["barcodes"].astype(str)}
+            var_dict = {"var_names": dsets["name"].astype(str)}
+
+            if "gene_id" not in dsets:
+                # Read metadata specific to a feature-barcode matrix
+                var_dict["gene_ids"] = dsets["id"].astype(str)
+            else:
+                # Read metadata specific to a probe-barcode matrix
+                var_dict.update(
+                    {
+                        "gene_ids": dsets["gene_id"].astype(str),
+                        "probe_ids": dsets["id"].astype(str),
+                    }
+                )
+            var_dict["feature_types"] = dsets["feature_type"].astype(str)
+            if "filtered_barcodes" in f["matrix"]:
+                obs_dict["filtered_barcodes"] = dsets["filtered_barcodes"].astype(bool)
+
+            if "features" in f["matrix"]:
+                var_dict.update(
+                    (
+                        feature_metadata_name,
+                        dsets[feature_metadata_name].astype(
+                            bool if feature_metadata_item.dtype.kind == "b" else str
+                        ),
+                    )
+                    for feature_metadata_name, feature_metadata_item in f["matrix"][
+                        "features"
+                    ].items()
+                    if isinstance(feature_metadata_item, h5py.Dataset)
+                    and feature_metadata_name
+                    not in [
+                        "name",
+                        "feature_type",
+                        "id",
+                        "gene_id",
+                        "_all_tag_keys",
+                    ]
+                )
+            else:
+                raise ValueError("10x h5 has no features group")
+            adata = AnnData(
+                matrix,
+                obs=obs_dict,
+                var=var_dict,
+            )
+            logg.info("", time=start)
+            return adata
+        except KeyError:
+            raise Exception("File is missing one or more required datasets.")
+
+
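Both h5 readers above share two idioms worth spelling out: the in-place int32-to-float32 conversion that reuses the data buffer, and building the cells-by-genes matrix without an explicit transpose. A self-contained sketch with toy numbers (not from any real file):

    import numpy as np
    from scipy.sparse import csr_matrix

    # Trick 1: reinterpret the int32 buffer as float32 (no copy), then write
    # the converted values back into the same memory -- mirrors the readers.
    counts = np.array([1, 2, 3], dtype="int32")
    data = counts.view("float32")
    data[:] = counts

    # Trick 2: 10x stores genes x cells (M x N) in CSC layout; reading the
    # same (data, indices, indptr) arrays as CSR with shape (N, M) yields
    # the cells x genes matrix directly.
    M, N = 3, 2
    indices = np.array([0, 2, 1])
    indptr = np.array([0, 2, 3])
    matrix = csr_matrix((data, indices, indptr), shape=(N, M))
    assert matrix.shape == (2, 3)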
+def read_visium(
+    path: Path | str,
+    genome: str | None = None,
+    *,
+    count_file: str = "filtered_feature_bc_matrix.h5",
+    library_id: str | None = None,
+    load_images: bool | None = True,
+    source_image_path: Path | str | None = None,
+) -> AnnData:
+    """\
+    Read 10x-Genomics-formatted Visium dataset.
+
+    In addition to reading regular 10x output,
+    this looks for the `spatial` folder and loads images,
+    coordinates and scale factors.
+    Based on the `Space Ranger output docs`_.
+
+    See :func:`~scanpy.pl.spatial` for a compatible plotting function.
+
+    .. _Space Ranger output docs: https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/overview
+
+    Parameters
+    ----------
+    path
+        Path to directory for visium datafiles.
+    genome
+        Filter expression to genes within this genome.
+    count_file
+        Which file in the passed directory to use as the count file. Typically would be one of:
+        'filtered_feature_bc_matrix.h5' or 'raw_feature_bc_matrix.h5'.
+    library_id
+        Identifier for the visium library. Can be modified when concatenating multiple adata objects.
+    load_images
+        Whether to also read the images, scale factors and spot coordinates
+        from the `spatial` folder.
+    source_image_path
+        Path to the high-resolution tissue image. Path will be included in
+        `.uns["spatial"][library_id]["metadata"]["source_image_path"]`.
+
+    Returns
+    -------
+    Annotated data matrix, where observations/cells are named by their
+    barcode and variables/genes by gene name. Stores the following information:
+
+    :attr:`~anndata.AnnData.X`
+        The data matrix is stored
+    :attr:`~anndata.AnnData.obs_names`
+        Cell names
+    :attr:`~anndata.AnnData.var_names`
+        Gene names for a feature barcode matrix, probe names for a probe barcode matrix
+    :attr:`~anndata.AnnData.var`\\ `['gene_ids']`
+        Gene IDs
+    :attr:`~anndata.AnnData.var`\\ `['feature_types']`
+        Feature types
+    :attr:`~anndata.AnnData.obs`\\ `['filtered_barcodes']`
+        Filtered barcodes if present in the matrix
+    :attr:`~anndata.AnnData.var`
+        Any additional metadata present in /matrix/features is read in.
+    :attr:`~anndata.AnnData.uns`\\ `['spatial']`
+        Dict of spaceranger output files with 'library_id' as key
+    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['images']`
+        Dict of images (`'hires'` and `'lowres'`)
+    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['scalefactors']`
+        Scale factors for the spots
+    :attr:`~anndata.AnnData.uns`\\ `['spatial'][library_id]['metadata']`
+        Files metadata: 'chemistry_description', 'software_version', 'source_image_path'
+    :attr:`~anndata.AnnData.obsm`\\ `['spatial']`
+        Spatial spot coordinates, usable as `basis` by :func:`~scanpy.pl.embedding`.
+    """
+    path = Path(path)
+    adata = read_10x_h5(path / count_file, genome=genome)
+
+    adata.uns["spatial"] = dict()
+
+    from h5py import File
+
+    with File(path / count_file, mode="r") as f:
+        attrs = dict(f.attrs)
+    if library_id is None:
+        library_id = str(attrs.pop("library_ids")[0], "utf-8")
+
+    adata.uns["spatial"][library_id] = dict()
+
+    if load_images:
+        tissue_positions_file = (
+            path / "spatial/tissue_positions.csv"
+            if (path / "spatial/tissue_positions.csv").exists()
+            else path / "spatial/tissue_positions_list.csv"
+        )
+        files = dict(
+            tissue_positions_file=tissue_positions_file,
+            scalefactors_json_file=path / "spatial/scalefactors_json.json",
+            hires_image=path / "spatial/tissue_hires_image.png",
+            lowres_image=path / "spatial/tissue_lowres_image.png",
+        )
+
+        # check if files exist; continue if images are missing
+        for f in files.values():
+            if not f.exists():
+                if any(x in str(f) for x in ["hires_image", "lowres_image"]):
+                    logg.warning(
+                        f"You seem to be missing an image file.\nCould not find '{f}'."
+                    )
+                else:
+                    raise OSError(f"Could not find '{f}'")
+
+        adata.uns["spatial"][library_id]["images"] = dict()
+        for res in ["hires", "lowres"]:
+            try:
+                adata.uns["spatial"][library_id]["images"][res] = imread(
+                    str(files[f"{res}_image"])
+                )
+            except Exception:
+                raise OSError(f"Could not find '{res}_image'")
+
+        # read json scalefactors
+        adata.uns["spatial"][library_id]["scalefactors"] = json.loads(
+            files["scalefactors_json_file"].read_bytes()
+        )
+
+        adata.uns["spatial"][library_id]["metadata"] = {
+            k: (str(attrs[k], "utf-8") if isinstance(attrs[k], bytes) else attrs[k])
+            for k in ("chemistry_description", "software_version")
+            if k in attrs
+        }
+
+        # read coordinates
+        positions = pd.read_csv(
+            files["tissue_positions_file"],
+            header=0 if tissue_positions_file.name == "tissue_positions.csv" else None,
+            index_col=0,
+        )
+        positions.columns = [
+            "in_tissue",
+            "array_row",
+            "array_col",
+            "pxl_col_in_fullres",
+            "pxl_row_in_fullres",
+        ]
+
+        adata.obs = adata.obs.join(positions, how="left")
+
+        adata.obsm["spatial"] = adata.obs[
+            ["pxl_row_in_fullres", "pxl_col_in_fullres"]
+        ].to_numpy()
+        adata.obs.drop(
+            columns=["pxl_row_in_fullres", "pxl_col_in_fullres"],
+            inplace=True,
+        )
+
+        # put image path in uns
+        if source_image_path is not None:
+            # get an absolute path
+            source_image_path = str(Path(source_image_path).resolve())
+            adata.uns["spatial"][library_id]["metadata"]["source_image_path"] = str(
+                source_image_path
+            )
+
+    return adata
+
+
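A usage sketch for `read_visium`, assuming a standard Space Ranger `outs/` folder; the path and library id are hypothetical:

    from sclab.readwrite import read_visium

    # expects <path>/filtered_feature_bc_matrix.h5 plus a spatial/ folder
    adata = read_visium("outs/", library_id="sample_A")

    coords = adata.obsm["spatial"]  # spot coordinates in full-resolution pixels
    scalef = adata.uns["spatial"]["sample_A"]["scalefactors"]["tissue_hires_scalef"]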
+@old_positionals("var_names", "make_unique", "cache", "cache_compression", "gex_only")
+def read_10x_mtx(
+    path: Path | str,
+    *,
+    var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols",
+    make_unique: bool = True,
+    cache: bool = False,
+    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
+    gex_only: bool = True,
+    prefix: str | None = None,
+) -> AnnData:
+    """\
+    Read 10x-Genomics-formatted mtx directory.
+
+    Parameters
+    ----------
+    path
+        Path to directory for `.mtx` and `.tsv` files,
+        e.g. './filtered_gene_bc_matrices/hg19/'.
+    var_names
+        The variables index.
+    make_unique
+        Whether to make the variables index unique by appending '-1',
+        '-2' etc. or not.
+    cache
+        If `False`, read from source; if `True`, read from fast 'h5ad' cache.
+    cache_compression
+        See the h5py :ref:`dataset_compression`.
+        (Default: `settings.cache_compression`)
+    gex_only
+        Only keep 'Gene Expression' data and ignore other feature types,
+        e.g. 'Antibody Capture', 'CRISPR Guide Capture', or 'Custom'.
+    prefix
+        Any prefix before `matrix.mtx`, `genes.tsv` and `barcodes.tsv`. For instance,
+        if the files are named `patientA_matrix.mtx`, `patientA_genes.tsv` and
+        `patientA_barcodes.tsv` the prefix is `patientA_`.
+        (Default: no prefix)
+
+    Returns
+    -------
+    An :class:`~anndata.AnnData` object
+    """
+    path = Path(path)
+    prefix = "" if prefix is None else prefix
+    is_legacy = (path / f"{prefix}genes.tsv").is_file()
+    adata = _read_10x_mtx(
+        path,
+        var_names=var_names,
+        make_unique=make_unique,
+        cache=cache,
+        cache_compression=cache_compression,
+        prefix=prefix,
+        is_legacy=is_legacy,
+    )
+    if is_legacy or not gex_only:
+        return adata
+    gex_rows = adata.var["feature_types"] == "Gene Expression"
+    return adata[:, gex_rows].copy()
+
+
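A usage sketch for `read_10x_mtx` (directory names and prefix are hypothetical):

    from sclab.readwrite import read_10x_mtx

    # v3+ layout: matrix.mtx.gz, features.tsv.gz, barcodes.tsv.gz
    adata = read_10x_mtx("filtered_feature_bc_matrix/", var_names="gene_ids")

    # files named patientA_matrix.mtx etc. (legacy layout) need a prefix
    adata = read_10x_mtx("data/", prefix="patientA_")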
+def _read_10x_mtx(
+    path: Path,
+    *,
+    var_names: Literal["gene_symbols", "gene_ids"] = "gene_symbols",
+    make_unique: bool = True,
+    cache: bool = False,
+    cache_compression: Literal["gzip", "lzf"] | None | Empty = _empty,
+    prefix: str = "",
+    is_legacy: bool,
+) -> AnnData:
+    """
+    Read MEX output from Cell Ranger v2 (and earlier) or v3 (and later).
+    """
+    suffix = "" if is_legacy else ".gz"
+    adata = read(
+        path / f"{prefix}matrix.mtx{suffix}",
+        cache=cache,
+        cache_compression=cache_compression,
+    ).T  # transpose the data
+    genes = pd.read_csv(
+        path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
+        header=None,
+        sep="\t",
+    )
+    if var_names == "gene_symbols":
+        var_names_idx = pd.Index(genes[1].values)
+        if make_unique:
+            var_names_idx = anndata.utils.make_index_unique(var_names_idx)
+        adata.var_names = var_names_idx
+        adata.var["gene_ids"] = genes[0].values
+    elif var_names == "gene_ids":
+        adata.var_names = genes[0].values
+        adata.var["gene_symbols"] = genes[1].values
+    else:
+        raise ValueError("`var_names` needs to be 'gene_symbols' or 'gene_ids'")
+    if not is_legacy:
+        adata.var["feature_types"] = genes[2].values
+    barcodes = pd.read_csv(path / f"{prefix}barcodes.tsv{suffix}", header=None)
+    adata.obs_names = barcodes[0].values
+    return adata
+
+
+@old_positionals("ext", "compression", "compression_opts")
+def write(
+    filename: Path | str,
+    adata: AnnData,
+    *,
+    ext: Literal["h5", "csv", "txt", "npz"] | None = None,
+    compression: Literal["gzip", "lzf"] | None = "gzip",
+    compression_opts: int | None = None,
+):
+    """\
+    Write :class:`~anndata.AnnData` objects to file.
+
+    Parameters
+    ----------
+    filename
+        If the filename has no file extension, it is interpreted as a key for
+        generating a filename via `sc.settings.writedir / (filename +
+        sc.settings.file_format_data)`. This is the same behavior as in
+        :func:`~scanpy.read`.
+    adata
+        Annotated data matrix.
+    ext
+        File extension from which to infer file format. If `None`, defaults to
+        `sc.settings.file_format_data`.
+    compression
+        See https://docs.h5py.org/en/latest/high/dataset.html.
+    compression_opts
+        See https://docs.h5py.org/en/latest/high/dataset.html.
+    """
+    filename = Path(filename)  # allow passing strings
+    if is_valid_filename(filename):
+        ext_ = is_valid_filename(filename, return_ext=True)
+        if ext is None:
+            ext = ext_
+        elif ext != ext_:
+            raise ValueError(
+                "It suffices to provide the file type by "
+                "providing a proper extension to the filename. "
+                'One of "txt", "csv", "h5" or "npz".'
+            )
+    else:
+        key = filename
+        ext = settings.file_format_data if ext is None else ext
+        filename = _get_filename_from_key(key, ext)
+    if ext == "csv":
+        adata.write_csvs(filename)
+    else:
+        adata.write(
+            filename, compression=compression, compression_opts=compression_opts
+        )
+
+
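A usage sketch for `write`, mirroring the key-vs-filename logic of `read` (paths hypothetical):

    from sclab.readwrite import write

    # extension on the filename selects the format
    write("results/processed.h5ad", adata)

    # a bare key goes to settings.writedir with settings.file_format_data as
    # the extension; ext="csv" instead dispatches to adata.write_csvs
    write("processed", adata, ext="csv")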
+# -------------------------------------------------------------------------------
+# Reading and writing parameter files
+# -------------------------------------------------------------------------------
+
+
+@old_positionals("as_header")
+def read_params(
+    filename: Path | str, *, as_header: bool = False
+) -> dict[str, int | float | bool | str | None]:
+    """\
+    Read parameter dictionary from text file.
+
+    Assumes that parameters are specified in the format::
+
+        par1 = value1
+        par2 = value2
+
+    Comments that start with '#' are allowed.
+
+    Parameters
+    ----------
+    filename
+        Filename of data file.
+    as_header
+        Read the dictionary from the header (comment section) of a file.
+
+    Returns
+    -------
+    Dictionary that stores parameters.
+    """
+    filename = Path(filename)  # allow passing str objects
+    from collections import OrderedDict
+
+    params = OrderedDict()
+    for line in filename.open():
+        if "=" in line and (not as_header or line.startswith("#")):
+            line = line[1:] if line.startswith("#") else line
+            key, val = line.split("=", 1)  # values may themselves contain '='
+            key = key.strip()
+            val = val.strip()
+            params[key] = convert_string(val)
+    return params
+
+
+def write_params(path: Path | str, *args, **maps):
+    """\
+    Write parameters to file, so that it's readable by read_params.
+
+    Uses INI file format.
+    """
+    path = Path(path)
+    if not path.parent.is_dir():
+        path.parent.mkdir(parents=True)
+    if len(args) == 1:
+        maps[None] = args[0]
+    with path.open("w") as f:
+        for header, map in maps.items():
+            if header is not None:
+                f.write(f"[{header}]\n")
+            for key, val in map.items():
+                f.write(f"{key} = {val}\n")
+
+
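A round trip through the two helpers (the file name is hypothetical); values come back typed via `convert_string`, defined further below:

    from sclab.readwrite import read_params, write_params

    write_params("run.params", {"n_pcs": 50, "resolution": 0.8, "use_rep": None})
    params = read_params("run.params")
    assert params["n_pcs"] == 50
    assert params["resolution"] == 0.8
    assert params["use_rep"] is None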
+# -------------------------------------------------------------------------------
+# Reading and Writing data files
+# -------------------------------------------------------------------------------
+
+
+def _read(
+    filename: Path,
+    *,
+    backed=None,
+    sheet=None,
+    ext=None,
+    delimiter=None,
+    first_column_names=None,
+    backup_url=None,
+    cache=False,
+    cache_compression=None,
+    suppress_cache_warning=False,
+    **kwargs,
+):
+    if ext is not None and ext not in avail_exts:
+        raise ValueError(
+            f"Please provide one of the available extensions.\n{avail_exts}"
+        )
+    elif ext is None:
+        ext = is_valid_filename(filename, return_ext=True)
+    is_present = _check_datafile_present_and_download(filename, backup_url=backup_url)
+    if not is_present:
+        logg.debug(f"... did not find original file {filename}")
+    # read hdf5 files
+    if ext in {"h5", "h5ad"}:
+        if sheet is None:
+            return read_h5ad(filename, backed=backed)
+        else:
+            logg.debug(f"reading sheet {sheet} from file {filename}")
+            return read_hdf(filename, sheet)
+    # read other file types
+    path_cache: Path = settings.cachedir / _slugify(filename).replace(
+        f".{ext}", ".h5ad"
+    )
+    if path_cache.suffix in {".gz", ".bz2"}:
+        path_cache = path_cache.with_suffix("")
+    if cache and path_cache.is_file():
+        logg.info(f"... reading from cache file {path_cache}")
+        return read_h5ad(path_cache)
+
+    if not is_present:
+        raise FileNotFoundError(f"Did not find file {filename}.")
+    logg.debug(f"reading {filename}")
+    if not cache and not suppress_cache_warning:
+        logg.hint(
+            "This might be very slow. Consider passing `cache=True`, "
+            "which enables much faster reading from a cache file."
+        )
+    # do the actual reading
+    if ext == "xlsx" or ext == "xls":
+        if sheet is None:
+            raise ValueError("Provide `sheet` parameter when reading '.xlsx' files.")
+        else:
+            adata = read_excel(filename, sheet)
+    elif ext in {"mtx", "mtx.gz"}:
+        adata = read_mtx(filename)
+    elif ext == "csv":
+        if delimiter is None:
+            delimiter = ","
+        adata = read_csv(
+            filename, first_column_names=first_column_names, delimiter=delimiter
+        )
+    elif ext in {"txt", "tab", "data", "tsv"}:
+        if ext == "data":
+            logg.hint(
+                "... assuming '.data' means tab or white-space separated text file",
+            )
+            logg.hint("change this by passing `ext` to sc.read")
+        adata = read_text(filename, delimiter, first_column_names)
+    elif ext == "soft.gz":
+        adata = _read_softgz(filename)
+    elif ext == "loom":
+        adata = read_loom(filename=filename, **kwargs)
+    else:
+        raise ValueError(f"Unknown extension {ext}.")
+    if cache:
+        logg.info(
+            f"... writing an {settings.file_format_data} "
+            "cache file to speed up reading next time"
+        )
+        if cache_compression is _empty:
+            cache_compression = settings.cache_compression
+        if not path_cache.parent.is_dir():
+            path_cache.parent.mkdir(parents=True)
+        # write for faster reading when calling the next time
+        adata.write(path_cache, compression=cache_compression)
+    return adata
+
+
+def _slugify(path: str | PurePath) -> str:
+    """Make a path into a filename."""
+    if not isinstance(path, PurePath):
+        path = PurePath(path)
+    parts = list(path.parts)
+    if parts[0] == "/":
+        parts.pop(0)
+    elif len(parts[0]) == 3 and parts[0][1:] == ":\\":
+        parts[0] = parts[0][0]  # C:\ → C
+    filename = "-".join(parts)
+    assert "/" not in filename, filename
+    assert not filename[1:].startswith(":"), filename
+    return filename
+
+
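What `_slugify` does to the two path flavors it special-cases, as a doctest-style sketch:

    from pathlib import PurePosixPath, PureWindowsPath

    assert _slugify(PurePosixPath("/home/me/data/matrix.mtx")) == "home-me-data-matrix.mtx"
    assert _slugify(PureWindowsPath(r"C:\data\matrix.mtx")) == "C-data-matrix.mtx"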
+def _read_softgz(filename: str | bytes | Path | BinaryIO) -> AnnData:
+    """\
+    Read a SOFT format data file.
+
+    The SOFT format is documented here
+    https://www.ncbi.nlm.nih.gov/geo/info/soft.html.
+
+    Notes
+    -----
+    The function is based on a script by Kerby Shedden.
+    https://dept.stat.lsa.umich.edu/~kshedden/Python-Workshop/gene_expression_comparison.html
+    """
+    import gzip
+
+    with gzip.open(filename, mode="rt") as file:
+        # The header part of the file contains information about the
+        # samples. Read that information first.
+        samples_info = {}
+        for line in file:
+            if line.startswith("!dataset_table_begin"):
+                break
+            elif line.startswith("!subset_description"):
+                subset_description = line.split("=")[1].strip()
+            elif line.startswith("!subset_sample_id"):
+                subset_ids = line.split("=")[1].split(",")
+                subset_ids = [x.strip() for x in subset_ids]
+                for k in subset_ids:
+                    samples_info[k] = subset_description
+        # Next line is the column headers (sample ids)
+        sample_names = file.readline().strip().split("\t")
+        # The column indices that contain gene expression data
+        indices = [i for i, x in enumerate(sample_names) if x.startswith("GSM")]
+        # Restrict the column headers to those that we keep
+        sample_names = [sample_names[i] for i in indices]
+        # Get a list of sample labels
+        groups = [samples_info[k] for k in sample_names]
+        # Read the gene expression data as a list of lists, also get the gene
+        # identifiers
+        gene_names, X = [], []
+        for line in file:
+            # This is what signals the end of the gene expression data
+            # section in the file
+            if line.startswith("!dataset_table_end"):
+                break
+            V = line.split("\t")
+            # Extract the values that correspond to gene expression measures
+            # and convert the strings to numbers
+            x = [float(V[i]) for i in indices]
+            X.append(x)
+            gene_names.append(V[1])
+    # Convert the Python list of lists to a Numpy array and transpose to match
+    # the Scanpy convention of storing samples in rows and variables in columns.
+    X = np.array(X).T
+    obs = pd.DataFrame({"groups": groups}, index=sample_names)
+    var = pd.DataFrame(index=gene_names)
+    return AnnData(X=X, obs=obs, var=var)
+
+
+# -------------------------------------------------------------------------------
+# Type conversion
+# -------------------------------------------------------------------------------
+
+
+def is_float(string: str) -> bool:
+    """Check whether string is float.
+
+    See also
+    --------
+    https://stackoverflow.com/questions/736043/checking-if-a-string-can-be-converted-to-float-in-python
+    """
+    try:
+        float(string)
+        return True
+    except ValueError:
+        return False
+
+
+def is_int(string: str) -> bool:
+    """Check whether string is integer."""
+    try:
+        int(string)
+        return True
+    except ValueError:
+        return False
+
+
+def convert_bool(string: str) -> tuple[bool, bool]:
+    """Check whether string is boolean."""
+    if string == "True":
+        return True, True
+    elif string == "False":
+        return True, False
+    else:
+        return False, False
+
+
+def convert_string(string: str) -> int | float | bool | str | None:
+    """Convert string to int, float or bool."""
+    if is_int(string):
+        return int(string)
+    elif is_float(string):
+        return float(string)
+    elif convert_bool(string)[0]:
+        return convert_bool(string)[1]
+    elif string == "None":
+        return None
+    else:
+        return string
+
+
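The conversion order in `convert_string` matters: int is tried before float, and bool before the `"None"` sentinel. For example:

    assert convert_string("42") == 42          # int wins over float
    assert convert_string("4.2") == 4.2
    assert convert_string("False") is False
    assert convert_string("None") is None
    assert convert_string("whatever") == "whatever"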
+# -------------------------------------------------------------------------------
+# Helper functions for reading and writing
+# -------------------------------------------------------------------------------
+
+
+def get_used_files():
+    """Get files used by processes with name scanpy."""
+    import psutil
+
+    loop_over_scanpy_processes = (
+        proc for proc in psutil.process_iter() if proc.name() == "scanpy"
+    )
+    filenames = []
+    for proc in loop_over_scanpy_processes:
+        try:
+            flist = proc.open_files()
+            for nt in flist:
+                filenames.append(nt.path)
+        # This catches a race condition where a process ends
+        # before we can examine its files
+        except psutil.NoSuchProcess:
+            pass
+    return set(filenames)
+
+
+def _get_filename_from_key(key, ext=None) -> Path:
+    ext = settings.file_format_data if ext is None else ext
+    return settings.writedir / f"{key}.{ext}"
+
+
+def _download(url: str, path: Path):
+    try:
+        import ipywidgets  # noqa: F401
+        from tqdm.auto import tqdm
+    except ImportError:
+        from tqdm import tqdm
+
+    from urllib.error import URLError
+    from urllib.request import Request, urlopen
+
+    blocksize = 1024 * 8
+    blocknum = 0
+
+    try:
+        req = Request(url, headers={"User-agent": "scanpy-user"})
+
+        try:
+            open_url = urlopen(req)
+        except URLError:
+            logg.warning(
+                "Failed to open the url with default certificates, trying with certifi."
+            )
+
+            from ssl import create_default_context
+
+            from certifi import where
+
+            open_url = urlopen(req, context=create_default_context(cafile=where()))
+
+        with open_url as resp:
+            total = resp.info().get("content-length", None)
+            with (
+                tqdm(
+                    unit="B",
+                    unit_scale=True,
+                    miniters=1,
+                    unit_divisor=1024,
+                    total=total if total is None else int(total),
+                ) as t,
+                path.open("wb") as f,
+            ):
+                block = resp.read(blocksize)
+                while block:
+                    f.write(block)
+                    blocknum += 1
+                    t.update(len(block))
+                    block = resp.read(blocksize)
+
+    except (KeyboardInterrupt, Exception):
+        # Make sure file doesn’t exist half-downloaded
+        if path.is_file():
+            path.unlink()
+        raise
+
+
+def _check_datafile_present_and_download(path, backup_url=None):
+    """Check whether the file is present, otherwise download."""
+    path = Path(path)
+    if path.is_file():
+        return True
+    if backup_url is None:
+        return False
+    logg.info(
+        f"try downloading from url\n{backup_url}\n"
+        "... this may take a while but only happens once"
+    )
+    if not path.parent.is_dir():
+        logg.info(f"creating directory {path.parent}/ for saving data")
+        path.parent.mkdir(parents=True)
+
+    _download(backup_url, path)
+    return True
+
+
1079
+ def is_valid_filename(filename: Path, *, return_ext: bool = False):
1080
+ """Check whether the argument is a filename."""
1081
+ ext = filename.suffixes
1082
+
1083
+ if len(ext) > 2:
1084
+ logg.warning(
1085
+ f"Your filename has more than two extensions: {ext}.\n"
1086
+ f"Only considering the two last: {ext[-2:]}."
1087
+ )
1088
+ ext = ext[-2:]
1089
+
1090
+ # cases for gzipped/bzipped text files
1091
+ if len(ext) == 2 and ext[0][1:] in text_exts and ext[1][1:] in ("gz", "bz2"):
1092
+ return ext[0][1:] if return_ext else True
1093
+ elif ext and ext[-1][1:] in avail_exts:
1094
+ return ext[-1][1:] if return_ext else True
1095
+ elif "".join(ext) == ".soft.gz":
1096
+ return "soft.gz" if return_ext else True
1097
+ elif "".join(ext) == ".mtx.gz":
1098
+ return "mtx.gz" if return_ext else True
1099
+ elif not return_ext:
1100
+ return False
1101
+ raise ValueError(
1102
+ f"""\
1103
+ {filename!r} does not end on a valid extension.
1104
+ Please, provide one of the available extensions.
1105
+ {avail_exts}
1106
+ Text files with .gz and .bz2 extensions are also supported.\
1107
+ """
1108
+ )