lamindb 0.76.6__py3-none-any.whl → 0.76.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +113 -113
- lamindb/_artifact.py +1205 -1174
- lamindb/_can_validate.py +579 -579
- lamindb/_collection.py +387 -382
- lamindb/_curate.py +1601 -1601
- lamindb/_feature.py +155 -155
- lamindb/_feature_set.py +242 -242
- lamindb/_filter.py +23 -23
- lamindb/_finish.py +256 -256
- lamindb/_from_values.py +382 -382
- lamindb/_is_versioned.py +40 -40
- lamindb/_parents.py +476 -476
- lamindb/_query_manager.py +125 -125
- lamindb/_query_set.py +362 -362
- lamindb/_record.py +649 -649
- lamindb/_run.py +57 -57
- lamindb/_save.py +308 -295
- lamindb/_storage.py +14 -14
- lamindb/_transform.py +127 -127
- lamindb/_ulabel.py +56 -56
- lamindb/_utils.py +9 -9
- lamindb/_view.py +72 -72
- lamindb/core/__init__.py +94 -93
- lamindb/core/_context.py +574 -558
- lamindb/core/_data.py +438 -438
- lamindb/core/_feature_manager.py +867 -866
- lamindb/core/_label_manager.py +253 -252
- lamindb/core/_mapped_collection.py +597 -597
- lamindb/core/_settings.py +187 -187
- lamindb/core/_sync_git.py +138 -138
- lamindb/core/_track_environment.py +27 -27
- lamindb/core/datasets/__init__.py +59 -59
- lamindb/core/datasets/_core.py +571 -571
- lamindb/core/datasets/_fake.py +36 -36
- lamindb/core/exceptions.py +90 -77
- lamindb/core/fields.py +12 -12
- lamindb/core/loaders.py +164 -0
- lamindb/core/schema.py +56 -56
- lamindb/core/storage/__init__.py +25 -25
- lamindb/core/storage/_anndata_accessor.py +740 -740
- lamindb/core/storage/_anndata_sizes.py +41 -41
- lamindb/core/storage/_backed_access.py +98 -98
- lamindb/core/storage/_tiledbsoma.py +204 -196
- lamindb/core/storage/_valid_suffixes.py +21 -21
- lamindb/core/storage/_zarr.py +110 -110
- lamindb/core/storage/objects.py +62 -62
- lamindb/core/storage/paths.py +172 -245
- lamindb/core/subsettings/__init__.py +12 -12
- lamindb/core/subsettings/_creation_settings.py +38 -38
- lamindb/core/subsettings/_transform_settings.py +21 -21
- lamindb/core/types.py +19 -19
- lamindb/core/versioning.py +158 -158
- lamindb/integrations/__init__.py +12 -12
- lamindb/integrations/_vitessce.py +107 -107
- lamindb/setup/__init__.py +14 -14
- lamindb/setup/core/__init__.py +4 -4
- {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
- {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/METADATA +5 -5
- lamindb-0.76.8.dist-info/RECORD +60 -0
- {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
- lamindb-0.76.6.dist-info/RECORD +0 -59
lamindb/core/datasets/_core.py
CHANGED
@@ -1,571 +1,571 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from pathlib import Path
|
4
|
-
from typing import TYPE_CHECKING
|
5
|
-
from urllib.request import urlretrieve
|
6
|
-
|
7
|
-
import anndata as ad
|
8
|
-
import numpy as np
|
9
|
-
import pandas as pd
|
10
|
-
from lnschema_core import ids
|
11
|
-
from upath import UPath
|
12
|
-
|
13
|
-
from lamindb.core._settings import settings
|
14
|
-
|
15
|
-
if TYPE_CHECKING:
|
16
|
-
from mudata import MuData
|
17
|
-
|
18
|
-
|
19
|
-
def file_fcs() -> Path:
|
20
|
-
"""Example FCS artifact."""
|
21
|
-
filepath, _ = urlretrieve(
|
22
|
-
"https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
|
23
|
-
)
|
24
|
-
return Path(filepath)
|
25
|
-
|
26
|
-
|
27
|
-
def file_fcs_alpert19(populate_registries: bool = False) -> Path: # pragma: no cover
|
28
|
-
"""FCS file from Alpert19.
|
29
|
-
|
30
|
-
Args:
|
31
|
-
populate_registries: pre-populate metadata records to simulate existing registries # noqa
|
32
|
-
"""
|
33
|
-
filepath, _ = urlretrieve(
|
34
|
-
"https://lamindb-test.s3.amazonaws.com/Alpert19-070314-Mike-Study+15-2013-plate+1-15-004-1-13_cells_found.fcs",
|
35
|
-
"Alpert19.fcs",
|
36
|
-
)
|
37
|
-
if populate_registries:
|
38
|
-
import bionty as bt
|
39
|
-
import readfcs
|
40
|
-
|
41
|
-
import lamindb as ln
|
42
|
-
|
43
|
-
verbosity = ln.settings.verbosity
|
44
|
-
ln.settings.verbosity = "error"
|
45
|
-
adata = readfcs.read(filepath)
|
46
|
-
std = bt.CellMarker.public().standardize(adata.var.index)
|
47
|
-
ln.save(
|
48
|
-
bt.CellMarker.from_values(
|
49
|
-
bt.CellMarker.public().inspect(std, "name").validated, "name"
|
50
|
-
)
|
51
|
-
)
|
52
|
-
ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
|
53
|
-
ln.Feature(name="organism", dtype=[bt.Organism]).save()
|
54
|
-
ln.settings.verbosity = verbosity
|
55
|
-
return Path(filepath)
|
56
|
-
|
57
|
-
|
58
|
-
def file_jpg_paradisi05() -> Path:
|
59
|
-
"""Return jpg file example.
|
60
|
-
|
61
|
-
Originally from: https://upload.wikimedia.org/wikipedia/commons/2/28/Laminopathic_nuclei.jpg
|
62
|
-
"""
|
63
|
-
filepath, _ = urlretrieve(
|
64
|
-
"https://lamindb-test.s3.amazonaws.com/Laminopathic_nuclei.jpg",
|
65
|
-
"paradisi05_laminopathic_nuclei.jpg",
|
66
|
-
)
|
67
|
-
return Path(filepath)
|
68
|
-
|
69
|
-
|
70
|
-
def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(
|
71
|
-
populate_registries: bool = False,
|
72
|
-
) -> Path: # pragma: no cover
|
73
|
-
"""Gene counts table from nf-core RNA-seq pipeline.
|
74
|
-
|
75
|
-
Output of: https://nf-co.re/rnaseq
|
76
|
-
"""
|
77
|
-
filepath, _ = urlretrieve(
|
78
|
-
"https://lamindb-test.s3.amazonaws.com/salmon.merged.gene_counts.tsv",
|
79
|
-
"salmon.merged.gene_counts.tsv",
|
80
|
-
)
|
81
|
-
if populate_registries:
|
82
|
-
import bionty as bt
|
83
|
-
|
84
|
-
import lamindb as ln
|
85
|
-
|
86
|
-
verbosity = ln.settings.verbosity
|
87
|
-
ln.settings.verbosity = "error"
|
88
|
-
ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
|
89
|
-
ln.Feature(name="organism", dtype=[bt.Organism]).save()
|
90
|
-
bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
|
91
|
-
ln.settings.verbosity = verbosity
|
92
|
-
|
93
|
-
return Path(filepath)
|
94
|
-
|
95
|
-
|
96
|
-
def file_fastq(in_storage_root=False) -> Path:
|
97
|
-
"""Mini mock fastq artifact."""
|
98
|
-
basedir = Path() if not in_storage_root else settings.storage.root
|
99
|
-
filepath = basedir / "input.fastq.gz"
|
100
|
-
with open(filepath, "w") as f:
|
101
|
-
f.write("Mock fastq artifact.")
|
102
|
-
return filepath
|
103
|
-
|
104
|
-
|
105
|
-
def file_bam(in_storage_root=False) -> Path: # pragma: no cover
|
106
|
-
"""Mini mock bam artifact."""
|
107
|
-
basedir = Path() if not in_storage_root else settings.storage.root
|
108
|
-
filepath = basedir / "output.bam"
|
109
|
-
with open(filepath, "w") as f:
|
110
|
-
f.write("Mock bam artifact.")
|
111
|
-
return filepath
|
112
|
-
|
113
|
-
|
114
|
-
def file_mini_csv(in_storage_root=False) -> Path:
|
115
|
-
"""Mini csv artifact."""
|
116
|
-
basedir = Path() if not in_storage_root else settings.storage.root
|
117
|
-
filepath = basedir / "mini.csv"
|
118
|
-
df = pd.DataFrame([1, 2, 3], columns=["test"])
|
119
|
-
df.to_csv(filepath, index=False)
|
120
|
-
return filepath
|
121
|
-
|
122
|
-
|
123
|
-
def file_tiff_suo22() -> Path: # pragma: no cover
|
124
|
-
"""Image file from Suo22.
|
125
|
-
|
126
|
-
Pair with anndata_suo22_Visium10X
|
127
|
-
"""
|
128
|
-
filepath, _ = urlretrieve(
|
129
|
-
"https://lamindb-test.s3.amazonaws.com/F121_LP1_4LIV.tiff",
|
130
|
-
"F121_LP1_4LIV.tiff",
|
131
|
-
)
|
132
|
-
Path("suo22/").mkdir(exist_ok=True)
|
133
|
-
filepath = Path(filepath).rename("suo22/F121_LP1_4LIV.tiff") # type: ignore
|
134
|
-
return Path(filepath)
|
135
|
-
|
136
|
-
|
137
|
-
def dir_iris_images() -> UPath: # pragma: no cover
|
138
|
-
"""Directory with 3 studies of the Iris flower: 405 images & metadata.
|
139
|
-
|
140
|
-
Based on: https://github.com/laminlabs/lamindb-dev-datasets/pull/2
|
141
|
-
"""
|
142
|
-
return UPath("s3://lamindata/iris_studies")
|
143
|
-
|
144
|
-
|
145
|
-
def anndata_mouse_sc_lymph_node(
|
146
|
-
populate_registries: bool = False,
|
147
|
-
) -> ad.AnnData: # pragma: no cover
|
148
|
-
"""Mouse lymph node scRNA-seq collection from EBI.
|
149
|
-
|
150
|
-
Subsampled to 10k genes.
|
151
|
-
|
152
|
-
From: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-8414/
|
153
|
-
|
154
|
-
Args:
|
155
|
-
populate_registries: pre-populate metadata records to simulate existing registries # noqa
|
156
|
-
"""
|
157
|
-
filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad")
|
158
|
-
adata = ad.read_h5ad(filepath)
|
159
|
-
|
160
|
-
# The column names are a bit lengthy, let's abbreviate them:
|
161
|
-
adata.obs.columns = (
|
162
|
-
adata.obs.columns.str.replace("Sample Characteristic", "")
|
163
|
-
.str.replace("Factor Value ", "Factor Value:", regex=True)
|
164
|
-
.str.replace("Factor Value\\[", "Factor Value:", regex=True)
|
165
|
-
.str.replace(" Ontology Term\\[", "ontology_id:", regex=True)
|
166
|
-
.str.strip("[]")
|
167
|
-
.str.replace("organism part", "tissue")
|
168
|
-
.str.replace("organism", "organism")
|
169
|
-
.str.replace("developmental stage", "developmental_stage")
|
170
|
-
.str.replace("cell type", "cell_type")
|
171
|
-
# the last one could be interesting, too
|
172
|
-
# .str.replace("Factor Value:Ontology Term[inferred cell_type - authors labels", "cell_type_authors")
|
173
|
-
)
|
174
|
-
# subset columns to only the ones with names
|
175
|
-
columns = [
|
176
|
-
col
|
177
|
-
for col in adata.obs.columns
|
178
|
-
if not col.startswith("ontology_id")
|
179
|
-
and not col.startswith("Factor Value")
|
180
|
-
and col != "strain"
|
181
|
-
]
|
182
|
-
adata.obs = adata.obs[columns]
|
183
|
-
|
184
|
-
# pre-populate registries
|
185
|
-
if populate_registries:
|
186
|
-
import bionty as bt
|
187
|
-
|
188
|
-
import lamindb as ln
|
189
|
-
|
190
|
-
verbosity = ln.settings.verbosity
|
191
|
-
ln.settings.verbosity = "error"
|
192
|
-
# strain
|
193
|
-
bt.ExperimentalFactor.from_source(ontology_id="EFO:0004472").save()
|
194
|
-
# developmental stage
|
195
|
-
bt.ExperimentalFactor.from_source(ontology_id="EFO:0001272").save()
|
196
|
-
# tissue
|
197
|
-
bt.Tissue.from_source(ontology_id="UBERON:0001542").save()
|
198
|
-
# cell types
|
199
|
-
ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
|
200
|
-
# assays
|
201
|
-
ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
|
202
|
-
bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
|
203
|
-
# genes
|
204
|
-
validated = bt.Gene.public(organism="mouse").validate(
|
205
|
-
adata.var.index, field="ensembl_gene_id"
|
206
|
-
)
|
207
|
-
ln.save(
|
208
|
-
bt.Gene.from_values(
|
209
|
-
adata.var.index[validated][:-19],
|
210
|
-
field="ensembl_gene_id",
|
211
|
-
organism="mouse",
|
212
|
-
)
|
213
|
-
)
|
214
|
-
# labels
|
215
|
-
labels = []
|
216
|
-
for col in ["sex", "age", "genotype", "immunophenotype"]:
|
217
|
-
labels += [ln.ULabel(name=name) for name in adata.obs[col]]
|
218
|
-
ln.save(labels)
|
219
|
-
ln.settings.verbosity = verbosity
|
220
|
-
|
221
|
-
return adata
|
222
|
-
|
223
|
-
|
224
|
-
def anndata_pbmc68k_reduced() -> ad.AnnData:
|
225
|
-
"""Modified from scanpy.collections.pbmc68k_reduced().
|
226
|
-
|
227
|
-
This code was run::
|
228
|
-
|
229
|
-
pbmc68k = sc.collections.pbmc68k_reduced()
|
230
|
-
pbmc68k.obs.rename(columns={"bulk_labels": "cell_type"}, inplace=True)
|
231
|
-
pbmc68k.obs["cell_type"] = pbmc68k.obs["cell_type"].cat.rename_categories(
|
232
|
-
{"Dendritic": "Dendritic cells", "CD14+ Monocyte": "CD14+ Monocytes"}
|
233
|
-
)
|
234
|
-
del pbmc68k.obs["G2M_score"]
|
235
|
-
del pbmc68k.obs["S_score"]
|
236
|
-
del pbmc68k.obs["phase"]
|
237
|
-
del pbmc68k.obs["n_counts"]
|
238
|
-
del pbmc68k.var["dispersions"]
|
239
|
-
del pbmc68k.var["dispersions_norm"]
|
240
|
-
del pbmc68k.var["means"]
|
241
|
-
del pbmc68k.uns["rank_genes_groups"]
|
242
|
-
del pbmc68k.uns["bulk_labels_colors"]
|
243
|
-
sc.pp.subsample(pbmc68k, fraction=0.1, random_state=123)
|
244
|
-
pbmc68k.write("scrnaseq_pbmc68k_tiny.h5ad")
|
245
|
-
"""
|
246
|
-
filepath, _ = urlretrieve(
|
247
|
-
"https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad"
|
248
|
-
)
|
249
|
-
return ad.read_h5ad(filepath)
|
250
|
-
|
251
|
-
|
252
|
-
def anndata_file_pbmc68k_test() -> Path:
|
253
|
-
"""Modified from scanpy.collections.pbmc68k_reduced().
|
254
|
-
|
255
|
-
Additional slots were added for testing purposes. Returns the filepath.
|
256
|
-
|
257
|
-
To reproduce::
|
258
|
-
|
259
|
-
pbmc68k = ln.core.datasets.anndata_pbmc68k_reduced()
|
260
|
-
pbmc68k_test = pbmc68k[:30, :200].copy()
|
261
|
-
pbmc68k_test.raw = pbmc68k_test[:, :100]
|
262
|
-
pbmc68k_test.obsp["test"] = sparse.eye(pbmc68k_test.shape[0], format="csr")
|
263
|
-
pbmc68k_test.varp["test"] = sparse.eye(pbmc68k_test.shape[1], format="csr")
|
264
|
-
pbmc68k_test.layers["test"] = sparse.csr_matrix(pbmc68k_test.shape)
|
265
|
-
pbmc68k_test.layers["test"][0] = 1.
|
266
|
-
pbmc68k_test.write("pbmc68k_test.h5ad")
|
267
|
-
"""
|
268
|
-
filepath, _ = urlretrieve(
|
269
|
-
"https://lamindb-test.s3.amazonaws.com/pbmc68k_test.h5ad", "pbmc68k_test.h5ad"
|
270
|
-
)
|
271
|
-
return Path(filepath)
|
272
|
-
|
273
|
-
|
274
|
-
def anndata_pbmc3k_processed() -> ad.AnnData: # pragma: no cover
|
275
|
-
"""Modified from scanpy.pbmc3k_processed()."""
|
276
|
-
filepath, _ = urlretrieve(
|
277
|
-
"https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad"
|
278
|
-
)
|
279
|
-
pbmc3k = ad.read_h5ad(filepath)
|
280
|
-
pbmc3k.obs.rename(columns={"louvain": "cell_type"}, inplace=True)
|
281
|
-
return pbmc3k
|
282
|
-
|
283
|
-
|
284
|
-
def anndata_human_immune_cells(
|
285
|
-
populate_registries: bool = False,
|
286
|
-
) -> ad.AnnData: # pragma: no cover
|
287
|
-
"""Cross-tissue immune cell analysis reveals tissue-specific features in humans.
|
288
|
-
|
289
|
-
From: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3
|
290
|
-
Collection: Global
|
291
|
-
|
292
|
-
To reproduce the subsample::
|
293
|
-
>>> adata = sc.read('Global.h5ad')
|
294
|
-
>>> adata.obs = adata.obs[['donor_id', 'tissue', 'cell_type', 'assay', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id']].copy()
|
295
|
-
>>> sc.pp.subsample(adata, fraction=0.005)
|
296
|
-
>>> del adata.uns["development_cache_ontology_term_id_colors"]
|
297
|
-
>>> del adata.uns["sex_ontology_term_id_colors"]
|
298
|
-
>>> adata.write('human_immune.h5ad')
|
299
|
-
"""
|
300
|
-
filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad")
|
301
|
-
adata = ad.read_h5ad(filepath)
|
302
|
-
adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True)
|
303
|
-
adata.uns.pop("cell_type_ontology_term_id_colors")
|
304
|
-
adata.uns.pop("title")
|
305
|
-
adata.uns.pop("schema_version")
|
306
|
-
adata.obs.columns = adata.obs.columns.str.replace("donor_id", "donor")
|
307
|
-
columns = [col for col in adata.obs.columns if "ontology_term" not in col]
|
308
|
-
adata.obs = adata.obs[columns]
|
309
|
-
if populate_registries:
|
310
|
-
import bionty as bt
|
311
|
-
|
312
|
-
import lamindb as ln
|
313
|
-
|
314
|
-
verbosity = ln.settings.verbosity
|
315
|
-
ln.settings.verbosity = "error"
|
316
|
-
ln.save(
|
317
|
-
bt.Gene.from_values(
|
318
|
-
adata.var.index, field="ensembl_gene_id", organism="human"
|
319
|
-
)
|
320
|
-
)
|
321
|
-
ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
|
322
|
-
ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
|
323
|
-
ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
|
324
|
-
ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
|
325
|
-
ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
|
326
|
-
ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
|
327
|
-
ln.Feature(name="organism", dtype=[bt.Organism]).save()
|
328
|
-
ln.Feature(name="donor", dtype=[ln.ULabel]).save()
|
329
|
-
bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
|
330
|
-
ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
|
331
|
-
ln.settings.verbosity = verbosity
|
332
|
-
return adata
|
333
|
-
|
334
|
-
|
335
|
-
def anndata_with_obs() -> ad.AnnData:
|
336
|
-
"""Create a mini anndata with cell_type, disease and tissue."""
|
337
|
-
import anndata as ad
|
338
|
-
import bionty.base as bionty_base
|
339
|
-
|
340
|
-
celltypes = ["T cell", "hematopoietic stem cell", "hepatocyte", "my new cell type"]
|
341
|
-
celltype_ids = ["CL:0000084", "CL:0000037", "CL:0000182", ""]
|
342
|
-
diseases = [
|
343
|
-
"chronic kidney disease",
|
344
|
-
"liver lymphoma",
|
345
|
-
"cardiac ventricle disorder",
|
346
|
-
"Alzheimer disease",
|
347
|
-
]
|
348
|
-
tissues = ["kidney", "liver", "heart", "brain"]
|
349
|
-
df = pd.DataFrame()
|
350
|
-
df["cell_type"] = celltypes * 10
|
351
|
-
df["cell_type_id"] = celltype_ids * 10
|
352
|
-
df["tissue"] = tissues * 10
|
353
|
-
df["disease"] = diseases * 10
|
354
|
-
df.index = "obs" + df.index.astype(str)
|
355
|
-
|
356
|
-
adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df)
|
357
|
-
adata.var.index = bionty_base.Gene().df().head(100)["ensembl_gene_id"].values
|
358
|
-
|
359
|
-
return adata
|
360
|
-
|
361
|
-
|
362
|
-
def anndata_suo22_Visium10X(): # pragma: no cover
|
363
|
-
"""AnnData from Suo22 generated by 10x Visium."""
|
364
|
-
import anndata as ad
|
365
|
-
|
366
|
-
filepath, _ = urlretrieve(
|
367
|
-
"https://lamindb-test.s3.amazonaws.com/suo22_Visium10X_data_LI_subset.h5ad",
|
368
|
-
"Visium10X_data_LI_subset.h5ad",
|
369
|
-
)
|
370
|
-
Path("suo22/").mkdir(exist_ok=True)
|
371
|
-
filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad")
|
372
|
-
return ad.read_h5ad(filepath)
|
373
|
-
|
374
|
-
|
375
|
-
def mudata_papalexi21_subset() -> MuData: # pragma: no cover
|
376
|
-
"""A subsetted mudata from papalexi21.
|
377
|
-
|
378
|
-
To reproduce the subsetting:
|
379
|
-
>>> !wget https://figshare.com/ndownloader/files/36509460
|
380
|
-
>>> import mudata as md
|
381
|
-
>>> import scanpy as sc
|
382
|
-
>>> mdata = md.read_h5mu("36509460")
|
383
|
-
>>> mdata = sc.pp.subsample(mdata, n_obs=200, copy=True)[0]
|
384
|
-
>>> mdata[:, -300:].copy().write("papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu")
|
385
|
-
"""
|
386
|
-
import mudata as md
|
387
|
-
|
388
|
-
md.set_options(pull_on_update=False)
|
389
|
-
|
390
|
-
filepath, _ = urlretrieve(
|
391
|
-
"https://lamindb-test.s3.amazonaws.com/papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu",
|
392
|
-
"papalexi21_subset.h5mu",
|
393
|
-
)
|
394
|
-
|
395
|
-
mdata = md.read_h5mu(filepath)
|
396
|
-
|
397
|
-
mdata.pull_obs()
|
398
|
-
|
399
|
-
# The MuData object is malformed with duplicated information
|
400
|
-
# Drop all columns for the modalities and add them again correspondingly
|
401
|
-
for mod in ["rna", "adt", "hto", "gdo"]:
|
402
|
-
mdata[mod].obs.drop(mdata[mod].obs.columns, axis=1, inplace=True)
|
403
|
-
for col in mdata.obs.columns:
|
404
|
-
for mod in ["rna", "adt", "hto", "gdo"]:
|
405
|
-
if col.endswith(f"_{mod.upper()}"):
|
406
|
-
new_col = col.replace(f"{mod}:", "")
|
407
|
-
if new_col != col:
|
408
|
-
mdata[mod].obs[new_col] = mdata.obs.pop(col)
|
409
|
-
else:
|
410
|
-
new_col = col.replace(f"{mod}:", "")
|
411
|
-
if new_col not in mdata.obs.columns and col in mdata.obs.columns:
|
412
|
-
mdata.obs[new_col] = mdata.obs.pop(col)
|
413
|
-
|
414
|
-
for col in mdata.obs.columns:
|
415
|
-
for mod in ["rna", "adt", "hto", "gdo"]:
|
416
|
-
if col.endswith(f"_{mod.upper()}"):
|
417
|
-
del mdata.obs[col]
|
418
|
-
|
419
|
-
for col in [
|
420
|
-
"orig.ident",
|
421
|
-
"MULTI_ID",
|
422
|
-
"NT",
|
423
|
-
"S.Score",
|
424
|
-
"G2M.Score",
|
425
|
-
"Phase",
|
426
|
-
"gene_target",
|
427
|
-
"guide_ID",
|
428
|
-
"HTO_classification",
|
429
|
-
]:
|
430
|
-
del mdata.obs[col]
|
431
|
-
|
432
|
-
mdata.push_obs(["percent.mito"], mods=["rna"], drop=True)
|
433
|
-
mdata["hto"].obs["technique"] = "cell hashing"
|
434
|
-
mdata["hto"].obs["technique"] = mdata["hto"].obs["technique"].astype("category")
|
435
|
-
mdata.pull_obs(["technique"], mods="hto")
|
436
|
-
|
437
|
-
return mdata
|
438
|
-
|
439
|
-
|
440
|
-
def df_iris() -> pd.DataFrame:
|
441
|
-
"""The iris collection as in sklearn.
|
442
|
-
|
443
|
-
Original code::
|
444
|
-
|
445
|
-
sklearn.collections.load_iris(as_frame=True).frame
|
446
|
-
"""
|
447
|
-
filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/iris.parquet")
|
448
|
-
return pd.read_parquet(filepath)
|
449
|
-
|
450
|
-
|
451
|
-
def df_iris_in_meter() -> pd.DataFrame:
|
452
|
-
"""The iris collection with lengths in meter."""
|
453
|
-
df = df_iris()
|
454
|
-
# rename columns
|
455
|
-
df.rename(
|
456
|
-
columns={
|
457
|
-
"sepal length (cm)": "sepal_length",
|
458
|
-
"sepal width (cm)": "sepal_width",
|
459
|
-
"petal length (cm)": "petal_length",
|
460
|
-
"petal width (cm)": "petal_width",
|
461
|
-
},
|
462
|
-
inplace=True,
|
463
|
-
)
|
464
|
-
df[["sepal_length", "sepal_width", "petal_length", "petal_width"]] /= 100
|
465
|
-
df["iris_organism_name"] = df["target"].map(
|
466
|
-
{0: "setosa", 1: "versicolor", 2: "virginica"}
|
467
|
-
)
|
468
|
-
del df["target"]
|
469
|
-
return df
|
470
|
-
|
471
|
-
|
472
|
-
def df_iris_in_meter_study1() -> pd.DataFrame:
|
473
|
-
"""The iris collection with lengths in meter."""
|
474
|
-
df_iris = df_iris_in_meter()
|
475
|
-
return df_iris.iloc[: len(df_iris) // 2]
|
476
|
-
|
477
|
-
|
478
|
-
def df_iris_in_meter_study2() -> pd.DataFrame:
|
479
|
-
"""The iris collection with lengths in meter."""
|
480
|
-
df_iris = df_iris_in_meter()
|
481
|
-
return df_iris.iloc[len(df_iris) // 2 :]
|
482
|
-
|
483
|
-
|
484
|
-
def dir_scrnaseq_cellranger(
|
485
|
-
sample_name: str, basedir: str | Path = "./", output_only: bool = True
|
486
|
-
): # pragma: no cover
|
487
|
-
"""Generate mock cell ranger outputs.
|
488
|
-
|
489
|
-
Args:
|
490
|
-
sample_name: name of the sample
|
491
|
-
basedir: run directory
|
492
|
-
output_only: only generate output files
|
493
|
-
"""
|
494
|
-
basedir = Path(basedir)
|
495
|
-
|
496
|
-
if not output_only: # pragma: no cover
|
497
|
-
fastqdir = basedir / "fastq"
|
498
|
-
fastqdir.mkdir(parents=True, exist_ok=True)
|
499
|
-
fastqfile1 = fastqdir / f"{sample_name}_R1_001.fastq.gz"
|
500
|
-
with open(fastqfile1, "w") as f:
|
501
|
-
f.write(f"{ids.base62(n_char=6)}")
|
502
|
-
fastqfile2 = fastqdir / f"{sample_name}_R2_001.fastq.gz"
|
503
|
-
fastqfile2.touch(exist_ok=True)
|
504
|
-
with open(fastqfile2, "w") as f:
|
505
|
-
f.write(f"{ids.base62(n_char=6)}")
|
506
|
-
|
507
|
-
sampledir = basedir / f"{sample_name}"
|
508
|
-
for folder in ["raw_feature_bc_matrix", "filtered_feature_bc_matrix", "analysis"]:
|
509
|
-
filedir = sampledir / folder
|
510
|
-
filedir.mkdir(parents=True, exist_ok=True)
|
511
|
-
|
512
|
-
for filename in [
|
513
|
-
"web_summary.html",
|
514
|
-
"metrics_summary.csv",
|
515
|
-
"possorted_genome_bam.bam",
|
516
|
-
"possorted_genome_bam.bam.bai",
|
517
|
-
"molecule_info.h5",
|
518
|
-
"cloupe.cloupe",
|
519
|
-
"raw_feature_bc_matrix.h5",
|
520
|
-
"raw_feature_bc_matrix/barcodes.tsv.gz",
|
521
|
-
"raw_feature_bc_matrix/features.tsv.gz",
|
522
|
-
"raw_feature_bc_matrix/matrix.mtx.gz",
|
523
|
-
"filtered_feature_bc_matrix.h5",
|
524
|
-
"filtered_feature_bc_matrix/barcodes.tsv.gz",
|
525
|
-
"filtered_feature_bc_matrix/features.tsv.gz",
|
526
|
-
"filtered_feature_bc_matrix/matrix.mtx.gz",
|
527
|
-
"analysis/analysis.csv",
|
528
|
-
]:
|
529
|
-
file = sampledir / filename
|
530
|
-
with open(file, "w") as f:
|
531
|
-
f.write(f"{ids.base62(n_char=6)}")
|
532
|
-
|
533
|
-
return sampledir
|
534
|
-
|
535
|
-
|
536
|
-
def schmidt22_crispra_gws_IFNG(basedir=".") -> Path: # pragma: no cover
|
537
|
-
"""CRISPRi screen collection of Schmidt22.
|
538
|
-
|
539
|
-
Originally from: https://zenodo.org/record/5784651
|
540
|
-
"""
|
541
|
-
filepath, _ = urlretrieve(
|
542
|
-
"https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv",
|
543
|
-
"schmidt22-crispra-gws-IFNG.csv",
|
544
|
-
)
|
545
|
-
return Path(filepath).rename(Path(basedir) / filepath)
|
546
|
-
|
547
|
-
|
548
|
-
def schmidt22_perturbseq(basedir=".") -> Path: # pragma: no cover
|
549
|
-
"""Perturb-seq collection of Schmidt22.
|
550
|
-
|
551
|
-
Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651
|
552
|
-
|
553
|
-
To reproduce the subsample:
|
554
|
-
>>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad')
|
555
|
-
>>> adata.obs = adata.obs[['cluster_name']]
|
556
|
-
>>> del adata.obsp
|
557
|
-
>>> del adata.var['features']
|
558
|
-
>>> del adata.obsm['X_pca']
|
559
|
-
>>> del adata.uns
|
560
|
-
>>> del adata.raw
|
561
|
-
>>> del adata.varm
|
562
|
-
>>> adata.obs = adata.obs.reset_index()
|
563
|
-
>>> del adata.obs['index']
|
564
|
-
>>> sc.pp.subsample(adata, 0.03)
|
565
|
-
>>> adata.write('schmidt22_perturbseq.h5ad')
|
566
|
-
"""
|
567
|
-
filepath, _ = urlretrieve(
|
568
|
-
"https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad",
|
569
|
-
"schmidt22_perturbseq.h5ad",
|
570
|
-
)
|
571
|
-
return Path(filepath).rename(Path(basedir) / filepath)
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import TYPE_CHECKING
|
5
|
+
from urllib.request import urlretrieve
|
6
|
+
|
7
|
+
import anndata as ad
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
from lnschema_core import ids
|
11
|
+
from upath import UPath
|
12
|
+
|
13
|
+
from lamindb.core._settings import settings
|
14
|
+
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from mudata import MuData
|
17
|
+
|
18
|
+
|
19
|
+
def file_fcs() -> Path:
|
20
|
+
"""Example FCS artifact."""
|
21
|
+
filepath, _ = urlretrieve(
|
22
|
+
"https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
|
23
|
+
)
|
24
|
+
return Path(filepath)
|
25
|
+
|
26
|
+
|
27
|
+
def file_fcs_alpert19(populate_registries: bool = False) -> Path: # pragma: no cover
|
28
|
+
"""FCS file from Alpert19.
|
29
|
+
|
30
|
+
Args:
|
31
|
+
populate_registries: pre-populate metadata records to simulate existing registries # noqa
|
32
|
+
"""
|
33
|
+
filepath, _ = urlretrieve(
|
34
|
+
"https://lamindb-test.s3.amazonaws.com/Alpert19-070314-Mike-Study+15-2013-plate+1-15-004-1-13_cells_found.fcs",
|
35
|
+
"Alpert19.fcs",
|
36
|
+
)
|
37
|
+
if populate_registries:
|
38
|
+
import bionty as bt
|
39
|
+
import readfcs
|
40
|
+
|
41
|
+
import lamindb as ln
|
42
|
+
|
43
|
+
verbosity = ln.settings.verbosity
|
44
|
+
ln.settings.verbosity = "error"
|
45
|
+
adata = readfcs.read(filepath)
|
46
|
+
std = bt.CellMarker.public().standardize(adata.var.index)
|
47
|
+
ln.save(
|
48
|
+
bt.CellMarker.from_values(
|
49
|
+
bt.CellMarker.public().inspect(std, "name").validated, "name"
|
50
|
+
)
|
51
|
+
)
|
52
|
+
ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
|
53
|
+
ln.Feature(name="organism", dtype=[bt.Organism]).save()
|
54
|
+
ln.settings.verbosity = verbosity
|
55
|
+
return Path(filepath)
|
56
|
+
|
57
|
+
|
58
|
+
def file_jpg_paradisi05() -> Path:
|
59
|
+
"""Return jpg file example.
|
60
|
+
|
61
|
+
Originally from: https://upload.wikimedia.org/wikipedia/commons/2/28/Laminopathic_nuclei.jpg
|
62
|
+
"""
|
63
|
+
filepath, _ = urlretrieve(
|
64
|
+
"https://lamindb-test.s3.amazonaws.com/Laminopathic_nuclei.jpg",
|
65
|
+
"paradisi05_laminopathic_nuclei.jpg",
|
66
|
+
)
|
67
|
+
return Path(filepath)
|
68
|
+
|
69
|
+
|
70
|
+
def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(
|
71
|
+
populate_registries: bool = False,
|
72
|
+
) -> Path: # pragma: no cover
|
73
|
+
"""Gene counts table from nf-core RNA-seq pipeline.
|
74
|
+
|
75
|
+
Output of: https://nf-co.re/rnaseq
|
76
|
+
"""
|
77
|
+
filepath, _ = urlretrieve(
|
78
|
+
"https://lamindb-test.s3.amazonaws.com/salmon.merged.gene_counts.tsv",
|
79
|
+
"salmon.merged.gene_counts.tsv",
|
80
|
+
)
|
81
|
+
if populate_registries:
|
82
|
+
import bionty as bt
|
83
|
+
|
84
|
+
import lamindb as ln
|
85
|
+
|
86
|
+
verbosity = ln.settings.verbosity
|
87
|
+
ln.settings.verbosity = "error"
|
88
|
+
ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
|
89
|
+
ln.Feature(name="organism", dtype=[bt.Organism]).save()
|
90
|
+
bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
|
91
|
+
ln.settings.verbosity = verbosity
|
92
|
+
|
93
|
+
return Path(filepath)
|
94
|
+
|
95
|
+
|
96
|
+
def file_fastq(in_storage_root=False) -> Path:
|
97
|
+
"""Mini mock fastq artifact."""
|
98
|
+
basedir = Path() if not in_storage_root else settings.storage.root
|
99
|
+
filepath = basedir / "input.fastq.gz"
|
100
|
+
with open(filepath, "w") as f:
|
101
|
+
f.write("Mock fastq artifact.")
|
102
|
+
return filepath
|
103
|
+
|
104
|
+
|
105
|
+
def file_bam(in_storage_root=False) -> Path: # pragma: no cover
|
106
|
+
"""Mini mock bam artifact."""
|
107
|
+
basedir = Path() if not in_storage_root else settings.storage.root
|
108
|
+
filepath = basedir / "output.bam"
|
109
|
+
with open(filepath, "w") as f:
|
110
|
+
f.write("Mock bam artifact.")
|
111
|
+
return filepath
|
112
|
+
|
113
|
+
|
114
|
+
def file_mini_csv(in_storage_root=False) -> Path:
|
115
|
+
"""Mini csv artifact."""
|
116
|
+
basedir = Path() if not in_storage_root else settings.storage.root
|
117
|
+
filepath = basedir / "mini.csv"
|
118
|
+
df = pd.DataFrame([1, 2, 3], columns=["test"])
|
119
|
+
df.to_csv(filepath, index=False)
|
120
|
+
return filepath
|
121
|
+
|
122
|
+
|
123
|
+
def file_tiff_suo22() -> Path: # pragma: no cover
|
124
|
+
"""Image file from Suo22.
|
125
|
+
|
126
|
+
Pair with anndata_suo22_Visium10X
|
127
|
+
"""
|
128
|
+
filepath, _ = urlretrieve(
|
129
|
+
"https://lamindb-test.s3.amazonaws.com/F121_LP1_4LIV.tiff",
|
130
|
+
"F121_LP1_4LIV.tiff",
|
131
|
+
)
|
132
|
+
Path("suo22/").mkdir(exist_ok=True)
|
133
|
+
filepath = Path(filepath).rename("suo22/F121_LP1_4LIV.tiff") # type: ignore
|
134
|
+
return Path(filepath)
|
135
|
+
|
136
|
+
|
137
|
+
def dir_iris_images() -> UPath: # pragma: no cover
|
138
|
+
"""Directory with 3 studies of the Iris flower: 405 images & metadata.
|
139
|
+
|
140
|
+
Based on: https://github.com/laminlabs/lamindb-dev-datasets/pull/2
|
141
|
+
"""
|
142
|
+
return UPath("s3://lamindata/iris_studies")
|
143
|
+
|
144
|
+
|
145
|
+
def anndata_mouse_sc_lymph_node(
    populate_registries: bool = False,
) -> ad.AnnData:  # pragma: no cover
    """Mouse lymph node scRNA-seq dataset from EBI.

    Subsampled to 10k genes.

    From: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-8414/

    Args:
        populate_registries: pre-populate metadata records to simulate
            existing registries.

    Returns:
        The AnnData object with abbreviated ``.obs`` column names and with the
        ontology-id, "Factor Value" and "strain" columns removed.
    """
    filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad")
    adata = ad.read_h5ad(filepath)

    # The column names are a bit lengthy, let's abbreviate them:
    adata.obs.columns = (
        adata.obs.columns.str.replace("Sample Characteristic", "")
        .str.replace("Factor Value ", "Factor Value:", regex=True)
        .str.replace("Factor Value\\[", "Factor Value:", regex=True)
        .str.replace(" Ontology Term\\[", "ontology_id:", regex=True)
        .str.strip("[]")
        .str.replace("organism part", "tissue")
        .str.replace("developmental stage", "developmental_stage")
        .str.replace("cell type", "cell_type")
        # the last one could be interesting, too
        # .str.replace("Factor Value:Ontology Term[inferred cell_type - authors labels", "cell_type_authors")
    )
    # subset columns to only the ones with names
    columns = [
        col
        for col in adata.obs.columns
        if not col.startswith("ontology_id")
        and not col.startswith("Factor Value")
        and col != "strain"
    ]
    adata.obs = adata.obs[columns]

    # pre-populate registries
    if populate_registries:
        import bionty as bt

        import lamindb as ln

        verbosity = ln.settings.verbosity
        ln.settings.verbosity = "error"
        try:
            # strain
            bt.ExperimentalFactor.from_source(ontology_id="EFO:0004472").save()
            # developmental stage
            bt.ExperimentalFactor.from_source(ontology_id="EFO:0001272").save()
            # tissue
            bt.Tissue.from_source(ontology_id="UBERON:0001542").save()
            # cell types
            ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
            # assays
            ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
            bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
            # genes
            validated = bt.Gene.public(organism="mouse").validate(
                adata.var.index, field="ensembl_gene_id"
            )
            ln.save(
                bt.Gene.from_values(
                    adata.var.index[validated][:-19],
                    field="ensembl_gene_id",
                    organism="mouse",
                )
            )
            # labels: one record per distinct value, not one per observation
            labels = []
            for col in ["sex", "age", "genotype", "immunophenotype"]:
                labels += [ln.ULabel(name=name) for name in adata.obs[col].unique()]
            ln.save(labels)
        finally:
            # always restore the caller's verbosity, even if saving fails
            ln.settings.verbosity = verbosity

    return adata
|
222
|
+
|
223
|
+
|
224
|
+
def anndata_pbmc68k_reduced() -> ad.AnnData:
    """Load a tiny variant of ``scanpy.datasets.pbmc68k_reduced()``.

    The hosted file was generated with the following code::

        pbmc68k = sc.datasets.pbmc68k_reduced()
        pbmc68k.obs.rename(columns={"bulk_labels": "cell_type"}, inplace=True)
        pbmc68k.obs["cell_type"] = pbmc68k.obs["cell_type"].cat.rename_categories(
            {"Dendritic": "Dendritic cells", "CD14+ Monocyte": "CD14+ Monocytes"}
        )
        del pbmc68k.obs["G2M_score"]
        del pbmc68k.obs["S_score"]
        del pbmc68k.obs["phase"]
        del pbmc68k.obs["n_counts"]
        del pbmc68k.var["dispersions"]
        del pbmc68k.var["dispersions_norm"]
        del pbmc68k.var["means"]
        del pbmc68k.uns["rank_genes_groups"]
        del pbmc68k.uns["bulk_labels_colors"]
        sc.pp.subsample(pbmc68k, fraction=0.1, random_state=123)
        pbmc68k.write("scrnaseq_pbmc68k_tiny.h5ad")
    """
    source_url = (
        "https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad"
    )
    local_file, _ = urlretrieve(source_url)
    return ad.read_h5ad(local_file)
|
250
|
+
|
251
|
+
|
252
|
+
def anndata_file_pbmc68k_test() -> Path:
    """Download a small test file derived from ``pbmc68k_reduced``.

    Additional slots were added for testing purposes. Returns the path of the
    downloaded file rather than a loaded object.

    To reproduce::

        pbmc68k = ln.core.datasets.anndata_pbmc68k_reduced()
        pbmc68k_test = pbmc68k[:30, :200].copy()
        pbmc68k_test.raw = pbmc68k_test[:, :100]
        pbmc68k_test.obsp["test"] = sparse.eye(pbmc68k_test.shape[0], format="csr")
        pbmc68k_test.varp["test"] = sparse.eye(pbmc68k_test.shape[1], format="csr")
        pbmc68k_test.layers["test"] = sparse.csr_matrix(pbmc68k_test.shape)
        pbmc68k_test.layers["test"][0] = 1.
        pbmc68k_test.write("pbmc68k_test.h5ad")
    """
    source_url = "https://lamindb-test.s3.amazonaws.com/pbmc68k_test.h5ad"
    local_file, _ = urlretrieve(source_url, "pbmc68k_test.h5ad")
    return Path(local_file)
|
272
|
+
|
273
|
+
|
274
|
+
def anndata_pbmc3k_processed() -> ad.AnnData:  # pragma: no cover
    """Load a modified ``scanpy.datasets.pbmc3k_processed()``.

    The ``louvain`` column of ``.obs`` is renamed to ``cell_type``.
    """
    source_url = (
        "https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad"
    )
    local_file, _ = urlretrieve(source_url)
    adata = ad.read_h5ad(local_file)
    renaming = {"louvain": "cell_type"}
    adata.obs.rename(columns=renaming, inplace=True)
    return adata
|
282
|
+
|
283
|
+
|
284
|
+
def anndata_human_immune_cells(
    populate_registries: bool = False,
) -> ad.AnnData:  # pragma: no cover
    """Cross-tissue immune cell analysis reveals tissue-specific features in humans.

    From: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3
    Dataset: Global

    To reproduce the subsample::
        >>> adata = sc.read('Global.h5ad')
        >>> adata.obs = adata.obs[['donor_id', 'tissue', 'cell_type', 'assay', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id']].copy()
        >>> sc.pp.subsample(adata, fraction=0.005)
        >>> del adata.uns["development_cache_ontology_term_id_colors"]
        >>> del adata.uns["sex_ontology_term_id_colors"]
        >>> adata.write('human_immune.h5ad')

    Args:
        populate_registries: pre-populate gene, cell type, assay, tissue,
            feature and donor records to simulate existing registries.

    Returns:
        The AnnData object with a few ``.var`` / ``.uns`` entries dropped,
        ``donor_id`` renamed to ``donor`` and the ontology-term columns removed
        from ``.obs``.
    """
    filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad")
    adata = ad.read_h5ad(filepath)
    adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True)
    adata.uns.pop("cell_type_ontology_term_id_colors")
    adata.uns.pop("title")
    adata.uns.pop("schema_version")
    adata.obs.columns = adata.obs.columns.str.replace("donor_id", "donor")
    columns = [col for col in adata.obs.columns if "ontology_term" not in col]
    adata.obs = adata.obs[columns]
    if populate_registries:
        import bionty as bt

        import lamindb as ln

        verbosity = ln.settings.verbosity
        ln.settings.verbosity = "error"
        try:
            ln.save(
                bt.Gene.from_values(
                    adata.var.index, field="ensembl_gene_id", organism="human"
                )
            )
            ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
            ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
            ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
            ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
            ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
            ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
            ln.Feature(name="organism", dtype=[bt.Organism]).save()
            ln.Feature(name="donor", dtype=[ln.ULabel]).save()
            bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
            ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
        finally:
            # always restore the caller's verbosity, even if saving fails
            ln.settings.verbosity = verbosity
    return adata
|
333
|
+
|
334
|
+
|
335
|
+
def anndata_with_obs() -> ad.AnnData:
    """Build a 40 x 100 all-zero AnnData with cell_type, tissue and disease labels.

    Each of the four label tuples is repeated ten times; observations are named
    ``obs0`` ... ``obs39`` and variables are the first 100 Ensembl gene ids of
    the ``bionty.base`` gene table.
    """
    import anndata as ad
    import bionty.base as bionty_base

    n_repeats = 10
    obs = pd.DataFrame(
        {
            "cell_type": [
                "T cell",
                "hematopoietic stem cell",
                "hepatocyte",
                "my new cell type",
            ]
            * n_repeats,
            "cell_type_id": ["CL:0000084", "CL:0000037", "CL:0000182", ""] * n_repeats,
            "tissue": ["kidney", "liver", "heart", "brain"] * n_repeats,
            "disease": [
                "chronic kidney disease",
                "liver lymphoma",
                "cardiac ventricle disorder",
                "Alzheimer disease",
            ]
            * n_repeats,
        }
    )
    obs.index = "obs" + obs.index.astype(str)

    zeros = np.zeros(shape=(40, 100), dtype=np.float32)
    adata = ad.AnnData(X=zeros, obs=obs)
    gene_table = bionty_base.Gene().df()
    adata.var.index = gene_table.head(100)["ensembl_gene_id"].values

    return adata
|
360
|
+
|
361
|
+
|
362
|
+
def anndata_suo22_Visium10X():  # pragma: no cover
    """Load the Suo22 10x Visium AnnData subset.

    The file is downloaded into the working directory, moved into a local
    ``suo22/`` folder and then read with ``anndata.read_h5ad``.
    """
    import anndata as ad

    downloaded, _ = urlretrieve(
        "https://lamindb-test.s3.amazonaws.com/suo22_Visium10X_data_LI_subset.h5ad",
        "Visium10X_data_LI_subset.h5ad",
    )
    target_dir = Path("suo22/")
    target_dir.mkdir(exist_ok=True)
    moved = Path(downloaded).rename("suo22/Visium10X_data_LI_subset.h5ad")
    return ad.read_h5ad(moved)
|
373
|
+
|
374
|
+
|
375
|
+
def mudata_papalexi21_subset() -> MuData:  # pragma: no cover
    """A subsetted mudata from papalexi21.

    Downloads the subset, then reorganizes the observation annotations between
    the global ``.obs`` and the ``.obs`` of the ``rna``, ``adt``, ``hto`` and
    ``gdo`` modalities before returning the object.

    To reproduce the subsetting:
        >>> !wget https://figshare.com/ndownloader/files/36509460
        >>> import mudata as md
        >>> import scanpy as sc
        >>> mdata = md.read_h5mu("36509460")
        >>> mdata = sc.pp.subsample(mdata, n_obs=200, copy=True)[0]
        >>> mdata[:, -300:].copy().write("papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu")
    """
    import mudata as md

    # NOTE(review): presumably opts out of mudata's automatic pulling of
    # modality annotations on update -- confirm against the mudata docs.
    md.set_options(pull_on_update=False)

    filepath, _ = urlretrieve(
        "https://lamindb-test.s3.amazonaws.com/papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu",
        "papalexi21_subset.h5mu",
    )

    mdata = md.read_h5mu(filepath)

    # NOTE(review): expected to gather the per-modality obs columns into the
    # global mdata.obs (with "<mod>:" prefixes, which the loops below strip).
    mdata.pull_obs()

    # The MuData object is malformed with duplicated information
    # Drop all columns for the modalities and add them again correspondingly
    for mod in ["rna", "adt", "hto", "gdo"]:
        mdata[mod].obs.drop(mdata[mod].obs.columns, axis=1, inplace=True)
    for col in mdata.obs.columns:
        for mod in ["rna", "adt", "hto", "gdo"]:
            if col.endswith(f"_{mod.upper()}"):
                # column belongs to this modality (suffix "_RNA", "_ADT", ...):
                # move it there under its un-prefixed name
                new_col = col.replace(f"{mod}:", "")
                if new_col != col:
                    mdata[mod].obs[new_col] = mdata.obs.pop(col)
            else:
                # otherwise keep it global, stripping the "<mod>:" prefix unless
                # the un-prefixed name is already taken or col was already moved
                new_col = col.replace(f"{mod}:", "")
                if new_col not in mdata.obs.columns and col in mdata.obs.columns:
                    mdata.obs[new_col] = mdata.obs.pop(col)

    # remove whatever modality-suffixed columns are still left in the global obs
    for col in mdata.obs.columns:
        for mod in ["rna", "adt", "hto", "gdo"]:
            if col.endswith(f"_{mod.upper()}"):
                del mdata.obs[col]

    # drop a fixed set of global columns from the returned object
    for col in [
        "orig.ident",
        "MULTI_ID",
        "NT",
        "S.Score",
        "G2M.Score",
        "Phase",
        "gene_target",
        "guide_ID",
        "HTO_classification",
    ]:
        del mdata.obs[col]

    # "percent.mito" is pushed to the rna modality only (drop=True is passed)
    mdata.push_obs(["percent.mito"], mods=["rna"], drop=True)
    # add a constant categorical "technique" annotation to the hto modality
    # and pull it up into the global obs
    mdata["hto"].obs["technique"] = "cell hashing"
    mdata["hto"].obs["technique"] = mdata["hto"].obs["technique"].astype("category")
    mdata.pull_obs(["technique"], mods="hto")

    return mdata
|
438
|
+
|
439
|
+
|
440
|
+
def df_iris() -> pd.DataFrame:
    """Load the iris dataset as provided by scikit-learn.

    The hosted parquet file was originally produced with::

        sklearn.datasets.load_iris(as_frame=True).frame
    """
    source_url = "https://lamindb-test.s3.amazonaws.com/iris.parquet"
    local_file, _ = urlretrieve(source_url)
    return pd.read_parquet(local_file)
|
449
|
+
|
450
|
+
|
451
|
+
def df_iris_in_meter() -> pd.DataFrame:
    """Return the iris dataset with sepal/petal measurements divided by 100.

    The four ``"... (cm)"`` columns are renamed to snake_case names, scaled by
    1/100, and the integer ``target`` column is replaced by a string column
    ``iris_organism_name``.
    """
    renamed_columns = {
        "sepal length (cm)": "sepal_length",
        "sepal width (cm)": "sepal_width",
        "petal length (cm)": "petal_length",
        "petal width (cm)": "petal_width",
    }
    species_names = {0: "setosa", 1: "versicolor", 2: "virginica"}

    frame = df_iris()
    frame.rename(columns=renamed_columns, inplace=True)
    measurements = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    frame[measurements] = frame[measurements] / 100
    frame["iris_organism_name"] = frame["target"].map(species_names)
    del frame["target"]
    return frame
|
470
|
+
|
471
|
+
|
472
|
+
def df_iris_in_meter_study1() -> pd.DataFrame:
    """Return the first half of the rows of ``df_iris_in_meter()``."""
    frame = df_iris_in_meter()
    midpoint = len(frame) // 2
    return frame.iloc[:midpoint]
|
476
|
+
|
477
|
+
|
478
|
+
def df_iris_in_meter_study2() -> pd.DataFrame:
    """Return the second half of the rows of ``df_iris_in_meter()``."""
    frame = df_iris_in_meter()
    midpoint = len(frame) // 2
    return frame.iloc[midpoint:]
|
482
|
+
|
483
|
+
|
484
|
+
def dir_scrnaseq_cellranger(
    sample_name: str, basedir: str | Path = "./", output_only: bool = True
):  # pragma: no cover
    """Generate mock cell ranger outputs.

    Every generated file contains a short random base62 string, so the files
    are placeholders rather than valid outputs.

    Args:
        sample_name: name of the sample; used as the output folder name
        basedir: run directory in which the folders are created
        output_only: only generate output files; if ``False``, two mock fastq
            files are also written to ``<basedir>/fastq``

    Returns:
        Path of the sample's output directory, ``<basedir>/<sample_name>``.
    """
    root = Path(basedir)

    if not output_only:  # pragma: no cover
        fastq_root = root / "fastq"
        fastq_root.mkdir(parents=True, exist_ok=True)
        for read in ("R1", "R2"):
            fastq_path = fastq_root / f"{sample_name}_{read}_001.fastq.gz"
            fastq_path.write_text(f"{ids.base62(n_char=6)}")

    sampledir = root / f"{sample_name}"
    subfolders = ("raw_feature_bc_matrix", "filtered_feature_bc_matrix", "analysis")
    for subfolder in subfolders:
        (sampledir / subfolder).mkdir(parents=True, exist_ok=True)

    output_files = (
        "web_summary.html",
        "metrics_summary.csv",
        "possorted_genome_bam.bam",
        "possorted_genome_bam.bam.bai",
        "molecule_info.h5",
        "cloupe.cloupe",
        "raw_feature_bc_matrix.h5",
        "raw_feature_bc_matrix/barcodes.tsv.gz",
        "raw_feature_bc_matrix/features.tsv.gz",
        "raw_feature_bc_matrix/matrix.mtx.gz",
        "filtered_feature_bc_matrix.h5",
        "filtered_feature_bc_matrix/barcodes.tsv.gz",
        "filtered_feature_bc_matrix/features.tsv.gz",
        "filtered_feature_bc_matrix/matrix.mtx.gz",
        "analysis/analysis.csv",
    )
    for relative_name in output_files:
        (sampledir / relative_name).write_text(f"{ids.base62(n_char=6)}")

    return sampledir
|
534
|
+
|
535
|
+
|
536
|
+
def schmidt22_crispra_gws_IFNG(basedir=".") -> Path:  # pragma: no cover
    """CRISPRa screen dataset of Schmidt22.

    Originally from: https://zenodo.org/record/5784651

    Args:
        basedir: directory the downloaded csv is moved into; it is created if
            it does not exist yet.

    Returns:
        Path of the csv file inside ``basedir``.
    """
    filepath, _ = urlretrieve(
        "https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv",
        "schmidt22-crispra-gws-IFNG.csv",
    )
    target_dir = Path(basedir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # join with the bare file name so that basedir is honored even if the
    # download helper hands back an absolute path
    return Path(filepath).rename(target_dir / Path(filepath).name)
|
546
|
+
|
547
|
+
|
548
|
+
def schmidt22_perturbseq(basedir=".") -> Path:  # pragma: no cover
    """Perturb-seq dataset of Schmidt22.

    Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651

    To reproduce the subsample:
        >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad')
        >>> adata.obs = adata.obs[['cluster_name']]
        >>> del adata.obsp
        >>> del adata.var['features']
        >>> del adata.obsm['X_pca']
        >>> del adata.uns
        >>> del adata.raw
        >>> del adata.varm
        >>> adata.obs = adata.obs.reset_index()
        >>> del adata.obs['index']
        >>> sc.pp.subsample(adata, 0.03)
        >>> adata.write('schmidt22_perturbseq.h5ad')

    Args:
        basedir: directory the downloaded h5ad is moved into; it is created if
            it does not exist yet.

    Returns:
        Path of the h5ad file inside ``basedir``.
    """
    filepath, _ = urlretrieve(
        "https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad",
        "schmidt22_perturbseq.h5ad",
    )
    target_dir = Path(basedir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # join with the bare file name so that basedir is honored even if the
    # download helper hands back an absolute path
    return Path(filepath).rename(target_dir / Path(filepath).name)
|