lamindb-0.76.8-py3-none-any.whl → lamindb-0.76.9-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (61)
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1205
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +389 -387
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -308
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +574 -574
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -867
  27. lamindb/core/_label_manager.py +253 -253
  28. lamindb/core/_mapped_collection.py +631 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +581 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -90
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -164
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -204
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -172
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/METADATA +4 -4
  59. lamindb-0.76.9.dist-info/RECORD +60 -0
  60. {lamindb-0.76.8.dist-info → lamindb-0.76.9.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.8.dist-info/RECORD +0 -60
--- lamindb-0.76.8/lamindb/core/datasets/_core.py
+++ lamindb-0.76.9/lamindb/core/datasets/_core.py
@@ -1,571 +1,581 @@
 from __future__ import annotations
 
 from pathlib import Path
 from typing import TYPE_CHECKING
 from urllib.request import urlretrieve
 
 import anndata as ad
 import numpy as np
 import pandas as pd
 from lnschema_core import ids
 from upath import UPath
 
 from lamindb.core._settings import settings
 
 if TYPE_CHECKING:
     from mudata import MuData
 
 
 def file_fcs() -> Path:
     """Example FCS artifact."""
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
     )
     return Path(filepath)
 
 
 def file_fcs_alpert19(populate_registries: bool = False) -> Path:  # pragma: no cover
     """FCS file from Alpert19.
 
     Args:
         populate_registries: pre-populate metadata records to simulate existing registries # noqa
     """
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/Alpert19-070314-Mike-Study+15-2013-plate+1-15-004-1-13_cells_found.fcs",
         "Alpert19.fcs",
     )
     if populate_registries:
         import bionty as bt
         import readfcs
 
         import lamindb as ln
 
         verbosity = ln.settings.verbosity
         ln.settings.verbosity = "error"
         adata = readfcs.read(filepath)
         std = bt.CellMarker.public().standardize(adata.var.index)
         ln.save(
             bt.CellMarker.from_values(
                 bt.CellMarker.public().inspect(std, "name").validated, "name"
             )
         )
         ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
         ln.Feature(name="organism", dtype=[bt.Organism]).save()
         ln.settings.verbosity = verbosity
     return Path(filepath)
 
 
 def file_jpg_paradisi05() -> Path:
     """Return jpg file example.
 
     Originally from: https://upload.wikimedia.org/wikipedia/commons/2/28/Laminopathic_nuclei.jpg
     """
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/Laminopathic_nuclei.jpg",
         "paradisi05_laminopathic_nuclei.jpg",
     )
     return Path(filepath)
 
 
 def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(
     populate_registries: bool = False,
 ) -> Path:  # pragma: no cover
     """Gene counts table from nf-core RNA-seq pipeline.
 
     Output of: https://nf-co.re/rnaseq
     """
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/salmon.merged.gene_counts.tsv",
         "salmon.merged.gene_counts.tsv",
     )
     if populate_registries:
         import bionty as bt
 
         import lamindb as ln
 
         verbosity = ln.settings.verbosity
         ln.settings.verbosity = "error"
         ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
         ln.Feature(name="organism", dtype=[bt.Organism]).save()
         bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
         ln.settings.verbosity = verbosity
 
     return Path(filepath)
 
 
 def file_fastq(in_storage_root=False) -> Path:
     """Mini mock fastq artifact."""
     basedir = Path() if not in_storage_root else settings.storage.root
     filepath = basedir / "input.fastq.gz"
     with open(filepath, "w") as f:
         f.write("Mock fastq artifact.")
     return filepath
 
 
 def file_bam(in_storage_root=False) -> Path:  # pragma: no cover
     """Mini mock bam artifact."""
     basedir = Path() if not in_storage_root else settings.storage.root
     filepath = basedir / "output.bam"
     with open(filepath, "w") as f:
         f.write("Mock bam artifact.")
     return filepath
 
 
 def file_mini_csv(in_storage_root=False) -> Path:
     """Mini csv artifact."""
     basedir = Path() if not in_storage_root else settings.storage.root
     filepath = basedir / "mini.csv"
     df = pd.DataFrame([1, 2, 3], columns=["test"])
     df.to_csv(filepath, index=False)
     return filepath
 
 
 def file_tiff_suo22() -> Path:  # pragma: no cover
     """Image file from Suo22.
 
     Pair with anndata_suo22_Visium10X
     """
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/F121_LP1_4LIV.tiff",
         "F121_LP1_4LIV.tiff",
     )
     Path("suo22/").mkdir(exist_ok=True)
     filepath = Path(filepath).rename("suo22/F121_LP1_4LIV.tiff")  # type: ignore
     return Path(filepath)
 
 
 def dir_iris_images() -> UPath:  # pragma: no cover
     """Directory with 3 studies of the Iris flower: 405 images & metadata.
 
-    Based on: https://github.com/laminlabs/lamindb-dev-datasets/pull/2
+    Provenance: https://lamin.ai/laminlabs/lamindata/transform/3q4MpQxRL2qZ5zKv
+
+    The problem is that the same artifact was also ingested by the downstream
+    demo notebook:
+    https://lamin.ai/laminlabs/lamindata/transform/NJvdsWWbJlZS5zKv
+
+    This is why on the UI, the artifact shows up as output of the downstream
+    demo notebook rather than the upstream curation notebook. The lineage
+    information should still be captured by
+    https://github.com/laminlabs/lnschema-core/blob/a90437e91dfbd6b9002f18c3e978bd0f9c9a632d/lnschema_core/models.py#L2050-L2052
+    but we don't use this in the UI yet.
     """
     return UPath("s3://lamindata/iris_studies")
 
 
 def anndata_mouse_sc_lymph_node(
     populate_registries: bool = False,
 ) -> ad.AnnData:  # pragma: no cover
     """Mouse lymph node scRNA-seq collection from EBI.
 
     Subsampled to 10k genes.
 
     From: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-8414/
 
     Args:
         populate_registries: pre-populate metadata records to simulate existing registries # noqa
     """
     filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad")
     adata = ad.read_h5ad(filepath)
 
     # The column names are a bit lengthy, let's abbreviate them:
     adata.obs.columns = (
         adata.obs.columns.str.replace("Sample Characteristic", "")
         .str.replace("Factor Value ", "Factor Value:", regex=True)
         .str.replace("Factor Value\\[", "Factor Value:", regex=True)
         .str.replace(" Ontology Term\\[", "ontology_id:", regex=True)
         .str.strip("[]")
         .str.replace("organism part", "tissue")
         .str.replace("organism", "organism")
         .str.replace("developmental stage", "developmental_stage")
         .str.replace("cell type", "cell_type")
         # the last one could be interesting, too
         # .str.replace("Factor Value:Ontology Term[inferred cell_type - authors labels", "cell_type_authors")
     )
     # subset columns to only the ones with names
     columns = [
         col
         for col in adata.obs.columns
         if not col.startswith("ontology_id")
         and not col.startswith("Factor Value")
         and col != "strain"
     ]
     adata.obs = adata.obs[columns]
 
     # pre-populate registries
     if populate_registries:
         import bionty as bt
 
         import lamindb as ln
 
         verbosity = ln.settings.verbosity
         ln.settings.verbosity = "error"
         # strain
         bt.ExperimentalFactor.from_source(ontology_id="EFO:0004472").save()
         # developmental stage
         bt.ExperimentalFactor.from_source(ontology_id="EFO:0001272").save()
         # tissue
         bt.Tissue.from_source(ontology_id="UBERON:0001542").save()
         # cell types
         ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
         # assays
         ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
         bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
         # genes
         validated = bt.Gene.public(organism="mouse").validate(
             adata.var.index, field="ensembl_gene_id"
         )
         ln.save(
             bt.Gene.from_values(
                 adata.var.index[validated][:-19],
                 field="ensembl_gene_id",
                 organism="mouse",
             )
         )
         # labels
         labels = []
         for col in ["sex", "age", "genotype", "immunophenotype"]:
             labels += [ln.ULabel(name=name) for name in adata.obs[col]]
         ln.save(labels)
         ln.settings.verbosity = verbosity
 
     return adata
 
 
 def anndata_pbmc68k_reduced() -> ad.AnnData:
     """Modified from scanpy.collections.pbmc68k_reduced().
 
     This code was run::
 
         pbmc68k = sc.collections.pbmc68k_reduced()
         pbmc68k.obs.rename(columns={"bulk_labels": "cell_type"}, inplace=True)
         pbmc68k.obs["cell_type"] = pbmc68k.obs["cell_type"].cat.rename_categories(
             {"Dendritic": "Dendritic cells", "CD14+ Monocyte": "CD14+ Monocytes"}
         )
         del pbmc68k.obs["G2M_score"]
         del pbmc68k.obs["S_score"]
         del pbmc68k.obs["phase"]
         del pbmc68k.obs["n_counts"]
         del pbmc68k.var["dispersions"]
         del pbmc68k.var["dispersions_norm"]
         del pbmc68k.var["means"]
         del pbmc68k.uns["rank_genes_groups"]
         del pbmc68k.uns["bulk_labels_colors"]
         sc.pp.subsample(pbmc68k, fraction=0.1, random_state=123)
         pbmc68k.write("scrnaseq_pbmc68k_tiny.h5ad")
     """
     filepath, _ = urlretrieve(
         "https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad"
     )
     return ad.read_h5ad(filepath)
 
 
 def anndata_file_pbmc68k_test() -> Path:
     """Modified from scanpy.collections.pbmc68k_reduced().
 
     Additional slots were added for testing purposes. Returns the filepath.
 
     To reproduce::
 
         pbmc68k = ln.core.datasets.anndata_pbmc68k_reduced()
         pbmc68k_test = pbmc68k[:30, :200].copy()
         pbmc68k_test.raw = pbmc68k_test[:, :100]
         pbmc68k_test.obsp["test"] = sparse.eye(pbmc68k_test.shape[0], format="csr")
         pbmc68k_test.varp["test"] = sparse.eye(pbmc68k_test.shape[1], format="csr")
         pbmc68k_test.layers["test"] = sparse.csr_matrix(pbmc68k_test.shape)
         pbmc68k_test.layers["test"][0] = 1.
         pbmc68k_test.write("pbmc68k_test.h5ad")
     """
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/pbmc68k_test.h5ad", "pbmc68k_test.h5ad"
     )
     return Path(filepath)
 
 
 def anndata_pbmc3k_processed() -> ad.AnnData:  # pragma: no cover
     """Modified from scanpy.pbmc3k_processed()."""
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad"
     )
     pbmc3k = ad.read_h5ad(filepath)
     pbmc3k.obs.rename(columns={"louvain": "cell_type"}, inplace=True)
     return pbmc3k
 
 
 def anndata_human_immune_cells(
     populate_registries: bool = False,
 ) -> ad.AnnData:  # pragma: no cover
     """Cross-tissue immune cell analysis reveals tissue-specific features in humans.
 
     From: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3
     Collection: Global
 
     To reproduce the subsample::
     >>> adata = sc.read('Global.h5ad')
     >>> adata.obs = adata.obs[['donor_id', 'tissue', 'cell_type', 'assay', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id']].copy()
     >>> sc.pp.subsample(adata, fraction=0.005)
     >>> del adata.uns["development_cache_ontology_term_id_colors"]
     >>> del adata.uns["sex_ontology_term_id_colors"]
     >>> adata.write('human_immune.h5ad')
     """
     filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad")
     adata = ad.read_h5ad(filepath)
     adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True)
     adata.uns.pop("cell_type_ontology_term_id_colors")
     adata.uns.pop("title")
     adata.uns.pop("schema_version")
     adata.obs.columns = adata.obs.columns.str.replace("donor_id", "donor")
     columns = [col for col in adata.obs.columns if "ontology_term" not in col]
     adata.obs = adata.obs[columns]
     if populate_registries:
         import bionty as bt
 
         import lamindb as ln
 
         verbosity = ln.settings.verbosity
         ln.settings.verbosity = "error"
         ln.save(
             bt.Gene.from_values(
                 adata.var.index, field="ensembl_gene_id", organism="human"
             )
         )
         ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
         ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
         ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
         ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
         ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
         ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
         ln.Feature(name="organism", dtype=[bt.Organism]).save()
         ln.Feature(name="donor", dtype=[ln.ULabel]).save()
         bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
         ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
         ln.settings.verbosity = verbosity
     return adata
 
 
 def anndata_with_obs() -> ad.AnnData:
     """Create a mini anndata with cell_type, disease and tissue."""
     import anndata as ad
     import bionty.base as bionty_base
 
     celltypes = ["T cell", "hematopoietic stem cell", "hepatocyte", "my new cell type"]
     celltype_ids = ["CL:0000084", "CL:0000037", "CL:0000182", ""]
     diseases = [
         "chronic kidney disease",
         "liver lymphoma",
         "cardiac ventricle disorder",
         "Alzheimer disease",
     ]
     tissues = ["kidney", "liver", "heart", "brain"]
     df = pd.DataFrame()
     df["cell_type"] = celltypes * 10
     df["cell_type_id"] = celltype_ids * 10
     df["tissue"] = tissues * 10
     df["disease"] = diseases * 10
     df.index = "obs" + df.index.astype(str)
 
     adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df)
     adata.var.index = bionty_base.Gene().df().head(100)["ensembl_gene_id"].values
 
     return adata
 
 
 def anndata_suo22_Visium10X():  # pragma: no cover
     """AnnData from Suo22 generated by 10x Visium."""
     import anndata as ad
 
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/suo22_Visium10X_data_LI_subset.h5ad",
         "Visium10X_data_LI_subset.h5ad",
     )
     Path("suo22/").mkdir(exist_ok=True)
     filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad")
     return ad.read_h5ad(filepath)
 
 
 def mudata_papalexi21_subset() -> MuData:  # pragma: no cover
     """A subsetted mudata from papalexi21.
 
     To reproduce the subsetting:
     >>> !wget https://figshare.com/ndownloader/files/36509460
     >>> import mudata as md
     >>> import scanpy as sc
     >>> mdata = md.read_h5mu("36509460")
     >>> mdata = sc.pp.subsample(mdata, n_obs=200, copy=True)[0]
     >>> mdata[:, -300:].copy().write("papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu")
     """
     import mudata as md
 
     md.set_options(pull_on_update=False)
 
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu",
         "papalexi21_subset.h5mu",
     )
 
     mdata = md.read_h5mu(filepath)
 
     mdata.pull_obs()
 
     # The MuData object is malformed with duplicated information
     # Drop all columns for the modalities and add them again correspondingly
     for mod in ["rna", "adt", "hto", "gdo"]:
         mdata[mod].obs.drop(mdata[mod].obs.columns, axis=1, inplace=True)
     for col in mdata.obs.columns:
         for mod in ["rna", "adt", "hto", "gdo"]:
             if col.endswith(f"_{mod.upper()}"):
                 new_col = col.replace(f"{mod}:", "")
                 if new_col != col:
                     mdata[mod].obs[new_col] = mdata.obs.pop(col)
             else:
                 new_col = col.replace(f"{mod}:", "")
                 if new_col not in mdata.obs.columns and col in mdata.obs.columns:
                     mdata.obs[new_col] = mdata.obs.pop(col)
 
     for col in mdata.obs.columns:
         for mod in ["rna", "adt", "hto", "gdo"]:
             if col.endswith(f"_{mod.upper()}"):
                 del mdata.obs[col]
 
     for col in [
         "orig.ident",
         "MULTI_ID",
         "NT",
         "S.Score",
         "G2M.Score",
         "Phase",
         "gene_target",
         "guide_ID",
         "HTO_classification",
     ]:
         del mdata.obs[col]
 
     mdata.push_obs(["percent.mito"], mods=["rna"], drop=True)
     mdata["hto"].obs["technique"] = "cell hashing"
     mdata["hto"].obs["technique"] = mdata["hto"].obs["technique"].astype("category")
     mdata.pull_obs(["technique"], mods="hto")
 
     return mdata
 
 
 def df_iris() -> pd.DataFrame:
     """The iris collection as in sklearn.
 
     Original code::
 
         sklearn.collections.load_iris(as_frame=True).frame
     """
     filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/iris.parquet")
     return pd.read_parquet(filepath)
 
 
 def df_iris_in_meter() -> pd.DataFrame:
     """The iris collection with lengths in meter."""
     df = df_iris()
     # rename columns
     df.rename(
         columns={
             "sepal length (cm)": "sepal_length",
             "sepal width (cm)": "sepal_width",
             "petal length (cm)": "petal_length",
             "petal width (cm)": "petal_width",
         },
         inplace=True,
     )
     df[["sepal_length", "sepal_width", "petal_length", "petal_width"]] /= 100
     df["iris_organism_name"] = df["target"].map(
         {0: "setosa", 1: "versicolor", 2: "virginica"}
     )
     del df["target"]
     return df
 
 
 def df_iris_in_meter_study1() -> pd.DataFrame:
     """The iris collection with lengths in meter."""
     df_iris = df_iris_in_meter()
     return df_iris.iloc[: len(df_iris) // 2]
 
 
 def df_iris_in_meter_study2() -> pd.DataFrame:
     """The iris collection with lengths in meter."""
     df_iris = df_iris_in_meter()
     return df_iris.iloc[len(df_iris) // 2 :]
 
 
 def dir_scrnaseq_cellranger(
     sample_name: str, basedir: str | Path = "./", output_only: bool = True
 ):  # pragma: no cover
     """Generate mock cell ranger outputs.
 
     Args:
         sample_name: name of the sample
         basedir: run directory
         output_only: only generate output files
     """
     basedir = Path(basedir)
 
     if not output_only:  # pragma: no cover
         fastqdir = basedir / "fastq"
         fastqdir.mkdir(parents=True, exist_ok=True)
         fastqfile1 = fastqdir / f"{sample_name}_R1_001.fastq.gz"
         with open(fastqfile1, "w") as f:
             f.write(f"{ids.base62(n_char=6)}")
         fastqfile2 = fastqdir / f"{sample_name}_R2_001.fastq.gz"
         fastqfile2.touch(exist_ok=True)
         with open(fastqfile2, "w") as f:
             f.write(f"{ids.base62(n_char=6)}")
 
     sampledir = basedir / f"{sample_name}"
     for folder in ["raw_feature_bc_matrix", "filtered_feature_bc_matrix", "analysis"]:
         filedir = sampledir / folder
         filedir.mkdir(parents=True, exist_ok=True)
 
     for filename in [
         "web_summary.html",
         "metrics_summary.csv",
         "possorted_genome_bam.bam",
         "possorted_genome_bam.bam.bai",
         "molecule_info.h5",
         "cloupe.cloupe",
         "raw_feature_bc_matrix.h5",
         "raw_feature_bc_matrix/barcodes.tsv.gz",
         "raw_feature_bc_matrix/features.tsv.gz",
         "raw_feature_bc_matrix/matrix.mtx.gz",
         "filtered_feature_bc_matrix.h5",
         "filtered_feature_bc_matrix/barcodes.tsv.gz",
         "filtered_feature_bc_matrix/features.tsv.gz",
         "filtered_feature_bc_matrix/matrix.mtx.gz",
         "analysis/analysis.csv",
     ]:
         file = sampledir / filename
         with open(file, "w") as f:
             f.write(f"{ids.base62(n_char=6)}")
 
     return sampledir
 
 
 def schmidt22_crispra_gws_IFNG(basedir=".") -> Path:  # pragma: no cover
     """CRISPRi screen collection of Schmidt22.
 
     Originally from: https://zenodo.org/record/5784651
     """
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv",
         "schmidt22-crispra-gws-IFNG.csv",
     )
     return Path(filepath).rename(Path(basedir) / filepath)
 
 
 def schmidt22_perturbseq(basedir=".") -> Path:  # pragma: no cover
     """Perturb-seq collection of Schmidt22.
 
     Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651
 
     To reproduce the subsample:
     >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad')
     >>> adata.obs = adata.obs[['cluster_name']]
     >>> del adata.obsp
     >>> del adata.var['features']
     >>> del adata.obsm['X_pca']
     >>> del adata.uns
     >>> del adata.raw
     >>> del adata.varm
     >>> adata.obs = adata.obs.reset_index()
     >>> del adata.obs['index']
     >>> sc.pp.subsample(adata, 0.03)
     >>> adata.write('schmidt22_perturbseq.h5ad')
     """
     filepath, _ = urlretrieve(
         "https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad",
         "schmidt22_perturbseq.h5ad",
     )
     return Path(filepath).rename(Path(basedir) / filepath)
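
For orientation, every function in this module is a small example-data loader used in lamindb's tests and docs: it either generates a mock file locally or fetches a prepared file via urlretrieve. A minimal sketch of typical usage, calling two of the loaders shown in the diff above (assumes lamindb is installed and the lamindb-test S3 bucket is reachable)::

    # Calls loaders defined in lamindb/core/datasets/_core.py above.
    import lamindb as ln

    csv_path = ln.core.datasets.file_mini_csv()  # writes mini.csv locally and returns its Path
    adata = ln.core.datasets.anndata_pbmc68k_reduced()  # downloads a small .h5ad and reads it
    print(csv_path, adata.shape)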
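
The one substantive change in this file is the expanded dir_iris_images docstring, which now records the provenance of the s3://lamindata/iris_studies prefix and a known lineage quirk in the Lamin UI. Because the function returns a UPath rather than downloading anything, the directory can be browsed with pathlib-style calls; a sketch under the assumption that s3fs is installed and the bucket allows anonymous reads (the anon flag is an assumption, not part of the function)::

    from upath import UPath

    # Same prefix that dir_iris_images() returns; anon=True assumes public read access.
    iris_dir = UPath("s3://lamindata/iris_studies", anon=True)
    for study in iris_dir.iterdir():  # pathlib-style listing over S3
        print(study.name)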