lamindb 0.48a2__py3-none-any.whl → 0.48.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  from typing import List, Set, Union
2
2
 
3
3
  from lnschema_core import ORM, File, Run
4
+ from lnschema_core.models import format_datetime
4
5
 
5
6
 
6
7
  def view_lineage(file: File, with_children: bool = True):
@@ -32,9 +33,10 @@ def view_lineage(file: File, with_children: bool = True):
32
33
  u = graphviz.Digraph(
33
34
  file.id,
34
35
  node_attr={
35
- "fillcolor": "antiquewhite",
36
- "color": "orange",
36
+ "fillcolor": "honeydew",
37
+ "color": "seagreen",
37
38
  "fontname": "Helvetica",
39
+ "fontsize": "10",
38
40
  },
39
41
  edge_attr={"arrowsize": "0.5"},
40
42
  )
@@ -47,9 +49,9 @@ def view_lineage(file: File, with_children: bool = True):
47
49
  shape = "box"
48
50
  fillcolor = "gainsboro"
49
51
  else:
50
- shape = "oval"
51
- style = "filled"
52
- fillcolor = "antiquewhite"
52
+ style = "rounded,filled"
53
+ shape = "box"
54
+ fillcolor = "honeydew"
53
55
  u.node(
54
56
  node_id,
55
57
  label=node_label,
@@ -64,39 +66,58 @@ def view_lineage(file: File, with_children: bool = True):
64
66
  add_node(row["target_record"], row["target"], row["target_label"], u)
65
67
 
66
68
  u.edge(row["source"], row["target"], color="dimgrey")
67
- # label the searched file orange
68
- u.node(file.id, label=file_label, style="filled", fillcolor="orange", shape="oval")
69
+ # label the searched file mediumseagreen
70
+ u.node(
71
+ file.id,
72
+ label=file_label,
73
+ style="rounded,filled",
74
+ fillcolor="mediumseagreen",
75
+ shape="box",
76
+ )
69
77
 
70
78
  return u
71
79
 
72
80
 
73
- def view_parents(record: ORM, field: str, distance: int = 100):
81
+ def view_parents(
82
+ record: ORM, field: str, with_children: bool = False, distance: int = 100
83
+ ):
74
84
  """Graph of parents."""
75
85
  if not hasattr(record, "parents"):
76
86
  return NotImplementedError(
77
87
  f"Parents view is not supported for {record.__class__.__name__}!"
78
88
  )
79
89
  import graphviz
90
+ import pandas as pd
80
91
 
81
92
  df_edges = _df_edges_from_parents(record=record, field=field, distance=distance)
93
+ if with_children:
94
+ df_edges = pd.concat(
95
+ [
96
+ df_edges,
97
+ _df_edges_from_parents(
98
+ record=record, field=field, distance=distance, children=True
99
+ ),
100
+ ]
101
+ ).drop_duplicates()
82
102
 
83
103
  record_label = record.__getattribute__(field)
84
104
 
85
105
  u = graphviz.Digraph(
86
106
  record.id,
87
107
  node_attr={
88
- "color": "orange",
89
- "fillcolor": "antiquewhite",
108
+ "color": "seagreen",
109
+ "fillcolor": "honeydew",
90
110
  "shape": "box",
91
111
  "style": "rounded,filled",
92
112
  "fontname": "Helvetica",
113
+ "fontsize": "10",
93
114
  },
94
115
  edge_attr={"arrowsize": "0.5"},
95
116
  )
96
117
  u.node(
97
118
  record_label.replace(":", "_"),
98
119
  label=record_label,
99
- fillcolor="orange",
120
+ fillcolor="mediumseagreen",
100
121
  )
101
122
  for _, row in df_edges.iterrows():
102
123
  u.node(row["source"], label=row["source_label"])
@@ -105,17 +126,21 @@ def view_parents(record: ORM, field: str, distance: int = 100):
105
126
  return u
106
127
 
107
128
 
108
- def _get_parents(record: ORM, field: str, distance: int):
129
+ def _get_parents(record: ORM, field: str, distance: int, children: bool = False):
109
130
  """Recursively get parent records within a distance."""
131
+ if children:
132
+ key = "parents"
133
+ else:
134
+ key = "children"
110
135
  model = record.__class__
111
- condition = f"children__{field}"
136
+ condition = f"{key}__{field}"
112
137
  results = model.select(**{condition: record.__getattribute__(field)}).all()
113
138
  if distance < 2:
114
139
  return results
115
140
 
116
141
  d = 2
117
142
  while d < distance:
118
- condition = "children__" + condition
143
+ condition = f"{key}__{condition}"
119
144
  records = model.select(**{condition: record.__getattribute__(field)}).all()
120
145
 
121
146
  if len(records) == 0:
@@ -126,16 +151,21 @@ def _get_parents(record: ORM, field: str, distance: int):
126
151
  return results
127
152
 
128
153
 
129
- def _df_edges_from_parents(record: ORM, field: str, distance: int):
154
+ def _df_edges_from_parents(
155
+ record: ORM, field: str, distance: int, children: bool = False
156
+ ):
130
157
  """Construct a DataFrame of edges as the input of graphviz.Digraph."""
131
- parents = _get_parents(record=record, field=field, distance=distance)
158
+ key = "children" if children else "parents"
159
+ parents = _get_parents(
160
+ record=record, field=field, distance=distance, children=children
161
+ )
132
162
  records = parents | record.__class__.objects.filter(id=record.id)
133
- df = records.distinct().df(include=[f"parents__{field}"])
134
- df_edges = df[[f"parents__{field}", field]]
135
- df_edges = df_edges.explode(f"parents__{field}")
163
+ df = records.distinct().df(include=[f"{key}__{field}"])
164
+ df_edges = df[[f"{key}__{field}", field]]
165
+ df_edges = df_edges.explode(f"{key}__{field}")
136
166
  df_edges.dropna(axis=0, inplace=True)
137
167
  df_edges.rename(
138
- columns={f"parents__{field}": "source", field: "target"}, inplace=True
168
+ columns={f"{key}__{field}": "source", field: "target"}, inplace=True
139
169
  )
140
170
  df_edges = df_edges.drop_duplicates()
141
171
 
@@ -153,10 +183,10 @@ def _get_all_parent_runs(file: File):
153
183
  all_runs = {file.run}
154
184
 
155
185
  runs = [file.run]
156
- while any([r.inputs.exists() for r in runs if r is not None]):
186
+ while any([r.input_files.exists() for r in runs if r is not None]):
157
187
  inputs = []
158
188
  for r in runs:
159
- inputs += r.inputs.all()
189
+ inputs += r.input_files.all()
160
190
  runs = [f.run for f in inputs]
161
191
  all_runs.update(runs)
162
192
  return all_runs
@@ -166,29 +196,35 @@ def _get_all_child_runs(file: File):
166
196
  """Get all output file runs recursively."""
167
197
  all_runs: Set[Run] = set()
168
198
 
169
- runs = {f.run for f in file.run.outputs.all()}
199
+ runs = {f.run for f in file.run.output_files.all()}
170
200
  while runs.difference(all_runs):
171
201
  all_runs.update(runs)
172
202
  child_runs: Set[Run] = set()
173
203
  for r in runs:
174
- child_runs.update(Run.select(inputs__id__in=r.outputs.list("id")).list())
204
+ child_runs.update(
205
+ Run.select(input_files__id__in=r.output_files.list("id")).list()
206
+ )
175
207
  runs = child_runs
176
208
  return all_runs
177
209
 
178
210
 
179
211
  def _label_file_run(record: Union[File, Run]):
180
212
  if isinstance(record, File):
213
+ if record.description is None:
214
+ name = record.key
215
+ else:
216
+ name = record.description.replace("&", "&amp;")
217
+
181
218
  return (
182
- rf'<{record.key}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
183
- rf' FACE="Monospace">id={record.id}</FONT>>'
184
- if record.key is not None
185
- else record.id
219
+ rf'<{name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
220
+ rf' FACE="Monospace">id={record.id}<BR/>suffix={record.suffix}</FONT>>'
186
221
  )
187
222
  elif isinstance(record, Run):
188
- name = f'{record.transform.name.replace("&", "&amp;")} '
223
+ name = f'{record.transform.name.replace("&", "&amp;")}'
189
224
  return (
190
225
  rf'<{name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
191
- rf' FACE="Monospace">id={record.id}</FONT>>'
226
+ rf' FACE="Monospace">id={record.id}<BR/>type={record.transform.type},'
227
+ rf" user={record.created_by.name}<BR/>run_at={format_datetime(record.run_at)}</FONT>>" # noqa
192
228
  )
193
229
 
194
230
 
@@ -199,10 +235,10 @@ def _df_edges_from_runs(all_runs: List[Run]):
199
235
  for run in all_runs:
200
236
  if run is None:
201
237
  continue
202
- if run.inputs.exists():
203
- df_values.append((run.inputs.list(), run))
204
- if run.outputs.exists():
205
- df_values.append((run, run.outputs.list()))
238
+ if run.input_files.exists():
239
+ df_values.append((run.input_files.list(), run))
240
+ if run.output_files.exists():
241
+ df_values.append((run, run.output_files.list()))
206
242
  df = pd.DataFrame(df_values, columns=["source_record", "target_record"])
207
243
  df = df.explode("source_record")
208
244
  df = df.explode("target_record")
@@ -4,7 +4,9 @@
4
4
  :toctree: .
5
5
 
6
6
  file_fcs
7
+ file_fcs_alpert19
7
8
  file_jpg_paradisi05
9
+ file_tiff_suo22
8
10
  file_fastq
9
11
  file_bam
10
12
  file_mini_csv
@@ -19,6 +21,10 @@
19
21
  anndata_pbmc68k_reduced
20
22
  anndata_pbmc3k_processed
21
23
  anndata_with_obs
24
+ anndata_suo22_Visium10X
25
+ mudata_papalexi21_subset
26
+ schmidt22_crispra_gws_IFNG
27
+ schmidt22_perturbseq
22
28
  fake_bio_notebook_titles
23
29
  """
24
30
 
@@ -27,6 +33,7 @@ from ._core import (
27
33
  anndata_mouse_sc_lymph_node,
28
34
  anndata_pbmc3k_processed,
29
35
  anndata_pbmc68k_reduced,
36
+ anndata_suo22_Visium10X,
30
37
  anndata_with_obs,
31
38
  df_iris,
32
39
  df_iris_in_meter,
@@ -36,8 +43,13 @@ from ._core import (
36
43
  file_bam,
37
44
  file_fastq,
38
45
  file_fcs,
46
+ file_fcs_alpert19,
39
47
  file_jpg_paradisi05,
40
48
  file_mini_csv,
49
+ file_tiff_suo22,
41
50
  generate_cell_ranger_files,
51
+ mudata_papalexi21_subset,
52
+ schmidt22_crispra_gws_IFNG,
53
+ schmidt22_perturbseq,
42
54
  )
43
55
  from ._fake import fake_bio_notebook_titles
@@ -11,13 +11,22 @@ from .._settings import settings
11
11
 
12
12
 
13
13
  def file_fcs() -> Path:
14
- """Return fcs file example."""
14
+ """Example FCS file."""
15
15
  filepath, _ = urlretrieve(
16
16
  "https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
17
17
  )
18
18
  return Path(filepath)
19
19
 
20
20
 
21
+ def file_fcs_alpert19() -> Path:
22
+ """FCS file from Alpert19."""
23
+ filepath, _ = urlretrieve(
24
+ "https://lamindb-test.s3.amazonaws.com/Alpert19-070314-Mike-Study+15-2013-plate+1-15-004-1-13_cells_found.fcs", # noqa
25
+ "Alpert19.fcs",
26
+ )
27
+ return Path(filepath)
28
+
29
+
21
30
  def file_jpg_paradisi05() -> Path:
22
31
  """Return jpg file example.
23
32
 
@@ -30,26 +39,45 @@ def file_jpg_paradisi05() -> Path:
30
39
  return Path(filepath)
31
40
 
32
41
 
33
- def file_fastq() -> Path:
42
+ def file_fastq(in_storage_root=False) -> Path:
34
43
  """Mini mock fastq file."""
35
- with open("./input.fastq.gz", "w") as f:
44
+ basedir = Path(".") if not in_storage_root else settings.storage
45
+ filepath = basedir / "input.fastq.gz"
46
+ with open(filepath, "w") as f:
36
47
  f.write("Mock fastq file.")
37
- return Path("./input.fastq.gz")
48
+ return filepath
38
49
 
39
50
 
40
- def file_bam() -> Path:
51
+ def file_bam(in_storage_root=False) -> Path:
41
52
  """Mini mock bam file."""
42
- with open("./output.bam", "w") as f:
53
+ basedir = Path(".") if not in_storage_root else settings.storage
54
+ filepath = basedir / "output.bam"
55
+ with open(filepath, "w") as f:
43
56
  f.write("Mock bam file.")
44
- return Path("./output.bam")
57
+ return filepath
45
58
 
46
59
 
47
- def file_mini_csv() -> Path:
60
+ def file_mini_csv(in_storage_root=False) -> Path:
48
61
  """Mini csv file."""
49
- filename = Path("./mini.csv")
62
+ basedir = Path(".") if not in_storage_root else settings.storage
63
+ filepath = basedir / "mini.csv"
50
64
  df = pd.DataFrame([1, 2, 3], columns=["test"])
51
- df.to_csv(filename, index=False)
52
- return filename
65
+ df.to_csv(filepath, index=False)
66
+ return filepath
67
+
68
+
69
+ def file_tiff_suo22():
70
+ """Image file from Suo22.
71
+
72
+ Pair with anndata_suo22_Visium10X
73
+ """
74
+ filepath, _ = urlretrieve(
75
+ "https://lamindb-test.s3.amazonaws.com/F121_LP1_4LIV.tiff",
76
+ "F121_LP1_4LIV.tiff",
77
+ )
78
+ Path("suo22/").mkdir(exist_ok=True)
79
+ filepath = Path(filepath).rename("suo22/F121_LP1_4LIV.tiff")
80
+ return Path(filepath)
53
81
 
54
82
 
55
83
  def dir_scrnaseq_cellranger(in_storage_root=False) -> Path:
@@ -59,12 +87,12 @@ def dir_scrnaseq_cellranger(in_storage_root=False) -> Path:
59
87
  )
60
88
  from zipfile import ZipFile
61
89
 
62
- path = Path(".") if not in_storage_root else settings.storage
90
+ basedir = Path(".") if not in_storage_root else settings.storage
63
91
  with ZipFile(filepath, "r") as zipObj:
64
92
  # Extract all the contents of zip file in current directory
65
- zipObj.extractall(path=path)
93
+ zipObj.extractall(path=basedir)
66
94
 
67
- return path / "cellranger_run_001"
95
+ return basedir / "cellranger_run_001"
68
96
 
69
97
 
70
98
  def anndata_mouse_sc_lymph_node() -> ad.AnnData:
@@ -133,7 +161,10 @@ def anndata_human_immune_cells() -> ad.AnnData:
133
161
  adata.write('human_immune.h5ad')
134
162
  """
135
163
  filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad")
136
- return ad.read(filepath)
164
+ adata = ad.read(filepath)
165
+ del adata.raw
166
+ adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True)
167
+ return adata
137
168
 
138
169
 
139
170
  def anndata_with_obs() -> ad.AnnData:
@@ -163,6 +194,40 @@ def anndata_with_obs() -> ad.AnnData:
163
194
  return adata
164
195
 
165
196
 
197
+ def anndata_suo22_Visium10X():
198
+ """AnnData from Suo22 generated by 10x Visium."""
199
+ import anndata as ad
200
+
201
+ filepath, _ = urlretrieve(
202
+ "https://lamindb-test.s3.amazonaws.com/suo22_Visium10X_data_LI_subset.h5ad",
203
+ "Visium10X_data_LI_subset.h5ad",
204
+ )
205
+ Path("suo22/").mkdir(exist_ok=True)
206
+ filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad")
207
+ return ad.read(filepath)
208
+
209
+
210
+ def mudata_papalexi21_subset():
211
+ """A subsetted mudata from papalexi21.
212
+
213
+ To reproduce the subsetting:
214
+ >>> !wget https://figshare.com/ndownloader/files/36509460
215
+ >>> import mudata as md
216
+ >>> import scanpy as sc
217
+ >>> mdata = md.read_h5mu("36509460")
218
+ >>> mdata = sc.pp.subsample(mdata, n_obs=200, copy=True)[0]
219
+ >>> mdata[:, -300:].copy().write("papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu") # noqa
220
+ """
221
+ import mudata as md
222
+
223
+ filepath, _ = urlretrieve(
224
+ "https://lamindb-test.s3.amazonaws.com/papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu", # noqa
225
+ "papalexi21_subset.h5mu",
226
+ )
227
+
228
+ return md.read_h5mu(filepath)
229
+
230
+
166
231
  def df_iris() -> pd.DataFrame:
167
232
  """The iris dataset as in sklearn.
168
233
 
@@ -260,53 +325,39 @@ def generate_cell_ranger_files(
260
325
  return sampledir
261
326
 
262
327
 
263
- # def schmidt22_crispra_gws_IFNG() -> Path:
264
- # """CRISPRi screen dataset of Schmidt22.
265
-
266
- # Originally from: https://zenodo.org/record/5784651
267
- # """
268
- # filepath, _ = urlretrieve(
269
- # "https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv",
270
- # "schmidt22-crispra-gws-IFNG.csv",
271
- # )
272
- # return Path(filepath)
273
-
274
-
275
- # def schmidt22_perturbseq() -> Path:
276
- # """Perturb-seq dataset of Schmidt22.
277
-
278
- # Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651
279
-
280
- # To reproduce the subsample:
281
- # >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad')
282
- # >>> adata.obs = adata.obs[['cluster_name']]
283
- # >>> del adata.obsp
284
- # >>> del adata.var['features']
285
- # >>> del adata.obsm['X_pca']
286
- # >>> del adata.uns
287
- # >>> del adata.raw
288
- # >>> del adata.varm
289
- # >>> adata.obs = adata.obs.reset_index()
290
- # >>> del adata.obs['index']
291
- # >>> sc.pp.subsample(adata, 0.03)
292
- # >>> adata.write('schmidt22_perturbseq.h5ad')
293
- # """
294
- # filepath, _ = urlretrieve(
295
- # "https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad",
296
- # "schmidt22_perturbseq.h5ad",
297
- # )
298
- # return Path(filepath)
299
-
300
-
301
- # def dir_scrnaseq_cellranger_schmidt22() -> Path:
302
- # """BFXoutput directory containing Schmidt22_perturbseq."""
303
- # filepath, _ = urlretrieve(
304
- # "https://lamindb-test.s3.amazonaws.com/scrnaseq-cellranger-schmidt22.zip",
305
- # )
306
- # from zipfile import ZipFile
307
-
308
- # with ZipFile(filepath, "r") as zipObj:
309
- # # Extract all the contents of zip file in current directory
310
- # zipObj.extractall(path=".")
311
-
312
- # return Path("scrnaseq-cellranger-schmidt22")
328
+ def schmidt22_crispra_gws_IFNG(basedir=".") -> Path:
329
+ """CRISPRi screen dataset of Schmidt22.
330
+
331
+ Originally from: https://zenodo.org/record/5784651
332
+ """
333
+ filepath, _ = urlretrieve(
334
+ "https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv",
335
+ "schmidt22-crispra-gws-IFNG.csv",
336
+ )
337
+ return Path(filepath).rename(Path(basedir) / filepath)
338
+
339
+
340
+ def schmidt22_perturbseq(basedir=".") -> Path:
341
+ """Perturb-seq dataset of Schmidt22.
342
+
343
+ Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651
344
+
345
+ To reproduce the subsample:
346
+ >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad')
347
+ >>> adata.obs = adata.obs[['cluster_name']]
348
+ >>> del adata.obsp
349
+ >>> del adata.var['features']
350
+ >>> del adata.obsm['X_pca']
351
+ >>> del adata.uns
352
+ >>> del adata.raw
353
+ >>> del adata.varm
354
+ >>> adata.obs = adata.obs.reset_index()
355
+ >>> del adata.obs['index']
356
+ >>> sc.pp.subsample(adata, 0.03)
357
+ >>> adata.write('schmidt22_perturbseq.h5ad')
358
+ """
359
+ filepath, _ = urlretrieve(
360
+ "https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad",
361
+ "schmidt22_perturbseq.h5ad",
362
+ )
363
+ return Path(filepath).rename(Path(basedir) / filepath)
@@ -12,10 +12,6 @@ from lamindb_setup.dev.upath import UPath
12
12
  from lamindb_setup.dev.upath import infer_filesystem as _infer_filesystem
13
13
 
14
14
  from ._anndata_sizes import size_adata
15
-
16
- try:
17
- from ._backed_access import AnnDataAccessor, BackedAccessor
18
- except ImportError:
19
- pass
15
+ from ._backed_access import AnnDataAccessor, BackedAccessor
20
16
  from .file import delete_storage, load_to_memory, store_object
21
17
  from .object import infer_suffix, write_to_file