lamindb 0.49.3__py3-none-any.whl → 0.50.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +55 -15
- lamindb/_context.py +25 -25
- lamindb/_delete.py +8 -8
- lamindb/_feature.py +15 -11
- lamindb/_feature_set.py +70 -39
- lamindb/_file.py +80 -56
- lamindb/_filter.py +5 -5
- lamindb/_from_values.py +55 -92
- lamindb/{_manager.py → _query_manager.py} +8 -5
- lamindb/{_queryset.py → _query_set.py} +31 -28
- lamindb/{_orm.py → _registry.py} +53 -294
- lamindb/_save.py +14 -13
- lamindb/_synonym.py +203 -0
- lamindb/_validate.py +134 -0
- lamindb/_view.py +15 -9
- lamindb/dev/__init__.py +13 -6
- lamindb/dev/_data.py +195 -0
- lamindb/dev/_feature_manager.py +102 -0
- lamindb/dev/_settings.py +10 -9
- lamindb/dev/_view_parents.py +36 -17
- lamindb/dev/datasets/__init__.py +5 -3
- lamindb/dev/datasets/_core.py +35 -17
- lamindb/dev/exc.py +4 -0
- lamindb/dev/storage/_backed_access.py +53 -17
- lamindb/dev/storage/file.py +44 -15
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/METADATA +34 -36
- lamindb-0.50.1.dist-info/RECORD +47 -0
- lamindb/_feature_manager.py +0 -237
- lamindb-0.49.3.dist-info/RECORD +0 -43
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/LICENSE +0 -0
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/WHEEL +0 -0
- {lamindb-0.49.3.dist-info → lamindb-0.50.1.dist-info}/entry_points.txt +0 -0
lamindb/dev/_feature_manager.py
ADDED
@@ -0,0 +1,102 @@
+from typing import Dict, List, Union
+
+import pandas as pd
+from lnschema_core.models import Dataset, FeatureSet, File
+
+from .._query_set import QuerySet
+
+
+def create_features_df(
+    file: File, feature_sets: List[FeatureSet], exclude: bool = True
+):
+    features = []
+    for feature_set in feature_sets:
+        if exclude:
+            features_df = feature_set.features.exclude(registries__isnull=True).df()
+        else:
+            features_df = feature_set.features.df()
+        slots = file.feature_sets.through.objects.filter(
+            file=file, feature_set=feature_set
+        ).list("slot")
+        for slot in slots:
+            features_df["slot"] = slot
+            features.append(features_df)
+    features_df = pd.concat(features)
+    return features_df.sort_values(["slot", "registries"])
+
+
+def get_accessor_by_orm(host: Union[File, Dataset]) -> Dict:
+    dictionary = {
+        field.related_model.__get_name_with_schema__(): field.name
+        for field in host._meta.related_objects
+    }
+    dictionary["core.Feature"] = "features"
+    dictionary["core.Label"] = "labels"
+    return dictionary
+
+
+def get_feature_set_by_slot(host) -> Dict:
+    # if the host is not yet saved
+    if host._state.adding:
+        return host._feature_sets
+    # otherwise, we need a query
+    feature_set_links = host.feature_sets.through.objects.filter(file_id=host.id)
+    return {
+        feature_set_link.slot: FeatureSet.objects.get(
+            id=feature_set_link.feature_set_id
+        )
+        for feature_set_link in feature_set_links
+    }
+
+
+class FeatureManager:
+    """Feature manager (:attr:`~lamindb.dev.Data.features`).
+
+    See :class:`~lamindb.dev.Data` for more information.
+    """
+
+    def __init__(self, host: Union[File, Dataset]):
+        self._host = host
+        self._feature_set_by_slot = get_feature_set_by_slot(host)
+        self._accessor_by_orm = get_accessor_by_orm(host)
+
+    def __repr__(self) -> str:
+        if len(self._feature_set_by_slot) > 0:
+            msg = ""
+            for slot, feature_set in self._feature_set_by_slot.items():
+                msg += f"'{slot}': {feature_set}\n"
+            return msg
+        else:
+            return "no linked features"
+
+    def __getitem__(self, slot) -> QuerySet:
+        if slot not in self._feature_set_by_slot:
+            raise ValueError(
+                f"No linked feature set for slot: {slot}\nDid you get validation"
+                " warnings? Only features that match registered features get validated"
+                " and linked."
+            )
+        feature_set = self._feature_set_by_slot[slot]
+        orm_name = feature_set.registry
+        return getattr(feature_set, self._accessor_by_orm[orm_name]).all()
+
+    def add_feature_set(self, feature_set: FeatureSet, slot: str):
+        """Add new feature set to a slot.
+
+        Args:
+            feature_set: `FeatureSet` A feature set object.
+            slot: `str` The access slot.
+        """
+        if self._host._state.adding:
+            raise ValueError(
+                "Please save the file or dataset before adding a feature set!"
+            )
+        feature_set.save()
+        link_record = self._host.feature_sets.through.objects.filter(
+            file=self._host, feature_set=feature_set, slot=slot
+        ).one_or_none()
+        if link_record is None:
+            self._host.feature_sets.through(
+                file=self._host, feature_set=feature_set, slot=slot
+            ).save()
+            self._feature_set_by_slot[slot] = feature_set
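
For orientation, a minimal usage sketch of the slot-based access that the new `FeatureManager` provides (the file key, slots, and query results below are hypothetical illustrations, not part of the package):

```python
import lamindb as ln

# a saved File exposes the manager as `file.features` (see the docstring above)
file = ln.File.filter(key="example.h5ad").one()  # hypothetical key

# __repr__ lists one feature set per slot, e.g. "'var': FeatureSet(...)"
print(file.features)

# __getitem__ returns the features linked under a slot as a QuerySet
var_features = file.features["var"]
print(var_features.df())

# add_feature_set links another saved feature set under a new slot
feature_set = ln.FeatureSet.filter().first()
if feature_set is not None:
    file.features.add_feature_set(feature_set, slot="obs")
```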
lamindb/dev/_settings.py
CHANGED
@@ -14,7 +14,7 @@ class Settings:
     """

     def __init__(self):
-        self._verbosity: int =
+        self._verbosity: int = 4  # hint-level logging
         logger.set_verbosity(self._verbosity)

     upon_file_create_if_hash_exists: Literal[
@@ -34,7 +34,7 @@ class Settings:
     It speeds up file creation by about a factor 100.
     """
     upon_create_search_names: bool = True
-    """To speed up creating
+    """To speed up creating Registry objects (default `True`).

     If `True`, search for alternative names.

@@ -80,13 +80,14 @@ class Settings:

     @property
     def verbosity(self) -> int:
-        """Verbosity (default
-
-        - 0: only show 'error' messages
-        - 1: also show 'warning' messages
-        - 2: also show '
-        - 3: also show '
-        - 4: also show
+        """Verbosity (default 4 / 'hint').
+
+        - 0: ❌ only show 'error' messages
+        - 1: 🔶 also show 'warning' messages
+        - 2: ✅ also show 'success' messages
+        - 3: 💬 also show 'info' messages
+        - 4: 💡 also show 'hint' messages
+        - 5: 🐛 also show detailed 'debug' messages

         This is based on Scanpy's and Django's verbosity setting.
         """
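
The verbosity property above maps integer levels 0–5 to logger categories; a quick sketch of setting it through the public settings object (assuming the usual `lamindb.settings` entry point):

```python
import lamindb as ln

ln.settings.verbosity = 4  # new default: also show hints
ln.settings.verbosity = 1  # quieter: only errors and warnings

# the other flag touched in this diff: skip the name search on record creation
ln.settings.upon_create_search_names = False
```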
lamindb/dev/_view_parents.py
CHANGED
@@ -1,7 +1,12 @@
 from typing import List, Set, Union

-from lnschema_core import
-from lnschema_core.models import
+from lnschema_core import File, Registry, Run, Transform
+from lnschema_core.models import format_field_value
+
+LAMIN_GREEN_LIGHTER = "#10b981"
+LAMIN_GREEN_DARKER = "#065f46"
+GREEN_FILL = "honeydew"
+TRANSFORM_EMOJIS = {"notebook": "📔", "app": "🖥️", "pipeline": "🧩"}


 def view_lineage(file: File, with_children: bool = True):
@@ -33,8 +38,8 @@ def view_lineage(file: File, with_children: bool = True):
     u = graphviz.Digraph(
         file.id,
         node_attr={
-            "fillcolor":
-            "color":
+            "fillcolor": GREEN_FILL,
+            "color": LAMIN_GREEN_DARKER,
             "fontname": "Helvetica",
             "fontsize": "10",
         },
@@ -51,7 +56,7 @@ def view_lineage(file: File, with_children: bool = True):
         else:
             style = "rounded,filled"
             shape = "box"
-            fillcolor =
+            fillcolor = GREEN_FILL
         u.node(
             node_id,
             label=node_label,
@@ -66,12 +71,12 @@ def view_lineage(file: File, with_children: bool = True):
         add_node(row["target_record"], row["target"], row["target_label"], u)

         u.edge(row["source"], row["target"], color="dimgrey")
-    # label the searched file
+    # label the searched file
     u.node(
         file.id,
         label=file_label,
         style="rounded,filled",
-        fillcolor=
+        fillcolor=LAMIN_GREEN_LIGHTER,
         shape="box",
     )

@@ -79,7 +84,7 @@ def view_lineage(file: File, with_children: bool = True):


 def view_parents(
-    record:
+    record: Registry, field: str, with_children: bool = False, distance: int = 100
 ):
     """Graph of parents."""
     if not hasattr(record, "parents"):
@@ -105,8 +110,8 @@ def view_parents(
     u = graphviz.Digraph(
         record.id,
         node_attr={
-            "color":
-            "fillcolor":
+            "color": LAMIN_GREEN_DARKER,
+            "fillcolor": GREEN_FILL,
             "shape": "box",
             "style": "rounded,filled",
             "fontname": "Helvetica",
@@ -116,17 +121,17 @@ def view_parents(
     )
     u.node(
         record_label.replace(":", "_"),
-        label=record_label,
-        fillcolor=
+        label=_add_emoji(record, record_label),
+        fillcolor=LAMIN_GREEN_LIGHTER,
     )
     for _, row in df_edges.iterrows():
-        u.node(row["source"], label=row["source_label"])
+        u.node(row["source"], label=_add_emoji(record, row["source_label"]))
         u.edge(row["source"], row["target"], color="dimgrey")

     return u


-def _get_parents(record:
+def _get_parents(record: Registry, field: str, distance: int, children: bool = False):
     """Recursively get parent records within a distance."""
     if children:
         key = "parents"
@@ -152,7 +157,7 @@ def _df_edges_from_parents(record: ORM, field: str, distance: int, children: bool = False)


 def _df_edges_from_parents(
-    record:
+    record: Registry, field: str, distance: int, children: bool = False
 ):
     """Construct a DataFrame of edges as the input of graphviz.Digraph."""
     key = "children" if children else "parents"
@@ -178,6 +183,16 @@ def _df_edges_from_parents(
     return df_edges


+def _add_emoji(record: Registry, label: str):
+    if record.__class__.__name__ == "Transform":
+        emoji = TRANSFORM_EMOJIS.get(record.type, "💫")
+    elif record.__class__.__name__ == "Run":
+        emoji = TRANSFORM_EMOJIS.get(record.transform.type, "💫")
+    else:
+        emoji = ""
+    return f"{emoji} {label}"
+
+
 def _get_all_parent_runs(file: File):
     """Get all input file runs recursively."""
     all_runs = {file.run}
@@ -222,9 +237,9 @@ def _label_file_run(record: Union[File, Run]):
     elif isinstance(record, Run):
         name = f'{record.transform.name.replace("&", "&amp;")}'
         return (
-            rf'<{name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'
+            rf'<{TRANSFORM_EMOJIS.get(str(record.transform.type), "💫")} {name}<BR/><FONT COLOR="GREY" POINT-SIZE="10"'  # noqa
             rf' FACE="Monospace">id={record.id}<BR/>type={record.transform.type},'
-            rf" user={record.created_by.name}<BR/>run_at={
+            rf" user={record.created_by.name}<BR/>run_at={format_field_value(record.run_at)}</FONT>>"  # noqa
         )


@@ -248,3 +263,7 @@ def _df_edges_from_runs(all_runs: List[Run]):
     df["source_label"] = df["source_record"].apply(_label_file_run)
     df["target_label"] = df["target_record"].apply(_label_file_run)
     return df
+
+
+def _transform_emoji(transform: Transform):
+    return TRANSFORM_EMOJIS.get(transform.type, "💫")
lamindb/dev/datasets/__init__.py
CHANGED
@@ -1,17 +1,17 @@
-"""
+"""Test datasets.

 .. autosummary::
    :toctree: .

    file_fcs
    file_fcs_alpert19
+   file_tsv_rnaseq_nfcore_salmon_merged_gene_counts
    file_jpg_paradisi05
    file_tiff_suo22
    file_fastq
    file_bam
    file_mini_csv
    dir_scrnaseq_cellranger
-   generate_cell_ranger_files
    df_iris
    df_iris_in_meter
    df_iris_in_meter_batch1
@@ -19,6 +19,7 @@
    anndata_mouse_sc_lymph_node
    anndata_human_immune_cells
    anndata_pbmc68k_reduced
+   anndata_file_pbmc68k_test
    anndata_pbmc3k_processed
    anndata_with_obs
    anndata_suo22_Visium10X
@@ -29,6 +30,7 @@
 """

 from ._core import (
+    anndata_file_pbmc68k_test,
     anndata_human_immune_cells,
     anndata_mouse_sc_lymph_node,
     anndata_pbmc3k_processed,
@@ -47,7 +49,7 @@ from ._core import (
     file_jpg_paradisi05,
     file_mini_csv,
     file_tiff_suo22,
-
+    file_tsv_rnaseq_nfcore_salmon_merged_gene_counts,
     mudata_papalexi21_subset,
     schmidt22_crispra_gws_IFNG,
     schmidt22_perturbseq,
lamindb/dev/datasets/_core.py
CHANGED
@@ -39,6 +39,18 @@ def file_jpg_paradisi05() -> Path:
     return Path(filepath)


+def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts() -> Path:
+    """Gene counts table from nf-core RNA-seq pipeline.
+
+    Output of: https://nf-co.re/rnaseq
+    """
+    filepath, _ = urlretrieve(
+        "https://lamindb-test.s3.amazonaws.com/salmon.merged.gene_counts.tsv",
+        "salmon.merged.gene_counts.tsv",
+    )
+    return Path(filepath)
+
+
 def file_fastq(in_storage_root=False) -> Path:
     """Mini mock fastq file."""
     basedir = Path(".") if not in_storage_root else settings.storage
@@ -80,21 +92,6 @@ def file_tiff_suo22():
     return Path(filepath)


-def dir_scrnaseq_cellranger(in_storage_root=False) -> Path:
-    """Directory with exemplary scrnaseq cellranger input and output."""
-    filepath, _ = urlretrieve(
-        "https://lamindb-test.s3.amazonaws.com/cellranger_run_001.zip"
-    )
-    from zipfile import ZipFile
-
-    basedir = Path(".") if not in_storage_root else settings.storage
-    with ZipFile(filepath, "r") as zipObj:
-        # Extract all the contents of zip file in current directory
-        zipObj.extractall(path=basedir)
-
-    return basedir / "cellranger_run_001"
-
-
 def anndata_mouse_sc_lymph_node() -> ad.AnnData:
     """Mouse lymph node scRNA-seq dataset from EBI.

@@ -135,6 +132,28 @@ def anndata_pbmc68k_reduced() -> ad.AnnData:
     return ad.read(filepath)


+def anndata_file_pbmc68k_test() -> Path:
+    """Modified from scanpy.datasets.pbmc68k_reduced().
+
+    Additional slots were added for testing purposes. Returns the filepath.
+
+    To reproduce::
+
+        pbmc68k = ln.dev.datasets.anndata_pbmc68k_reduced()
+        pbmc68k_test = pbmc68k[:30, :200].copy()
+        pbmc68k_test.raw = pbmc68k_test[:, :100]
+        pbmc68k_test.obsp["test"] = sparse.eye(pbmc68k_test.shape[0], format="csr")
+        pbmc68k_test.varp["test"] = sparse.eye(pbmc68k_test.shape[1], format="csr")
+        pbmc68k_test.layers["test"] = sparse.csr_matrix(pbmc68k_test.shape)
+        pbmc68k_test.layers["test"][0] = 1.
+        pbmc68k_test.write("pbmc68k_test.h5ad")
+    """
+    filepath, _ = urlretrieve(
+        "https://lamindb-test.s3.amazonaws.com/pbmc68k_test.h5ad", "pbmc68k_test.h5ad"
+    )
+    return Path(filepath)
+
+
 def anndata_pbmc3k_processed() -> ad.AnnData:
     """Modified from scanpy.pbmc3k_processed()."""
     filepath, _ = urlretrieve(
@@ -272,7 +291,7 @@ def df_iris_in_meter_batch2() -> pd.DataFrame:
     return df_iris.iloc[len(df_iris) // 2 :]


-def
+def dir_scrnaseq_cellranger(
     sample_name: str, basedir: Union[str, Path] = "./", output_only: bool = True
 ):
     """Generate mock cell ranger outputs.
@@ -281,7 +300,6 @@ def generate_cell_ranger_files(
         sample_name: name of the sample
         basedir: run directory
         output_only: only generate output files
-
     """
     basedir = Path(basedir)

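
A short usage sketch of the dataset helpers added or renamed above; the sample name is illustrative, and files land wherever `urlretrieve` and the mock generator write them:

```python
import lamindb as ln

# new: nf-core rnaseq gene-count table, returned as a local Path to a .tsv
tsv_path = ln.dev.datasets.file_tsv_rnaseq_nfcore_salmon_merged_gene_counts()

# new: small AnnData test file, returned as a Path to an .h5ad
h5ad_path = ln.dev.datasets.anndata_file_pbmc68k_test()

# renamed from generate_cell_ranger_files: writes mock Cell Ranger outputs for a sample
ln.dev.datasets.dir_scrnaseq_cellranger("sample_001", basedir="./", output_only=True)
```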
lamindb/dev/exc.py
ADDED
lamindb/dev/storage/_backed_access.py
CHANGED
@@ -94,12 +94,23 @@ def read_dataframe(elem: Union[h5py.Dataset, h5py.Group]):
 def safer_read_partial(elem, indices):
     if get_spec(elem).encoding_type == "":
         if isinstance(elem, h5py.Dataset):
-
-
-
-
-
-
+            dims = len(elem.shape)
+            if dims == 2:
+                return elem[indices]
+            elif dims == 1:
+                if indices[0] == slice(None):
+                    return elem[indices[1]]
+                elif indices[1] == slice(None):
+                    return elem[indices[0]]
+        elif isinstance(elem, h5py.Group):
+            try:
+                return SparseDataset(elem)[indices]
+            except Exception:
+                pass
+        raise ValueError(
+            "Can not get a subset of the element of type"
+            f" {type(elem).__name__} with an empty spec."
+        )
     else:
         return read_elem_partial(elem, indices=indices)

@@ -140,6 +151,16 @@ if ZARR_INSTALLED:
     GroupTypes.append(zarr.Group)
     StorageTypes.append(zarr.Group)

+    def _subset_sparse_zarr(elem: zarr.Group, indices):
+        ds = SparseDataset(elem)
+        has_arrays = isinstance(indices[0], np.ndarray) or isinstance(
+            indices[1], np.ndarray
+        )
+        if not has_arrays and indices == (slice(None), slice(None)):
+            return ds.to_memory()
+        else:
+            return ds[indices]
+
     @registry.register_open("zarr")
     def open(filepath: Union[UPath, Path, str]):  # noqa
         fs, file_path_str = infer_filesystem(filepath)
@@ -156,16 +177,31 @@ if ZARR_INSTALLED:

     @registry.register("zarr")
     def safer_read_partial(elem, indices):  # noqa
-
+        encoding_type = get_spec(elem).encoding_type
+        if encoding_type == "":
             if isinstance(elem, zarr.Array):
-
-
-
-
-
-
+                dims = len(elem.shape)
+                if dims == 2:
+                    return elem.oindex[indices]
+                elif dims == 1:
+                    if indices[0] == slice(None):
+                        return elem.oindex[indices[1]]
+                    elif indices[1] == slice(None):
+                        return elem.oindex[indices[0]]
+            elif isinstance(elem, zarr.Group):
+                try:
+                    return _subset_sparse_zarr(elem, indices)
+                except Exception:
+                    pass
+            raise ValueError(
+                "Can not get a subset of the element of type"
+                f" {type(elem).__name__} with an empty spec."
+            )
         else:
-
+            if encoding_type in ("csr_matrix", "csc_matrix"):
+                return _subset_sparse_zarr(elem, indices)
+            else:
+                return read_elem_partial(elem, indices=indices)

     # this is needed because accessing zarr.Group.keys() directly is very slow
     @registry.register("zarr")
@@ -213,8 +249,6 @@ ArrayTypes = tuple(ArrayTypes)  # type: ignore
 GroupTypes = tuple(GroupTypes)  # type: ignore
 StorageTypes = tuple(StorageTypes)  # type: ignore

-ArrayOrSparseTypes = ArrayTypes + (SparseDataset,)  # type: ignore
-

 ArrayType = Union[ArrayTypes]  # type: ignore
 GroupType = Union[GroupTypes]  # type: ignore
@@ -222,8 +256,10 @@ StorageType = Union[StorageTypes]  # type: ignore


 def _to_memory(elem):
-    if isinstance(elem,
+    if isinstance(elem, ArrayTypes):
         return elem[()]
+    elif isinstance(elem, SparseDataset):
+        return elem.to_memory()
     else:
         return elem

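
The h5py and zarr branches above share one indexing convention: a 2-tuple of indices is applied directly to 2-d arrays, while for 1-d arrays only the non-trivial half of the tuple is used. A standalone sketch of that convention (plain h5py, not the lamindb function):

```python
import h5py
import numpy as np

def read_partial(dset: h5py.Dataset, indices):
    # mirrors the dims-based dispatch in safer_read_partial
    dims = len(dset.shape)
    if dims == 2:
        return dset[indices]
    if dims == 1:
        if indices[0] == slice(None):
            return dset[indices[1]]
        if indices[1] == slice(None):
            return dset[indices[0]]
    raise ValueError("unsupported selection")

with h5py.File("demo.h5", "w") as f:
    f["x"] = np.arange(10)
    f["m"] = np.arange(9).reshape(3, 3)
    print(read_partial(f["x"], (slice(None), [1, 3])))       # -> [1 3]
    print(read_partial(f["m"], (slice(0, 2), slice(None))))  # first two rows
```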
lamindb/dev/storage/file.py
CHANGED
@@ -23,6 +23,43 @@ except ImportError:
 AUTO_KEY_PREFIX = ".lamindb/"


+KNOWN_SUFFIXES = {
+    # without readers
+    ".txt",
+    ".tsv",
+    ".pdf",
+    ".fastq",
+    ".tar",
+    ".zip",
+    # with readers (see below)
+    ".h5ad",
+    ".parquet",
+    ".csv",
+    ".fcs",
+    ".zarr",
+    ".zrad",
+}
+
+
+def extract_suffix_from_path(path: Union[UPath, Path]) -> str:
+    # this if-clause is based on https://stackoverflow.com/questions/31890341
+    # the rest conscsiously deviates
+    if len(path.suffixes) <= 2:
+        return "".join(path.suffixes)
+    else:
+        msg = "file has more than two suffixes (path.suffixes), "
+        # first check the 2nd-to-last suffix because it might be followed by .gz
+        # or another compression-related suffix
+        if path.suffixes[-2] in KNOWN_SUFFIXES:
+            suffix = "".join(path.suffixes[-2:])
+            msg += f"inferring:'{suffix}'"
+        else:
+            suffix = path.suffixes[-1]
+            msg += f"using only last suffix: '{suffix}'"
+        logger.warning(msg)
+        return suffix
+
+
 # add type annotations back asap when re-organizing the module
 def auto_storage_key_from_file(file: File):
     if file.key is None:
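
A standalone replica of the suffix-inference rule above (plain `pathlib`, not the lamindb helper), showing why a compressed `.fastq.gz` keeps its double suffix while dot-riddled names fall back to the last suffix only:

```python
from pathlib import Path

KNOWN_SUFFIXES = {".txt", ".tsv", ".pdf", ".fastq", ".tar", ".zip",
                  ".h5ad", ".parquet", ".csv", ".fcs", ".zarr", ".zrad"}

def infer_suffix(path: Path) -> str:
    # up to two suffixes: keep them all, e.g. ".fastq.gz"
    if len(path.suffixes) <= 2:
        return "".join(path.suffixes)
    # more than two: keep the last two only if the 2nd-to-last is a known format
    if path.suffixes[-2] in KNOWN_SUFFIXES:
        return "".join(path.suffixes[-2:])
    return path.suffixes[-1]

print(infer_suffix(Path("sample.fastq.gz")))         # .fastq.gz
print(infer_suffix(Path("meas.raw.fastq.gz")))       # .fastq.gz
print(infer_suffix(Path("run.2023.07.counts.tsv")))  # .tsv
```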
@@ -35,25 +72,11 @@ def attempt_accessing_path(file: File, storage_key: str):
     if file.storage_id == settings.storage.id:
         path = settings.storage.key_to_filepath(storage_key)
     else:
-        logger.warning(
-            "file.path() is slightly slower for files outside default storage"
-        )
+        logger.warning("file.path is slightly slower for files outside default storage")
         storage = Storage.filter(id=file.storage_id).one()
         # find a better way than passing None to instance_settings in the future!
         storage_settings = StorageSettings(storage.root)
         path = storage_settings.key_to_filepath(storage_key)
-    # the following is for backward compat
-    if storage_key.startswith(AUTO_KEY_PREFIX) and not path.exists():
-        logger.warning(
-            "You have auto-keyed files in your storage root, please move them into"
-            f" {AUTO_KEY_PREFIX} within your storage location"
-        )
-        # try legacy_storage_key in root
-        for previous_prefix in ["", "lndb/"]:
-            legacy_storage_key = storage_key.replace(AUTO_KEY_PREFIX, previous_prefix)
-            path = settings.storage.key_to_filepath(legacy_storage_key)
-            if path.exists():
-                return path
     return path


@@ -168,6 +191,11 @@ def read_fcs(*args, **kwargs):
     return readfcs.read(*args, **kwargs)


+def read_tsv(path: Union[str, Path]) -> pd.DataFrame:
+    path_sanitized = Path(path)
+    return pd.read_csv(path_sanitized, sep="\t")
+
+
 def load_to_memory(filepath: Union[str, Path, UPath], stream: bool = False):
     """Load a file into memory.

@@ -189,6 +217,7 @@ def load_to_memory(filepath: Union[str, Path, UPath], stream: bool = False):

 READER_FUNCS = {
     ".csv": pd.read_csv,
+    ".tsv": read_tsv,
     ".h5ad": read_adata_h5ad,
     ".parquet": pd.read_parquet,
     ".fcs": read_fcs,