lamindb 0.64.2__py3-none-any.whl → 0.65.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +4 -4
- lamindb/_artifact.py +2 -13
- lamindb/{_dataset.py → _collection.py} +58 -55
- lamindb/_feature.py +1 -1
- lamindb/_filter.py +2 -2
- lamindb/_parents.py +28 -22
- lamindb/_query_manager.py +2 -2
- lamindb/_registry.py +23 -9
- lamindb/_transform.py +5 -8
- lamindb/dev/__init__.py +11 -3
- lamindb/dev/_data.py +12 -12
- lamindb/dev/_feature_manager.py +44 -22
- lamindb/dev/_label_manager.py +40 -15
- lamindb/dev/{_mapped_dataset.py → _mapped_collection.py} +104 -32
- lamindb/dev/_run_context.py +34 -35
- lamindb/dev/_track_environment.py +18 -0
- lamindb/dev/datasets/__init__.py +1 -1
- lamindb/dev/datasets/_core.py +12 -12
- lamindb/dev/storage/_backed_access.py +4 -1
- lamindb/dev/storage/_zarr.py +4 -1
- lamindb/dev/versioning.py +16 -23
- {lamindb-0.64.2.dist-info → lamindb-0.65.1.dist-info}/METADATA +7 -6
- lamindb-0.65.1.dist-info/RECORD +49 -0
- lamindb-0.64.2.dist-info/RECORD +0 -48
- {lamindb-0.64.2.dist-info → lamindb-0.65.1.dist-info}/LICENSE +0 -0
- {lamindb-0.64.2.dist-info → lamindb-0.65.1.dist-info}/WHEEL +0 -0
lamindb/dev/{_mapped_dataset.py → _mapped_collection.py}
RENAMED
@@ -40,12 +40,12 @@ class _Connect:
         self.conn.close()


-class MappedDataset:
-    """Map-style dataset for use in data loaders.
+class MappedCollection:
+    """Map-style collection for use in data loaders.

     This currently only works for collections of `AnnData` objects.

-    For an example, see :meth:`~lamindb.Dataset.mapped`.
+    For an example, see :meth:`~lamindb.Collection.mapped`.

     .. note::
@@ -57,10 +57,14 @@ class MappedDataset:
         self,
         path_list: List[Union[str, PathLike]],
         label_keys: Optional[Union[str, List[str]]] = None,
-        join_vars: Optional[Literal["auto", "inner"]] = "auto",
+        join_vars: Optional[Literal["auto", "inner", "outer"]] = "auto",
         encode_labels: bool = True,
+        cache_categories: bool = True,
         parallel: bool = False,
+        dtype: Optional[str] = None,
     ):
+        assert join_vars in {None, "auto", "inner", "outer"}
+
         self.storages = []  # type: ignore
         self.conns = []  # type: ignore
         self.parallel = parallel
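With the rename in place, the constructor now covers outer joins of variables, optional category caching, and on-the-fly dtype conversion. A minimal usage sketch, assuming `MappedCollection` is importable from `lamindb.dev`; paths and label key are hypothetical::

    from lamindb.dev import MappedCollection

    mc = MappedCollection(
        path_list=["adata1.h5ad", "adata2.h5ad"],  # hypothetical paths
        label_keys="cell_type",                    # hypothetical obs column
        join_vars="outer",      # union of variables, missing values zero-filled
        cache_categories=True,  # read categorical labels once, up front
        dtype="float32",        # cast each retrieved data row
    )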
@@ -86,8 +90,15 @@ class MappedDataset:

         self.encode_labels = encode_labels
         self.label_keys = [label_keys] if isinstance(label_keys, str) else label_keys
-        if self.label_keys is not None and self.encode_labels:
-            self._make_encoders(self.label_keys)
+        if self.label_keys is not None:
+            if cache_categories:
+                self._cache_categories(self.label_keys)
+            else:
+                self._cache_cats: dict = {}
+            if self.encode_labels:
+                self._make_encoders(self.label_keys)
+
+        self._dtype = dtype

         self._closed = False
@@ -104,6 +115,18 @@ class MappedDataset:
             self.conns.append(conn)
             self.storages.append(storage)

+    def _cache_categories(self, label_keys: list):
+        self._cache_cats = {}
+        decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
+        for label in label_keys:
+            self._cache_cats[label] = []
+            for storage in self.storages:
+                with _Connect(storage) as store:
+                    cats = self.get_categories(store, label)
+                    if cats is not None:
+                        cats = decode(cats) if isinstance(cats[0], bytes) else cats[...]
+                    self._cache_cats[label].append(cats)
+
     def _make_encoders(self, label_keys: list):
         self.encoders = []
         for label in label_keys:
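The `np.frompyfunc` call above vectorizes byte-to-string decoding over a stored category array, while `cats[...]` forces a full read of the backed array into memory. The decoding step in isolation::

    import numpy as np

    decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
    raw = np.array([b"B cell", b"T cell"])
    decode(raw)  # object-dtype array(['B cell', 'T cell'])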
@@ -115,20 +138,31 @@ class MappedDataset:
         for storage in self.storages:
             with _Connect(storage) as store:
                 var_list.append(_safer_read_index(store["var"]))
+
+        self.var_joint = None
         if self.join_vars == "auto":
             vars_eq = all(var_list[0].equals(vrs) for vrs in var_list[1:])
             if vars_eq:
                 self.join_vars = None
                 return
             else:
-                self.join_vars = "inner"
+                self.var_joint = reduce(pd.Index.intersection, var_list)
+                if len(self.var_joint) > 0:
+                    self.join_vars = "inner"
+                else:
+                    self.join_vars = "outer"
+
         if self.join_vars == "inner":
-            self.var_joint = reduce(pd.Index.intersection, var_list)
-            if len(self.var_joint) == 0:
-                raise ValueError(
-                    "The provided AnnData objects don't have shared varibales."
-                )
+            if self.var_joint is None:
+                self.var_joint = reduce(pd.Index.intersection, var_list)
+            if len(self.var_joint) == 0:
+                raise ValueError(
+                    "The provided AnnData objects don't have shared varibales."
+                )
             self.var_indices = [vrs.get_indexer(self.var_joint) for vrs in var_list]
+        elif self.join_vars == "outer":
+            self.var_joint = reduce(pd.Index.union, var_list)
+            self.var_indices = [self.var_joint.get_indexer(vrs) for vrs in var_list]

     def __len__(self):
         return self.n_obs
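The join logic reduces to pandas index set operations: `auto` resolves to no join when all files share an identical var index, to `inner` when the intersection is non-empty, and to `outer` otherwise. In isolation::

    from functools import reduce
    import pandas as pd

    var_list = [pd.Index(["a", "b", "c"]), pd.Index(["b", "c", "d"])]
    inner = reduce(pd.Index.intersection, var_list)  # Index(['b', 'c'])
    outer = reduce(pd.Index.union, var_list)         # Index(['a', 'b', 'c', 'd'])
    # positions of the joint variables within each file's var index
    indices = [vrs.get_indexer(inner) for vrs in var_list]  # [[1, 2], [0, 1]]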
@@ -137,15 +171,21 @@ class MappedDataset:
         obs_idx = self.indices[idx]
         storage_idx = self.storage_idx[idx]
         if self.var_indices is not None:
-            var_idxs = self.var_indices[storage_idx]
+            var_idxs_join = self.var_indices[storage_idx]
         else:
-            var_idxs = None
+            var_idxs_join = None

         with _Connect(self.storages[storage_idx]) as store:
-            out = [self.get_data_idx(store, obs_idx, var_idxs)]
+            out = [self.get_data_idx(store, obs_idx, var_idxs_join)]
             if self.label_keys is not None:
                 for i, label in enumerate(self.label_keys):
-                    label_idx = self.get_label_idx(store, obs_idx, label)
+                    if label in self._cache_cats:
+                        cats = self._cache_cats[label][storage_idx]
+                        if cats is None:
+                            cats = []
+                    else:
+                        cats = None
+                    label_idx = self.get_label_idx(store, obs_idx, label, cats)
                     if self.encode_labels:
                         label_idx = self.encoders[i][label_idx]
                     out.append(label_idx)
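Because `__getitem__` returns the data row followed by one (optionally encoded) label per key, and `__len__` is defined, the collection satisfies the map-style dataset protocol of PyTorch. A hedged sketch, reusing `mc` from the constructor example above::

    from torch.utils.data import DataLoader

    loader = DataLoader(mc, batch_size=128, shuffle=True, num_workers=2)
    for batch in loader:
        x, cell_type = batch  # data batch plus one tensor per label key
        ...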
@@ -155,26 +195,50 @@ class MappedDataset:
         self,
         storage: StorageType,  # type: ignore
         idx: int,
-        var_idxs: Optional[list] = None,
+        var_idxs_join: Optional[list] = None,
         layer_key: Optional[str] = None,
     ):
         """Get the index for the data."""
         layer = storage["X"] if layer_key is None else storage["layers"][layer_key]  # type: ignore
         if isinstance(layer, ArrayTypes):  # type: ignore
-            layer_idx = layer[idx]
-            return layer_idx if var_idxs is None else layer_idx[var_idxs]
+            layer_idx = layer[idx]
+            if self.join_vars is None:
+                result = layer_idx
+                if self._dtype is not None:
+                    result = result.astype(self._dtype, copy=False)
+            elif self.join_vars == "outer":
+                dtype = layer_idx.dtype if self._dtype is None else self._dtype
+                result = np.zeros(len(self.var_joint), dtype=dtype)
+                result[var_idxs_join] = layer_idx
+            else:  # inner join
+                result = layer_idx[var_idxs_join]
+                if self._dtype is not None:
+                    result = result.astype(self._dtype, copy=False)
+            return result
         else:  # assume csr_matrix here
             data = layer["data"]
             indices = layer["indices"]
             indptr = layer["indptr"]
             s = slice(*(indptr[idx : idx + 2]))
-            layer_idx = np.zeros(layer.attrs["shape"][1])
-            layer_idx[indices[s]] = data[s]
-            if var_idxs is not None:
-                layer_idx = layer_idx[var_idxs]
-            return layer_idx
+            data_s = data[s]
+            dtype = data_s.dtype if self._dtype is None else self._dtype
+            if self.join_vars == "outer":
+                layer_idx = np.zeros(len(self.var_joint), dtype=dtype)
+                layer_idx[var_idxs_join[indices[s]]] = data_s
+            else:
+                layer_idx = np.zeros(layer.attrs["shape"][1], dtype=dtype)
+                layer_idx[indices[s]] = data_s
+                if self.join_vars == "inner":
+                    layer_idx = layer_idx[var_idxs_join]
+            return layer_idx

-    def get_label_idx(self, storage: StorageType, idx: int, label_key: str):
+    def get_label_idx(
+        self,
+        storage: StorageType,
+        idx: int,
+        label_key: str,
+        categories: Optional[list] = None,
+    ):
         """Get the index for the label by key."""
         obs = storage["obs"]  # type: ignore
         # how backwards compatible do we want to be here actually?
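In the sparse branch, a single backed CSR row is rebuilt from the raw `data`/`indices`/`indptr` buffers and scattered into a dense vector; for an outer join the scatter targets positions in the joint var space instead. The core move on a toy 2 x 4 matrix::

    import numpy as np

    # CSR buffers for a 2 x 4 matrix; row 0 is [0, 5, 0, 7]
    data = np.array([5.0, 7.0, 3.0])
    indices = np.array([1, 3, 0])
    indptr = np.array([0, 2, 3])

    idx = 0
    s = slice(*indptr[idx : idx + 2])  # slice(0, 2)
    row = np.zeros(4, dtype=data.dtype)
    row[indices[s]] = data[s]          # array([0., 5., 0., 7.])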
@@ -186,9 +250,11 @@ class MappedDataset:
             label = labels[idx]
         else:
             label = labels["codes"][idx]
-
-        cats = self.get_categories(storage, label_key)
-        if cats is not None:
+        if categories is not None:
+            cats = categories
+        else:
+            cats = self.get_categories(storage, label_key)
+        if cats is not None and len(cats) > 0:
             label = cats[label]
         if isinstance(label, bytes):
             label = label.decode("utf-8")
@@ -215,11 +281,14 @@ class MappedDataset:
         """Get merged labels."""
         labels_merge = []
        decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
-        for storage in self.storages:
+        for i, storage in enumerate(self.storages):
             with _Connect(storage) as store:
                 codes = self.get_codes(store, label_key)
                 labels = decode(codes) if isinstance(codes[0], bytes) else codes
-                cats = self.get_categories(store, label_key)
+                if label_key in self._cache_cats:
+                    cats = self._cache_cats[label_key][i]
+                else:
+                    cats = self.get_categories(store, label_key)
                 if cats is not None:
                     cats = decode(cats) if isinstance(cats[0], bytes) else cats
                     labels = cats[labels]
@@ -230,9 +299,12 @@ class MappedDataset:
         """Get merged categories."""
         cats_merge = set()
         decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
-        for storage in self.storages:
+        for i, storage in enumerate(self.storages):
             with _Connect(storage) as store:
-                cats = self.get_categories(store, label_key)
+                if label_key in self._cache_cats:
+                    cats = self._cache_cats[label_key][i]
+                else:
+                    cats = self.get_categories(store, label_key)
                 if cats is not None:
                     cats = decode(cats) if isinstance(cats[0], bytes) else cats
                     cats_merge.update(cats)
lamindb/dev/_run_context.py
CHANGED
@@ -13,7 +13,7 @@ from lnschema_core import Run, Transform, ids
 from lnschema_core.types import TransformType
 from lnschema_core.users import current_user_id

-from lamindb.dev.versioning import get_ids_from_old_version
+from lamindb.dev.versioning import get_uid_from_old_version

 from .hashing import to_b64_str

@@ -33,7 +33,9 @@ msg_manual_init = (
 )


-class UpdateNbWithNonInteractiveEditorError(Exception):
+# we don't want a real error here, as this is so frequent
+# in VSCode
+class UpdateNbWithNonInteractiveEditor(SystemExit):
     pass

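Deriving from `SystemExit` means an uncaught raise prints only the message and terminates the process without a traceback, which suits a prompt that merely asks the user to re-initialize. A tiny standalone illustration with a hypothetical subclass::

    class QuietExit(SystemExit):  # hypothetical stand-in for the class above
        pass

    raise QuietExit("please re-run initialization")  # message only, no traceback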
@@ -82,21 +84,21 @@ def update_notebook_metadata(
     from nbproject._header import _filepath

     notebook = nb_dev.read_notebook(_filepath)
-    uid_prefix = notebook.metadata["nbproject"]["id"]
+    stem_uid = notebook.metadata["nbproject"]["id"]
     version = notebook.metadata["nbproject"]["version"]

-    updated, new_uid_prefix, new_version = update_transform_source_metadata(
+    updated, new_stem_uid, new_version = update_transform_source_metadata(
         notebook, _filepath, bump_version=bump_version, run_from_cli=False
     )

     if version != new_version:
         notebook.metadata["nbproject"]["version"] = new_version
-        new_uid, _, _ = get_ids_from_old_version(
-            is_new_version_of=transform, version=new_version, n_full_id=14
+        new_uid, _ = get_uid_from_old_version(
+            is_new_version_of=transform, version=new_version, n_full_id=16
         )
     else:
-        notebook.metadata["nbproject"]["id"] = new_uid_prefix
-        new_uid = new_uid_prefix + ids.base62(n_char=2)
+        notebook.metadata["nbproject"]["id"] = stem_uid
+        new_uid = new_stem_uid + ids.base62(n_char=4)

     # here we check that responses to both inputs (for new id and version) were not 'n'
     if updated:
@@ -123,20 +125,22 @@ def get_notebook_name_colab() -> str:
     return name.rstrip(".ipynb")


-def get_transform_kwargs_from_uid_prefix(
+def get_transform_kwargs_from_stem_uid(
     nbproject_id: str,
     nbproject_version: str,
-) -> Tuple[Optional[Transform], str, str, Optional[Transform]]:
-    uid = nbproject_id + ids.base62(n_char=2)
-    old_version_of = None
-    version = nbproject_version
+) -> Tuple[Optional[Transform], str, str]:
+    from lamin_utils._base62 import encodebytes
+
+    # merely zero-padding the nbproject version such that the base62 encoding is at
+    # least 4 characters long doesn't yield sufficiently diverse hashes within 4 characters
+    # it'd be nice because the uid_ext would be ordered, but it leads to collisions
+    uid_ext = encodebytes(hashlib.md5(nbproject_version.encode()).digest())[:4]
+    new_uid = nbproject_id + uid_ext
+    assert len(new_uid) == 16
     transform = Transform.filter(
-        uid__startswith=nbproject_id, version=version
+        uid__startswith=nbproject_id, version=nbproject_version
     ).one_or_none()
-
-    if transform is None:
-        old_version_of = Transform.filter(uid__startswith=nbproject_id).first()
-    return transform, uid, version, old_version_of
+    return transform, new_uid, nbproject_version


 class run_context:
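The 4-character `uid_ext` is thus a pure function of the version string: the same notebook version always hashes to the same suffix. A self-contained approximation, with a hypothetical `base62` helper standing in for `lamin_utils._base62.encodebytes`::

    import hashlib
    import string

    ALPHABET = string.digits + string.ascii_letters  # a base62 alphabet

    def base62(n: int) -> str:
        out = ""
        while n:
            n, r = divmod(n, 62)
            out = ALPHABET[r] + out
        return out or "0"

    version = "1"  # a notebook version string
    digest = hashlib.md5(version.encode()).digest()
    uid_ext = base62(int.from_bytes(digest, "big"))[:4]  # deterministic per version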
@@ -228,7 +232,7 @@ class run_context:
                     "it looks like you are running ln.track() from a "
                     "notebook!\nplease install nbproject: pip install nbproject"
                 )
-            elif isinstance(e, UpdateNbWithNonInteractiveEditorError):
+            elif isinstance(e, UpdateNbWithNonInteractiveEditor):
                 raise e
             elif isinstance(e, (NotebookNotSavedError, NoTitleError)):
                 raise e
@@ -245,7 +249,7 @@ class run_context:
                 is_tracked = False
         else:
             name = Path(module.__file__).stem  # type: ignore
-            if not hasattr(module, "__lamindb_uid_prefix__"):
+            if not hasattr(module, "__transform_stem_uid__"):
                 raise RuntimeError(
                     "no automated tracking because no uid attached to script!\n"
                     f"please run: lamin track {module.__file__}\n"
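For a script to be auto-tracked it must carry the stem uid and a version at module level, which `lamin track` writes. A hedged sketch of such a header (the uid value is made up)::

    # my_script.py -- hypothetical example
    __transform_stem_uid__ = "FPnfDtJz8qbE"  # 12-char stem uid, made up
    __version__ = "1"

    import lamindb as ln

    ln.track()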
@@ -254,9 +258,8 @@ class run_context:
                 transform,
                 uid,
                 version,
-                old_version_of,
-            ) = get_transform_kwargs_from_uid_prefix(
-                module.__lamindb_uid_prefix__,
+            ) = get_transform_kwargs_from_stem_uid(
+                module.__transform_stem_uid__,
                 module.__version__,  # type: ignore
             )
             short_name = Path(module.__file__).name  # type: ignore
@@ -265,7 +268,6 @@ class run_context:
                 version=version,
                 name=name,
                 reference=reference,
-                is_new_version_of=old_version_of,
                 transform_type=TransformType.pipeline,
                 short_name=short_name,
                 is_interactive=False,
@@ -321,6 +323,10 @@ class run_context:
             logger.important(f"saved: {run}")
         cls.run = run

+        from ._track_environment import track_environment
+
+        track_environment(run)
+
         # at this point, we have a transform can display its parents if there are any
         parents = cls.transform.parents.all() if cls.transform is not None else []
         if len(parents) > 0:
@@ -431,7 +437,7 @@ class run_context:
             cls._notebook_meta = metadata  # type: ignore
         else:
             msg = msg_manual_init.format(notebook_path=notebook_path_str)
-            raise UpdateNbWithNonInteractiveEditorError(msg)
+            raise UpdateNbWithNonInteractiveEditor(msg)

         if _env in ("lab", "notebook"):
             # save the notebook in case that title was updated
@@ -446,7 +452,7 @@ class run_context:
         is_interactive = _seconds_modified(_filepath) < 1.5  # should be ~1 sec
         if not is_interactive and needs_init:
             msg = msg_manual_init.format(notebook_path=_filepath)
-            raise UpdateNbWithNonInteractiveEditorError(msg)
+            raise UpdateNbWithNonInteractiveEditor(msg)

         nbproject_id = metadata["id"]
         nbproject_version = metadata["version"]
@@ -468,7 +474,6 @@ class run_context:
             transform = Transform.filter(uid=uid).one_or_none()
             name = filestem
             short_name = None
-            old_version_of = None
             # nbproject parsing successful
         elif nbproject_id is not None:
             name = nbproject_title
@@ -476,21 +481,18 @@ class run_context:
                 transform,
                 uid,
                 version,
-                old_version_of,
-            ) = get_transform_kwargs_from_uid_prefix(nbproject_id, nbproject_version)
+            ) = get_transform_kwargs_from_stem_uid(nbproject_id, nbproject_version)
             short_name = filestem
         cls._create_or_load_transform(
             uid=uid,
             version=version,
             name=name,
             reference=reference,
-            is_new_version_of=old_version_of,
             transform_type=TransformType.notebook,
             short_name=short_name,
             is_interactive=is_interactive,
             filepath=notebook_path,
             transform=transform,
-            metadata=metadata,
         )

     @classmethod
@@ -509,7 +511,7 @@ class run_context:
                 cls._notebook_meta = metadata  # type: ignore
             else:
                 msg = msg_manual_init.format(notebook_path=filepath)
-                raise UpdateNbWithNonInteractiveEditorError(msg)
+                raise UpdateNbWithNonInteractiveEditor(msg)
         else:
             from lamin_cli._transform import update_transform_source_metadata

@@ -533,13 +535,11 @@ class run_context:
         version: Optional[str],
         name: str,
         reference: Optional[str],
-        is_new_version_of: Optional[Transform],
         short_name: Optional[str],
         transform_type: TransformType,
         is_interactive: bool,
         filepath: str,
         transform: Optional[Transform] = None,
-        metadata: Optional[Dict] = None,
     ) -> bool:
         # make a new transform record
         if transform is None:
@@ -549,7 +549,6 @@ class run_context:
             name=name,
             short_name=short_name,
             reference=reference,
-            is_new_version_of=is_new_version_of,
             type=transform_type,
         )
         transform.save()
lamindb/dev/_track_environment.py
ADDED
@@ -0,0 +1,18 @@
+import subprocess
+
+import lamindb_setup as ln_setup
+from lamin_utils import logger
+from lnschema_core.models import Run
+
+
+def track_environment(run: Run) -> None:
+    filepath = ln_setup.settings.storage.cache_dir / f"run_env_pip_{run.uid}.txt"
+    # create a requirements.txt
+    # we don't create a conda environment.yml mostly for its slowness
+    try:
+        result = subprocess.run(f"pip freeze > {str(filepath)}", shell=True)
+    except OSError as e:
+        result = None
+        logger.warning(f"could not run pip freeze with error {e}")
+    if result is not None and result.returncode == 0:
+        logger.info(f"tracked pip freeze > {str(filepath)}")
lamindb/dev/datasets/__init__.py
CHANGED
lamindb/dev/datasets/_core.py
CHANGED
@@ -144,7 +144,7 @@ def dir_iris_images() -> UPath:  # pragma: no cover
 def anndata_mouse_sc_lymph_node(
     populate_registries: bool = False,
 ) -> ad.AnnData:  # pragma: no cover
-    """Mouse lymph node scRNA-seq dataset from EBI.
+    """Mouse lymph node scRNA-seq collection from EBI.

     Subsampled to 10k genes.

@@ -226,11 +226,11 @@ def anndata_mouse_sc_lymph_node(


 def anndata_pbmc68k_reduced() -> ad.AnnData:
-    """Modified from scanpy.datasets.pbmc68k_reduced().
+    """Modified from scanpy.collections.pbmc68k_reduced().

     This code was run::

-        pbmc68k = sc.datasets.pbmc68k_reduced()
+        pbmc68k = sc.collections.pbmc68k_reduced()
         pbmc68k.obs.rename(columns={"bulk_labels": "cell_type"}, inplace=True)
         pbmc68k.obs["cell_type"] = pbmc68k.obs["cell_type"].cat.rename_categories(
             {"Dendritic": "Dendritic cells", "CD14+ Monocyte": "CD14+ Monocytes"}
|
|
254
254
|
|
255
255
|
|
256
256
|
def anndata_file_pbmc68k_test() -> Path:
|
257
|
-
"""Modified from scanpy.
|
257
|
+
"""Modified from scanpy.collections.pbmc68k_reduced().
|
258
258
|
|
259
259
|
Additional slots were added for testing purposes. Returns the filepath.
|
260
260
|
|
@@ -291,7 +291,7 @@ def anndata_human_immune_cells(
     """Cross-tissue immune cell analysis reveals tissue-specific features in humans.

     From: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3  # noqa
-    Dataset: Global
+    Collection: Global

     To reproduce the subsample::

@@ -404,18 +404,18 @@ def mudata_papalexi21_subset():  # pragma: no cover


 def df_iris() -> pd.DataFrame:
-    """The iris dataset as in sklearn.
+    """The iris collection as in sklearn.

     Original code::

-        sklearn.datasets.load_iris(as_frame=True).frame
+        sklearn.collections.load_iris(as_frame=True).frame
     """
     filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/iris.parquet")
     return pd.read_parquet(filepath)


 def df_iris_in_meter() -> pd.DataFrame:
-    """The iris dataset with lengths in meter."""
+    """The iris collection with lengths in meter."""
     df = df_iris()
     # rename columns
     df.rename(
@@ -436,13 +436,13 @@ def df_iris_in_meter() -> pd.DataFrame:


 def df_iris_in_meter_study1() -> pd.DataFrame:
-    """The iris dataset with lengths in meter."""
+    """The iris collection with lengths in meter."""
     df_iris = df_iris_in_meter()
     return df_iris.iloc[: len(df_iris) // 2]


 def df_iris_in_meter_study2() -> pd.DataFrame:
-    """The iris dataset with lengths in meter."""
+    """The iris collection with lengths in meter."""
     df_iris = df_iris_in_meter()
     return df_iris.iloc[len(df_iris) // 2 :]

@@ -500,7 +500,7 @@ def dir_scrnaseq_cellranger(


 def schmidt22_crispra_gws_IFNG(basedir=".") -> Path:  # pragma: no cover
-    """CRISPRi screen dataset of Schmidt22.
+    """CRISPRi screen collection of Schmidt22.

     Originally from: https://zenodo.org/record/5784651
     """
@@ -512,7 +512,7 @@ def schmidt22_crispra_gws_IFNG(basedir=".") -> Path:  # pragma: no cover


 def schmidt22_perturbseq(basedir=".") -> Path:  # pragma: no cover
-    """Perturb-seq dataset of Schmidt22.
+    """Perturb-seq collection of Schmidt22.

     Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651

lamindb/dev/storage/_backed_access.py
CHANGED
@@ -48,7 +48,10 @@ else:
     from anndata._core.sparse_dataset import (
         BaseCompressedSparseDataset as SparseDataset,
     )
-    from anndata._core.sparse_dataset import sparse_dataset  # type: ignore
+    from anndata._core.sparse_dataset import (  # type: ignore
+        CSRDataset,
+        sparse_dataset,
+    )

     def _check_group_format(*args):
         pass
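This follows the usual shim pattern for anndata's moving internals: recent anndata exposes `BaseCompressedSparseDataset`, `CSRDataset`, and `sparse_dataset`, while older releases expose a `SparseDataset` class directly. A generic, hedged sketch of such a shim (not the exact code in this module)::

    try:  # anndata >= 0.10
        from anndata._core.sparse_dataset import (
            BaseCompressedSparseDataset as SparseDataset,
        )
        from anndata._core.sparse_dataset import sparse_dataset
    except ImportError:  # older anndata
        from anndata._core.sparse_dataset import SparseDataset

        def sparse_dataset(group):
            # older releases construct the backed sparse matrix directly
            return SparseDataset(group)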
lamindb/dev/storage/_zarr.py
CHANGED
@@ -72,7 +72,10 @@ def write_adata_zarr(

     if chunks is not None and not isinstance(adata.X, sparse.spmatrix):
         _write_elem_cb(
-            f, "X", adata.X, dataset_kwargs=dict(chunks=chunks, **dataset_kwargs)
+            f,
+            "X",
+            adata.X,
+            dataset_kwargs=dict(chunks=chunks, **dataset_kwargs),
         )
     else:
         _write_elem_cb(f, "X", adata.X, dataset_kwargs=dataset_kwargs)
lamindb/dev/versioning.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Optional, Tuple, Union

 from lnschema_core import ids
-from lnschema_core.models import Artifact, Transform
+from lnschema_core.models import IsVersioned


 def set_version(version: Optional[str] = None, previous_version: Optional[str] = None):
@@ -32,33 +32,29 @@ def init_uid(
     *,
     version: Optional[str] = None,
     n_full_id: int = 20,
+    is_new_version_of: Optional[IsVersioned] = None,
 ) -> str:
-    if n_full_id == 20:
-        stem_uid = ids.base62_16()
-    elif n_full_id == 14:
-        stem_uid = ids.base62_12()
+    if is_new_version_of is not None:
+        stem_uid = is_new_version_of.stem_uid
+    else:
+        if n_full_id == 20:
+            stem_uid = ids.base62_16()
+        elif n_full_id == 16:
+            stem_uid = ids.base62_12()
     if version is not None:
         if not isinstance(version, str):
             raise ValueError(
                 "`version` parameter must be `None` or `str`, e.g., '0.1', '1', '2',"
                 " etc."
             )
-    return stem_uid + ids.base62_4() if n_full_id == 20 else stem_uid + ids.base62_2()
-

-def get_initial_version_id(is_new_version_of: Union[Artifact, Transform]):
-    if is_new_version_of.initial_version_id is None:
-        initial_version_id = is_new_version_of.id
-    else:
-        initial_version_id = is_new_version_of.initial_version_id
-    return initial_version_id
+    return stem_uid + ids.base62_4()


-def get_ids_from_old_version(
-    is_new_version_of: Union[Artifact, Transform],
+def get_uid_from_old_version(
+    is_new_version_of: IsVersioned,
     version: Optional[str],
     n_full_id: int = 20,
-) -> Tuple[str, int, str]:
+) -> Tuple[str, str]:
     """{}."""
     msg = ""
     if is_new_version_of.version is None:
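A full uid is thus a stem that is shared across versions plus a fresh 4-character suffix per version. Illustrated with the id generators referenced above::

    from lnschema_core import ids

    stem = ids.base62_12()          # 12-char stem of a 16-char full uid
    uid_v1 = stem + ids.base62_4()  # first version
    uid_v2 = stem + ids.base62_4()  # later version: same stem, new suffix
    assert uid_v1[:12] == uid_v2[:12]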
@@ -67,18 +63,15 @@ def get_ids_from_old_version(
     else:
         previous_version = is_new_version_of.version
         version = set_version(version, previous_version)
-    initial_version_id = get_initial_version_id(is_new_version_of)
     new_uid = init_uid(
         version=version,
         n_full_id=n_full_id,
+        is_new_version_of=is_new_version_of,
     )
     # the following covers the edge case where the old file was unversioned
     if is_new_version_of.version is None:
         is_new_version_of.version = previous_version
         is_new_version_of.save()
     if msg != "":
-        msg += (
-            f"& new version to '{version}' (initial version id"
-            f" '{initial_version_id}')"
-        )
-    return new_uid, initial_version_id, version  # type: ignore
+        msg += f"& new version to '{version}'"
+    return new_uid, version