lamindb 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +4 -2
- lamindb/_artifact.py +54 -36
- lamindb/_collection.py +1 -1
- lamindb/_feature.py +1 -1
- lamindb/_finish.py +9 -1
- lamindb/_query_set.py +24 -6
- lamindb/_record.py +4 -5
- lamindb/_save.py +9 -1
- lamindb/_tracked.py +25 -2
- lamindb/base/users.py +1 -4
- lamindb/core/_context.py +7 -2
- lamindb/core/_mapped_collection.py +4 -2
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/_small.py +3 -3
- lamindb/core/loaders.py +15 -3
- lamindb/core/storage/_anndata_accessor.py +8 -3
- lamindb/core/storage/_backed_access.py +10 -5
- lamindb/core/storage/_pyarrow_dataset.py +24 -9
- lamindb/core/storage/paths.py +12 -12
- lamindb/curators/__init__.py +77 -65
- lamindb/models.py +58 -18
- {lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/METADATA +2 -2
- {lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/RECORD +25 -25
- {lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
- {lamindb-1.1.0.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
@@ -1,12 +1,13 @@
 """A data framework for biology.
 
-Tracking notebooks &
+Tracking notebooks, scripts & functions.
 
 .. autosummary::
    :toctree: .
 
    track
    finish
+   tracked
 
 Registries.
 
@@ -57,11 +58,12 @@ Backward compatibility.
    :toctree: .
 
    FeatureSet
+   Curator
 
 """
 
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.1.0"
+__version__ = "1.1.1"
 
 from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
 from lamindb_setup._check_setup import _check_instance_setup
lamindb/_artifact.py
CHANGED
@@ -152,6 +152,7 @@ def process_data(
     default_storage: Storage,
     using_key: str | None,
     skip_existence_check: bool = False,
+    is_replace: bool = False,
 ) -> tuple[Any, Path | UPath, str, Storage, bool]:
     """Serialize a data object that's provided as file or in memory."""
     # if not overwritten, data gets stored in default storage
@@ -161,14 +162,24 @@ def process_data(
         data_types = (pd.DataFrame, AnnData, MuData)
     else:
         data_types = (pd.DataFrame, AnnData)  # type:ignore
-
+    if key is not None:
+        key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
+        # use suffix as the (adata) format if the format is not provided
+        if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
+            format = key_suffix[1:]
+    else:
+        key_suffix = None
     if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
         access_token = (
             default_storage._access_token
             if hasattr(default_storage, "_access_token")
            else None
         )
-        path = create_path(data, access_token=access_token)
+        path = create_path(data, access_token=access_token)
+        # we don't resolve http links because they can resolve into a different domain
+        # for example into a temporary url
+        if path.protocol not in {"http", "https"}:
+            path = path.resolve()
         storage, use_existing_storage_key = process_pathlike(
             path,
             default_storage=default_storage,
@@ -180,30 +191,23 @@ def process_data(
     elif isinstance(data, data_types):
         storage = default_storage
         memory_rep = data
-        if key is not None:
-            key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
-            # use suffix as the (adata) format if the format is not provided
-            if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
-                format = key_suffix[1:]
-        else:
-            key_suffix = None
         suffix = infer_suffix(data, format)
-        if key_suffix is not None and key_suffix != suffix:
-            raise InvalidArgument(
-                f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
-                f" be '{suffix}'."
-            )
-        cache_name = f"{provisional_uid}{suffix}"
-        path = settings.cache_dir / cache_name
-        # Alex: I don't understand the line below
-        if path.suffixes == []:
-            path = path.with_suffix(suffix)
-        write_to_disk(data, path)
-        use_existing_storage_key = False
     else:
         raise NotImplementedError(
             f"Do not know how to create a artifact object from {data}, pass a path instead!"
         )
+    if key_suffix is not None and key_suffix != suffix and not is_replace:
+        # consciously omitting a trailing period
+        if isinstance(data, (str, Path, UPath)):
+            message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
+        else:
+            message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
+        raise InvalidArgument(message)
+    # in case we have an in-memory representation, we need to write it to disk
+    if isinstance(data, data_types):
+        path = settings.cache_dir / f"{provisional_uid}{suffix}"
+        write_to_disk(data, path)
+        use_existing_storage_key = False
     return memory_rep, path, suffix, storage, use_existing_storage_key
 
 
@@ -321,6 +325,7 @@ def get_artifact_kwargs_from_data(
         default_storage,
         using_key,
         skip_check_exists,
+        is_replace=is_replace,
     )
     stat_or_artifact = get_stat_or_artifact(
         path=path,
@@ -453,7 +458,7 @@ def data_is_anndata(data: AnnData | UPathStr) -> bool:
         return True
     if isinstance(data, (str, Path, UPath)):
         data_path = UPath(data)
-        if data_path.
+        if ".h5ad" in data_path.suffixes:  # ".h5ad.gz" is a valid suffix
            return True
        elif data_path.suffix == ".zarr":
            # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
@@ -689,6 +694,7 @@ def from_df(
        kind="dataset",
        **kwargs,
    )
+    artifact.n_observations = len(df)
    return artifact
 
 
@@ -973,7 +979,7 @@ inconsistent_state_msg = (
 
 # docstring handled through attach_func_to_class_method
 def open(
-    self, mode: str = "r", is_run_input: bool | None = None
+    self, mode: str = "r", is_run_input: bool | None = None, **kwargs
 ) -> (
     AnnDataAccessor
     | BackedAccessor
@@ -984,16 +990,23 @@ def open(
 ):
     if self._overwrite_versions and not self.is_latest:
         raise ValueError(inconsistent_state_msg)
+    # all hdf5 suffixes including gzipped
+    h5_suffixes = [".h5", ".hdf5", ".h5ad"]
+    h5_suffixes += [s + ".gz" for s in h5_suffixes]
     # ignore empty suffix for now
     suffixes = (
-
-
-
-
-
-
-
-
+        (
+            "",
+            ".zarr",
+            ".anndata.zarr",
+            ".tiledbsoma",
+        )
+        + tuple(h5_suffixes)
+        + PYARROW_SUFFIXES
+        + tuple(
+            s + ".gz" for s in PYARROW_SUFFIXES
+        )  # this doesn't work for externally gzipped files, REMOVE LATER
+    )
     if self.suffix not in suffixes:
         raise ValueError(
             "Artifact should have a zarr, h5, tiledbsoma object"
@@ -1011,7 +1024,7 @@ def open(
     using_key = settings._using_key
     filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
     is_tiledbsoma_w = (
-        filepath.name == "soma" or
+        filepath.name == "soma" or self.suffix == ".tiledbsoma"
     ) and mode == "w"
     # consider the case where an object is already locally cached
     localpath = setup_settings.paths.cloud_to_local_no_update(
@@ -1025,14 +1038,14 @@ def open(
     ) and not filepath.synchronize(localpath, just_check=True)
     if open_cache:
         try:
-            access = backed_access(localpath, mode, using_key)
+            access = backed_access(localpath, mode, using_key, **kwargs)
         except Exception as e:
             if isinstance(filepath, LocalPathClasses):
                 raise e
             logger.warning(
                 f"The cache might be corrupted: {e}. Trying to open directly."
             )
-            access = backed_access(filepath, mode, using_key)
+            access = backed_access(filepath, mode, using_key, **kwargs)
             # happens only if backed_access has been successful
             # delete the corrupted cache
             if localpath.is_dir():
@@ -1040,7 +1053,7 @@ def open(
             else:
                 localpath.unlink(missing_ok=True)
     else:
-        access = backed_access(filepath, mode, using_key)
+        access = backed_access(filepath, mode, using_key, **kwargs)
         if is_tiledbsoma_w:
 
             def finalize():
@@ -1237,6 +1250,7 @@ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
 def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     state_was_adding = self._state.adding
     print_progress = kwargs.pop("print_progress", True)
+    store_kwargs = kwargs.pop("store_kwargs", {})  # kwargs for .upload_from in the end
     access_token = kwargs.pop("access_token", None)
     local_path = None
     if upload and setup_settings.instance.keep_artifacts_local:
@@ -1258,7 +1272,11 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     if "using" in kwargs:
         using_key = kwargs["using"]
     exception_upload = check_and_attempt_upload(
-        self,
+        self,
+        using_key,
+        access_token=access_token,
+        print_progress=print_progress,
+        **store_kwargs,
     )
     if exception_upload is not None:
         # we do not want to raise file not found on cleanup if upload of a file failed
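Note: `Artifact.open()` now forwards extra keyword arguments through `backed_access()` to the underlying opener, and `Artifact.save()` pops a `store_kwargs` dict that is passed on to the storage upload. A minimal sketch of how this could be used; the key, the `compression` argument, and the contents of `store_kwargs` are illustrative assumptions, not documented API:

    import lamindb as ln

    # ".h5ad.gz" suffixes are now accepted thanks to the suffix handling above
    artifact = ln.Artifact.get(key="datasets/example.h5ad.gz")  # hypothetical key

    # extra kwargs flow through backed_access(); for h5 files they reach the h5py opener,
    # so compression=None mirrors the new open() parameter in _anndata_accessor.py
    access = artifact.open(compression=None)

    # store_kwargs is forwarded to the upload call when the artifact is stored
    ln.Artifact("./example.h5ad", key="datasets/example.h5ad").save(store_kwargs={})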
lamindb/_collection.py
CHANGED
@@ -273,7 +273,7 @@ def mapped(
         else:
             artifacts = self.ordered_artifacts.all()
         for artifact in artifacts:
-            if
+            if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
                 logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
                 continue
             elif not stream:
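Note: the rewritten check in `mapped()` admits any artifact whose suffix contains `.h5ad` or `.zarr`, which now includes gzipped `.h5ad.gz` files. A tiny standalone sketch of the predicate:

    # mirrors the new condition: ignore only when neither ".h5ad" nor ".zarr" appears
    for suffix in [".h5ad", ".h5ad.gz", ".anndata.zarr", ".csv"]:
        ignored = ".h5ad" not in suffix and ".zarr" not in suffix
        print(suffix, "ignored" if ignored else "mapped")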
lamindb/_feature.py
CHANGED
@@ -218,7 +218,7 @@ def __init__(self, *args, **kwargs):
         return None
     dtype = kwargs.get("dtype", None)
     default_value = kwargs.pop("default_value", None)
-    nullable = kwargs.pop("nullable",
+    nullable = kwargs.pop("nullable", True)  # default value of nullable
     cat_filters = kwargs.pop("cat_filters", None)
     kwargs = process_init_feature_param(args, kwargs)
     super(Feature, self).__init__(*args, **kwargs)
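Note: `nullable` now falls back to `True` when the keyword is omitted. A minimal sketch; the feature name and dtype are illustrative:

    import lamindb as ln

    # equivalent to omitting nullable; pass nullable=False to require a value
    ln.Feature(name="perturbation", dtype="cat", nullable=True).save()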
lamindb/_finish.py
CHANGED
@@ -436,7 +436,15 @@ def save_context_core(
     # save both run & transform records if we arrive here
     if run is not None:
         run.save()
-    transform.
+    transform_id_prior_to_save = transform.id
+    transform.save()  # this in-place updates the state of transform upon hash collision
+    if transform.id != transform_id_prior_to_save:
+        # the hash existed and we're actually back to the previous version
+        # hence, this was in fact a run of the previous transform rather than of
+        # the new transform
+        # this can happen in interactive notebooks if the user makes no change to the notebook
+        run.transform = transform
+        run.save()
 
     # finalize
     if not from_cli and run is not None:
lamindb/_query_set.py
CHANGED
@@ -214,10 +214,27 @@ def get(
     else:
         assert idlike is None  # noqa: S101
         expressions = process_expressions(qs, expressions)
+        # don't want _branch_code here in .get(), only in .filter()
+        expressions.pop("_branch_code", None)
         # inject is_latest for consistency with idlike
-
+        is_latest_was_not_in_expressions = "is_latest" not in expressions
+        if issubclass(registry, IsVersioned) and is_latest_was_not_in_expressions:
             expressions["is_latest"] = True
-
+        try:
+            return registry.objects.using(qs.db).get(**expressions)
+        except registry.DoesNotExist:
+            # handle the case in which the is_latest injection led to a missed query
+            if "is_latest" in expressions and is_latest_was_not_in_expressions:
+                expressions.pop("is_latest")
+                result = (
+                    registry.objects.using(qs.db)
+                    .filter(**expressions)
+                    .order_by("-created_at")
+                    .first()
+                )
+                if result is not None:
+                    return result
+            raise registry.DoesNotExist from registry.DoesNotExist
 
 
 class RecordList(UserList, Generic[T]):
@@ -641,11 +658,12 @@ class QuerySet(models.QuerySet):
             and value.strip("-").isalpha()
             and "__" not in field
             and hasattr(self.model, field)
-            and getattr(self.model, field).field.related_model
         ):
-
-
-
+            field_attr = getattr(self.model, field)
+            if hasattr(field_attr, "field") and field_attr.field.related_model:
+                raise FieldError(
+                    f"Invalid lookup '{value}' for {field}. Did you mean {field}__name?"
+                )
 
         expressions = process_expressions(self, expressions)
         if len(expressions) > 0:
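Note: `.get()` still injects `is_latest=True` for versioned registries, but if that injected filter makes the lookup miss, it now retries without it and returns the most recently created match before raising `DoesNotExist`. A sketch of a call this protects; the key is hypothetical:

    import lamindb as ln

    # resolves even if the matching record is no longer flagged is_latest,
    # by falling back to the newest record for that key
    artifact = ln.Artifact.get(key="datasets/example.parquet")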
lamindb/_record.py
CHANGED
@@ -248,11 +248,10 @@ def __init__(record: Record, *args, **kwargs):
                     f" {name_field}{version_comment}: '{kwargs[name_field]}'"
                 )
                 if isinstance(record, Schema):
-                    if
-
-
-
-                    )
+                    if existing_record.hash != kwargs["hash"]:
+                        raise ValueError(
+                            f"Schema name is already in use by schema with uid '{existing_record.uid}', please choose a different name."
+                        )
                 init_self_from_db(record, existing_record)
                 update_attributes(record, kwargs)
                 return None
lamindb/_save.py
CHANGED
@@ -133,7 +133,9 @@ def check_and_attempt_upload(
     using_key: str | None = None,
     access_token: str | None = None,
     print_progress: bool = True,
+    **kwargs,
 ) -> Exception | None:
+    # kwargs are propagated to .upload_from in the end
     # if Artifact object is either newly instantiated or replace() was called on
     # a local env it will have a _local_filepath and needs to be uploaded
     if hasattr(artifact, "_local_filepath"):
@@ -143,6 +145,7 @@ def check_and_attempt_upload(
             using_key,
             access_token=access_token,
             print_progress=print_progress,
+            **kwargs,
         )
     except Exception as exception:
         logger.warning(f"could not upload artifact: {artifact}")
@@ -316,8 +319,10 @@ def upload_artifact(
     using_key: str | None = None,
     access_token: str | None = None,
     print_progress: bool = True,
+    **kwargs,
 ) -> tuple[UPath, UPath | None]:
     """Store and add file and its linked entries."""
+    # kwargs are propagated to .upload_from in the end
     # can't currently use filepath_from_artifact here because it resolves to ._local_filepath
     storage_key = auto_storage_key_from_artifact(artifact)
     storage_path, storage_settings = attempt_accessing_path(
@@ -326,7 +331,10 @@ def upload_artifact(
     if hasattr(artifact, "_to_store") and artifact._to_store:
         logger.save(f"storing artifact '{artifact.uid}' at '{storage_path}'")
         store_file_or_folder(
-            artifact._local_filepath,
+            artifact._local_filepath,
+            storage_path,
+            print_progress=print_progress,
+            **kwargs,
         )
 
     if isinstance(storage_path, LocalPathClasses):
lamindb/_tracked.py
CHANGED
@@ -26,10 +26,33 @@ def get_current_tracked_run() -> Run | None:
 
 
 def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]:
-    """
+    """Mark a function as tracked with this decorator.
+
+    You will be able to see inputs, outputs, and parameters of the function in the data lineage graph.
+
+    Guide: :doc:`/track`
+
+    .. versionadded:: 1.1.0
+       This is still in beta and will be refined in future releases.
 
     Args:
-        uid:
+        uid: Persist the uid to identify this transform across renames.
+
+    Example::
+
+        import lamindb as ln
+
+        @ln.tracked()
+        def subset_dataframe(
+            input_artifact_key: str,  # all arguments tracked as parameters of the function run
+            output_artifact_key: str,
+            subset_rows: int = 2,
+            subset_cols: int = 2,
+        ) -> None:
+            artifact = ln.Artifact.get(key=input_artifact_key)
+            df = artifact.load()  # auto-tracked as input
+            new_df = df.iloc[:subset_rows, :subset_cols]
+            ln.Artifact.from_df(new_df, key=output_artifact_key).save()  # auto-tracked as output
     """
 
     def decorator_tracked(func: Callable[P, R]) -> Callable[P, R]:
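Note: the docstring above defines `subset_dataframe` as its example; a sketch of calling it, assuming `ln.track()` has been called so there is a run to nest under and the artifact keys exist:

    import lamindb as ln

    ln.track()  # session-level run
    subset_dataframe(
        "my_datasets/full.parquet",    # hypothetical input key
        "my_datasets/subset.parquet",  # hypothetical output key
        subset_rows=2,
    )  # records its own run with the arguments captured as parameters
    ln.finish()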
lamindb/base/users.py
CHANGED
@@ -12,12 +12,9 @@ def current_user_id() -> int:
     if ln_setup.core.django.IS_MIGRATING:
         return 1
     else:
-        exc_attr = (
-            "DoesNotExist" if hasattr(User, "DoesNotExist") else "_DoesNotExist"
-        )
         try:
             user_id = User.objects.get(uid=settings.user.uid).id
-        except
+        except User.DoesNotExist:
             register_user(settings.user)
             user_id = User.objects.get(uid=settings.user.uid).id
         return user_id
lamindb/core/_context.py
CHANGED
@@ -13,6 +13,7 @@ from typing import TYPE_CHECKING
 import lamindb_setup as ln_setup
 from django.db.models import Func, IntegerField
 from lamin_utils import logger
+from lamindb_setup.core import deprecated
 from lamindb_setup.core.hashing import hash_file
 
 from lamindb.base import ids
@@ -217,8 +218,8 @@ class Context:
         self._description = value
 
     @property
+    @deprecated(new_name="description")
     def name(self) -> str | None:
-        """Deprecated. Populates `description` argument for `context.transform`."""
         return self._description
 
     @name.setter
@@ -257,7 +258,7 @@ class Context:
         path: str | None = None,
         log_to_file: bool | None = None,
     ) -> None:
-        """
+        """Track a global run of your Python session.
 
         - sets :attr:`~lamindb.core.Context.transform` &
           :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
@@ -284,6 +285,10 @@ class Context:
 
         >>> ln.track()
 
+        If you want to ensure a single version history across renames of the notebook or script, pass the auto-generated `uid` that you'll find in the logs:
+
+        >>> ln.track("Onv04I53OgtT0000")  # example uid, the last four characters encode the version of the transform
+
         """
         self._logging_message_track = ""
         self._logging_message_imports = ""
lamindb/core/_mapped_collection.py
CHANGED
@@ -27,7 +27,8 @@ if TYPE_CHECKING:
 class _Connect:
     def __init__(self, storage):
         if isinstance(storage, UPath):
-
+            # force no external compression even for files with .gz extension. REMOVE LATER
+            self.conn, self.store = registry.open("h5py", storage, compression=None)
             self.to_close = True
         else:
             self.conn, self.store = None, storage
@@ -246,7 +247,8 @@ class MappedCollection:
             if parallel:
                 conn, storage = None, path
             else:
-
+                # force no external compression even for files with .gz extension. REMOVE LATER
+                conn, storage = registry.open("h5py", path, compression=None)
         else:
             conn, storage = registry.open("zarr", path)
         self.conns.append(conn)
lamindb/core/_track_environment.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import subprocess
+import sys
 from typing import TYPE_CHECKING
 
 import lamindb_setup as ln_setup
@@ -17,7 +18,7 @@ def track_environment(run: Run) -> None:
     try:
         with open(filepath, "w") as f:
             result = subprocess.run(
-                ["pip", "freeze"],
+                [sys.executable, "-m", "pip", "freeze"],
                 stdout=f,
             )
     except OSError as e:
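Note: invoking pip as a module of the running interpreter guarantees that the frozen requirements describe the environment the code actually executes in, rather than whichever `pip` is first on `PATH`. A small standalone illustration, not lamindb API:

    import subprocess
    import sys

    # may resolve to a pip from a different environment
    subprocess.run(["pip", "--version"], check=False)

    # pinned to the interpreter executing this script
    subprocess.run([sys.executable, "-m", "pip", "--version"], check=False)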
lamindb/core/datasets/_small.py
CHANGED
@@ -23,7 +23,7 @@ def small_dataset1(
         var_ids[0]: [1, 2, 3],
         var_ids[1]: [3, 4, 5],
         var_ids[2]: [5, 6, 7],
-        "
+        "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
         "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
@@ -60,7 +60,7 @@ def small_dataset2(
         var_ids[0]: [2, 3, 3],
         var_ids[1]: [3, 4, 5],
         var_ids[2]: [4, 2, 3],
-        "
+        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     metadata = {
@@ -74,7 +74,7 @@ def small_dataset2(
         )
         ad.AnnData(
             dataset_df[var_ids],
-            obs=dataset_df[["
+            obs=dataset_df[["perturbation", "cell_type_by_model"]],
         )
         if otype == "DataFrame":
             for key, value in metadata.items():
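Note: both example-dataset helpers now expose a `perturbation` column. A sketch of loading one of them, assuming the public `lamindb.core.datasets` entry point and the `otype` parameter shown above:

    import lamindb as ln

    df = ln.core.datasets.small_dataset1(otype="DataFrame")
    print(df["perturbation"].cat.categories)  # categorical column from the helper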
lamindb/core/loaders.py
CHANGED
@@ -65,8 +65,8 @@ def load_tsv(path: UPathStr, **kwargs) -> pd.DataFrame:
 def load_h5ad(filepath, **kwargs) -> ad.AnnData:
     """Load an `.h5ad` file to `AnnData`."""
     fs, filepath = infer_filesystem(filepath)
-
-    with fs.open(filepath, mode="rb") as file:
+    compression = kwargs.pop("compression", "infer")
+    with fs.open(filepath, mode="rb", compression=compression) as file:
         adata = ad.read_h5ad(file, backed=False, **kwargs)
         return adata
 
@@ -148,9 +148,13 @@ def load_rds(path: UPathStr) -> UPathStr:
 
 FILE_LOADERS = {
     ".csv": pd.read_csv,
+    ".csv.gz": pd.read_csv,
     ".tsv": load_tsv,
+    ".tsv.gz": load_tsv,
     ".h5ad": load_h5ad,
+    ".h5ad.gz": load_h5ad,
     ".parquet": pd.read_parquet,
+    ".parquet.gz": pd.read_parquet,  # this doesn't work for externally gzipped files, REMOVE LATER
     ".fcs": load_fcs,
     ".zarr": load_anndata_zarr,
     ".html": load_html,
@@ -177,7 +181,15 @@ def load_to_memory(filepath: UPathStr, **kwargs):
 
     filepath = settings._storage_settings.cloud_to_local(filepath, print_progress=True)
 
-
+    # infer the correct suffix when .gz is present
+    suffixes = filepath.suffixes
+    suffix = (
+        "".join(suffixes[-2:])
+        if len(suffixes) > 1 and ".gz" in suffixes
+        else filepath.suffix
+    )
+
+    loader = FILE_LOADERS.get(suffix)
     if loader is None:
         return filepath
     else:
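Note: `load_to_memory` now keys the loader table on a compound suffix when `.gz` is present (e.g. `.csv.gz`, `.h5ad.gz`). A standalone sketch of the same suffix logic using only the standard library:

    from pathlib import PurePosixPath

    def effective_suffix(path_str: str) -> str:
        # keep the last two suffixes when one of them is ".gz", else the last one
        suffixes = PurePosixPath(path_str).suffixes
        if len(suffixes) > 1 and ".gz" in suffixes:
            return "".join(suffixes[-2:])
        return suffixes[-1] if suffixes else ""

    assert effective_suffix("data/adata.h5ad.gz") == ".h5ad.gz"
    assert effective_suffix("data/table.parquet") == ".parquet"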
lamindb/core/storage/_anndata_accessor.py
CHANGED
@@ -16,6 +16,7 @@ from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
 from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
 from anndata.compat import _read_attr
 from fsspec.implementations.local import LocalFileSystem
+from fsspec.utils import infer_compression
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
@@ -152,9 +153,13 @@ registry = AccessRegistry()
 
 
 @registry.register_open("h5py")
-def open(filepath: UPathStr, mode: str = "r"):
+def open(filepath: UPathStr, mode: str = "r", compression: str | None = "infer"):
     fs, file_path_str = infer_filesystem(filepath)
-
+    # we don't open compressed files directly because we need fsspec to uncompress on .open
+    compression = (
+        infer_compression(file_path_str) if compression == "infer" else compression
+    )
+    if isinstance(fs, LocalFileSystem) and compression is None:
         assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  # noqa: S101
         return None, h5py.File(file_path_str, mode=mode)
     if mode == "r":
@@ -165,7 +170,7 @@ def open(filepath: UPathStr, mode: str = "r"):
         conn_mode = "ab"
     else:
         raise ValueError(f"Unknown mode {mode}! Should be 'r', 'w' or 'a'.")
-    conn = fs.open(file_path_str, mode=conn_mode)
+    conn = fs.open(file_path_str, mode=conn_mode, compression=compression)
     try:
         storage = h5py.File(conn, mode=mode)
     except Exception as e:
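Note: the `compression="infer"` default relies on `fsspec.utils.infer_compression`, which maps a filename to a codec name or `None`; only when it returns `None` and the filesystem is local is the file handed straight to `h5py.File`. A tiny illustration:

    from fsspec.utils import infer_compression

    print(infer_compression("adata.h5ad.gz"))  # "gzip": opened via fsspec with decompression
    print(infer_compression("adata.h5ad"))     # None: plain local h5py.File is possible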
lamindb/core/storage/_backed_access.py
CHANGED
@@ -70,6 +70,7 @@ def backed_access(
     artifact_or_filepath: Artifact | UPath,
     mode: str = "r",
     using_key: str | None = None,
+    **kwargs,
 ) -> (
     AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
 ):
@@ -80,18 +81,22 @@ def backed_access(
     else:
         objectpath = artifact_or_filepath
     name = objectpath.name
-
+    # ignore .gz, only check the real suffix
+    suffixes = objectpath.suffixes
+    suffix = (
+        suffixes[-2] if len(suffixes) > 1 and ".gz" in suffixes else objectpath.suffix
+    )
 
     if name == "soma" or suffix == ".tiledbsoma":
         if mode not in {"r", "w"}:
             raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.")
-        return _open_tiledbsoma(objectpath, mode=mode)  # type: ignore
+        return _open_tiledbsoma(objectpath, mode=mode, **kwargs)  # type: ignore
     elif suffix in {".h5", ".hdf5", ".h5ad"}:
-        conn, storage = registry.open("h5py", objectpath, mode=mode)
+        conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
-        conn, storage = registry.open("zarr", objectpath, mode=mode)
+        conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
     elif _is_pyarrow_dataset(objectpath):
-        return _open_pyarrow_dataset(objectpath)
+        return _open_pyarrow_dataset(objectpath, **kwargs)
     else:
         raise ValueError(
             "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "