lamindb 0.63.5__py3-none-any.whl → 0.64.1__py3-none-any.whl
This diff compares publicly available package versions as published to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear there.
- lamindb/__init__.py +5 -4
- lamindb/{_file.py → _artifact.py} +265 -210
- lamindb/_dataset.py +87 -115
- lamindb/_delete.py +2 -2
- lamindb/_filter.py +2 -2
- lamindb/_parents.py +7 -7
- lamindb/_query_manager.py +5 -2
- lamindb/_registry.py +3 -3
- lamindb/_save.py +63 -63
- lamindb/dev/_data.py +10 -9
- lamindb/dev/_feature_manager.py +10 -10
- lamindb/dev/_label_manager.py +4 -4
- lamindb/dev/_run_context.py +2 -2
- lamindb/dev/_settings.py +5 -4
- lamindb/dev/_view_tree.py +5 -5
- lamindb/dev/datasets/_core.py +6 -6
- lamindb/dev/hashing.py +11 -1
- lamindb/dev/storage/__init__.py +1 -1
- lamindb/dev/storage/_backed_access.py +6 -6
- lamindb/dev/storage/file.py +36 -31
- lamindb/dev/versioning.py +3 -3
- {lamindb-0.63.5.dist-info → lamindb-0.64.1.dist-info}/METADATA +5 -5
- lamindb-0.64.1.dist-info/RECORD +48 -0
- lamindb-0.63.5.dist-info/RECORD +0 -48
- {lamindb-0.63.5.dist-info → lamindb-0.64.1.dist-info}/LICENSE +0 -0
- {lamindb-0.63.5.dist-info → lamindb-0.64.1.dist-info}/WHEEL +0 -0
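
All hunks below come from the renamed module `lamindb/{_file.py → _artifact.py}`: the `File` registry becomes `Artifact`, and size/hash computation learns about directories (`n_objects`, directory-level hashes). As rough orientation, a minimal sketch of the user-facing rename — the 0.63.x spelling is inferred from the renamed symbols, not quoted from this diff:

```python
import lamindb as ln
import pandas as pd

df = pd.DataFrame({"feat1": [1, 2], "feat2": [3, 4]})

# lamindb 0.63.x (assumed old spelling)
# file = ln.File.from_df(df, description="my dataframe")
# file.save()

# lamindb 0.64.x, per this diff
artifact = ln.Artifact.from_df(df, description="my dataframe")
artifact.save()
artifact.path  # resolved via the renamed filepath_from_artifact()
```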
@@ -1,5 +1,5 @@
 from pathlib import Path, PurePath, PurePosixPath
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import anndata as ad
 import fsspec
@@ -13,7 +13,7 @@ from lamindb_setup.dev import StorageSettings
 from lamindb_setup.dev._docs import doc_args
 from lamindb_setup.dev._hub_utils import get_storage_region
 from lamindb_setup.dev.upath import create_path, extract_suffix_from_path
-from lnschema_core import Feature, FeatureSet, File, Run, Storage
+from lnschema_core import Artifact, Feature, FeatureSet, Run, Storage
 from lnschema_core.models import IsTree
 from lnschema_core.types import (
     AnnDataLike,
@@ -26,7 +26,7 @@ from lnschema_core.types import (
 from lamindb._utils import attach_func_to_class_method
 from lamindb.dev._data import _track_run_input
 from lamindb.dev._settings import settings
-from lamindb.dev.hashing import b16_to_b64, hash_file
+from lamindb.dev.hashing import b16_to_b64, hash_file, hash_md5s_from_dir
 from lamindb.dev.storage import (
     LocalPathClasses,
     UPath,
@@ -38,9 +38,9 @@ from lamindb.dev.storage import (
 )
 from lamindb.dev.storage._backed_access import AnnDataAccessor, BackedAccessor
 from lamindb.dev.storage.file import (
-    auto_storage_key_from_file,
-    auto_storage_key_from_file_uid,
-    filepath_from_file,
+    auto_storage_key_from_artifact,
+    auto_storage_key_from_artifact_uid,
+    filepath_from_artifact,
 )
 from lamindb.dev.versioning import get_ids_from_old_version, init_uid

@@ -113,11 +113,11 @@ def process_data(
     """Serialize a data object that's provided as file or in memory."""
     # if not overwritten, data gets stored in default storage
     if isinstance(data, (str, Path, UPath)):  # PathLike, spelled out
-
+        path = create_path(data)
         storage, use_existing_storage_key = process_pathlike(
-
+            path, skip_existence_check=skip_existence_check
         )
-        suffix = extract_suffix_from_path(
+        suffix = extract_suffix_from_path(path)
         memory_rep = None
     elif isinstance(data, (pd.DataFrame, AnnData)):  # DataLike, spelled out
         storage = lamindb_setup.settings.storage.record
@@ -136,157 +136,190 @@ def process_data(
             f" be '{suffix}'."
         )
         cache_name = f"{provisional_uid}{suffix}"
-
+        path = lamindb_setup.settings.storage.cache_dir / cache_name
         # Alex: I don't understand the line below
-        if
-
+        if path.suffixes == []:
+            path = path.with_suffix(suffix)
         if suffix not in {".zarr", ".zrad"}:
-            write_to_file(data,
+            write_to_file(data, path)
         use_existing_storage_key = False
     else:
         raise NotImplementedError(
-            f"Do not know how to create a
+            f"Do not know how to create a artifact object from {data}, pass a path"
             " instead!"
         )
-    return memory_rep,
+    return memory_rep, path, suffix, storage, use_existing_storage_key
+
+
+def get_stat_file_cloud(stat: Dict) -> Tuple[int, str, str]:
+    size = stat["size"]
+    # small files
+    if "-" not in stat["ETag"]:
+        # only store hash for non-multipart uploads
+        # we can't rapidly validate multi-part uploaded files client-side
+        # we can add more logic later down-the-road
+        hash = b16_to_b64(stat["ETag"])
+        hash_type = "md5"
+    else:
+        stripped_etag, suffix = stat["ETag"].split("-")
+        suffix = suffix.strip('"')
+        hash = f"{b16_to_b64(stripped_etag)}-{suffix}"
+        hash_type = "md5-n"  # this is the S3 chunk-hashing strategy
+    return size, hash, hash_type
+
+
+def get_stat_dir_s3(path: UPath) -> Tuple[int, str, str, int]:
+    import boto3
+    from lamindb_setup.dev.upath import AWS_CREDENTIALS_PRESENT

+    if not AWS_CREDENTIALS_PRESENT:
+        # passing the following param directly to Session() doesn't
+        # work, unfortunately: botocore_session=path.fs.session
+        from botocore import UNSIGNED
+        from botocore.config import Config

-
-
-
-
+        config = Config(signature_version=UNSIGNED)
+        s3 = boto3.session.Session().resource("s3", config=config)
+    else:
+        s3 = boto3.session.Session().resource("s3")
+    bucket, key, _ = path.fs.split_path(path.as_posix())
+    # assuming this here is the fastest way of querying for many objects
+    objects = s3.Bucket(bucket).objects.filter(Prefix=key)
+    size = sum([object.size for object in objects])
+    md5s = [
+        # skip leading and trailing quotes
+        object.e_tag[1:-1]
+        for object in objects
+    ]
+    n_objects = len(md5s)
+    hash, hash_type = hash_md5s_from_dir(md5s)
+    return size, hash, hash_type, n_objects
+
+
+def get_stat_dir_gs(path: UPath) -> Tuple[int, str, str, int]:
+    import google.cloud.storage as gc_storage
+
+    bucket, key, _ = path.fs.split_path(path.as_posix())
+    # assuming this here is the fastest way of querying for many objects
+    client = gc_storage.Client(
+        credentials=path.fs.credentials.credentials, project=path.fs.project
+    )
+    objects = client.Bucket(bucket).list_blobs(prefix=key)
+    sizes, md5s = [], []
+    for object in objects:
+        sizes.append(object.size)
+        md5s.append(object.md5_hash)
+    n_objects = len(md5s)
+    hash, hash_type = hash_md5s_from_dir(md5s)
+    return sum(sizes), hash, hash_type, n_objects
+
+
+def get_stat_or_artifact(
+    path: UPath,
+    suffix: str,
+    memory_rep: Optional[Any] = None,
     check_hash: bool = True,
-) -> Union[Tuple[Optional[str], Optional[str]],
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        hash =
-
-
-
-
+) -> Union[Tuple[int, Optional[str], Optional[str], Optional[int]], Artifact]:
+    n_objects = None
+    if settings.upon_file_create_skip_size_hash:
+        return None, None, None, n_objects
+    if (
+        suffix in {".zarr", ".zrad"}
+        and memory_rep is not None
+        and isinstance(memory_rep, AnnData)
+    ):
+        size = size_adata(memory_rep)
+        return size, None, None, n_objects
+    stat = path.stat()  # one network request
+    if not isinstance(path, LocalPathClasses):
+        size, hash, hash_type = None, None, None
+        if stat is not None:
+            if "ETag" in stat:  # is file
+                size, hash, hash_type = get_stat_file_cloud(stat)
+            elif path.is_dir():
+                if path.protocol == "s3":
+                    size, hash, hash_type, n_objects = get_stat_dir_s3(path)
+                elif path.protocol == "gs":
+                    size, hash, hash_type, n_objects = get_stat_dir_gs(path)
+        if hash is None:
+            logger.warning(f"did not add hash for {path}")
+        return size, hash, hash_type, n_objects
     else:
-
+        if path.is_dir():
+            md5s = []
+            size = 0
+            for subpath in path.rglob("*"):
+                if not subpath.is_file():
+                    continue
+                size += subpath.stat().st_size
+                md5s.append(hash_file(subpath)[0])
+            hash, hash_type = hash_md5s_from_dir(md5s)
+            n_objects = len(md5s)
+        else:
+            hash, hash_type = hash_file(path)
+            size = stat.st_size
     if not check_hash:
-        return hash, hash_type
+        return size, hash, hash_type, n_objects
     # also checks hidden and trashed files
-    result =
+    result = Artifact.filter(hash=hash, visibility=None).list()
     if len(result) > 0:
-        if settings.
-            msg = f"
+        if settings.upon_artifact_create_if_hash_exists == "error":
+            msg = f"artifact with same hash exists: {result[0]}"
             hint = (
                 "💡 you can make this error a warning:\n"
-                " ln.settings.
+                " ln.settings.upon_artifact_create_if_hash_exists"
             )
             raise RuntimeError(f"{msg}\n{hint}")
-        elif settings.
+        elif settings.upon_artifact_create_if_hash_exists == "warn_create_new":
             logger.warning(
-                "creating new
+                "creating new Artifact object despite existing artifact with same hash:"
                 f" {result[0]}"
             )
-            return hash, hash_type
+            return size, hash, hash_type, n_objects
         else:
-            logger.warning(f"returning existing
+            logger.warning(f"returning existing artifact with same hash: {result[0]}")
             if result[0].visibility < 1:
                 if result[0].visibility == -1:
                     visibility_text = "in the trash"
                 elif result[0].visibility == 0:
                     visibility_text = "hidden"
                 logger.warning(
-                    f"the existing
-                    " `
+                    f"the existing artifact is {visibility_text}, restore it before"
+                    " use: `artifact.restore()`"
                 )
             return result[0]
     else:
-        return hash, hash_type
+        return size, hash, hash_type, n_objects


-def
-    filepath: UPath,
-    memory_rep: Optional[Union[pd.DataFrame, AnnData]],
-    suffix: str,
-    check_hash: bool = True,
-):
-    cloudpath = None
-    localpath = None
-    hash_and_type: Tuple[Optional[str], Optional[str]]
-
-    if suffix in {".zarr", ".zrad"}:
-        if memory_rep is not None:
-            size = size_adata(memory_rep)
-        else:
-            if not isinstance(filepath, LocalPathClasses):
-                cloudpath = filepath
-                # todo: properly calculate size
-                size = 0
-            else:
-                localpath = filepath
-                size = sum(
-                    f.stat().st_size for f in filepath.rglob("*") if f.is_file()  # type: ignore # noqa
-                )
-        hash_and_type = None, None
-    else:
-        # to accelerate ingesting high numbers of files
-        if settings.upon_file_create_skip_size_hash:
-            size = None
-            hash_and_type = None, None
-        else:
-            filepath_stat = filepath.stat()
-            if not isinstance(filepath, LocalPathClasses):
-                size = filepath_stat["size"]
-                cloudpath = filepath
-                hash_and_type = None, None
-            else:
-                size = filepath_stat.st_size  # type: ignore
-                localpath = filepath
-                hash_and_type = get_hash(
-                    filepath, suffix, filepath_stat=filepath_stat, check_hash=check_hash
-                )
-    return localpath, cloudpath, size, hash_and_type
-
-
-def check_path_in_existing_storage(
-    filepath: Union[Path, UPath]
-) -> Union[Storage, bool]:
+def check_path_in_existing_storage(path: Union[Path, UPath]) -> Union[Storage, bool]:
     for storage in Storage.filter().all():
         # if path is part of storage, return it
-        if check_path_is_child_of_root(
+        if check_path_is_child_of_root(path, root=create_path(storage.root)):
             return storage
     return False


 def check_path_is_child_of_root(
-
+    path: Union[Path, UPath], root: Optional[Union[Path, UPath]] = None
 ) -> bool:
     if root is None:
         root = lamindb_setup.settings.storage.root

-
+    path = UPath(str(path)) if not isinstance(path, UPath) else path
     root = UPath(str(root)) if not isinstance(root, UPath) else root

     # the following comparisons can fail if types aren't comparable
-    if not isinstance(
+    if not isinstance(path, LocalPathClasses) and not isinstance(
         root, LocalPathClasses
     ):
         # the following tests equivalency of two UPath objects
         # via string representations; otherwise
         # S3Path('s3://lndb-storage/') and S3Path('s3://lamindb-ci/')
         # test as equivalent
-        return list(
-    elif isinstance(
-        return root.resolve() in
+        return list(path.parents)[-1].as_posix() == root.as_posix()
+    elif isinstance(path, LocalPathClasses) and isinstance(root, LocalPathClasses):
+        return root.resolve() in path.resolve().parents
     else:
         return False

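The helpers added above split hashing by location: a single cloud object reuses its S3 ETag (plain MD5 for single-part uploads, the `-n` suffixed form for multipart uploads), directories aggregate per-object MD5s via `hash_md5s_from_dir()` (extended in `lamindb/dev/hashing.py`, not shown in this hunk), and local paths fall back to `hash_file()`. A small sketch of the ETag branch, with made-up stat dicts:

```python
# illustrative inputs; inside get_stat_or_artifact() the dict comes from path.stat()
single_part = {"size": 1024, "ETag": "9e107d9d372bb6826bd81d3542a419d6"}
multipart = {"size": 64 * 1024**2, "ETag": "9e107d9d372bb6826bd81d3542a419d6-8"}

print(get_stat_file_cloud(single_part))
# (1024, b16_to_b64("9e107d...d6"), "md5")

print(get_stat_file_cloud(multipart))
# (67108864, b16_to_b64("9e107d...d6") + "-8", "md5-n")  # S3 chunk-hashing strategy
```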
@@ -297,7 +330,7 @@ def get_relative_path_to_directory(
     if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
         # UPath.relative_to() is not behaving as it should (2023-04-07)
         # need to lstrip otherwise inconsistent behavior across trailing slashes
-        # see
+        # see test_artifact.py: test_get_relative_path_to_directory
         relpath = PurePath(
             path.as_posix().replace(directory.as_posix(), "").lstrip("/")
         )
@@ -310,7 +343,7 @@ def get_relative_path_to_directory(
     return relpath


-def get_file_kwargs_from_data(
+def get_artifact_kwargs_from_data(
     *,
     data: Union[Path, UPath, str, pd.DataFrame, AnnData],
     key: Optional[str],
@@ -320,25 +353,23 @@ def get_file_kwargs_from_data(
     skip_check_exists: bool = False,
 ):
     run = get_run(run)
-    memory_rep,
+    memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
         provisional_uid, data, format, key, skip_check_exists
     )
-
-
-
-
-        memory_rep,
-        suffix,
+    stat_or_artifact = get_stat_or_artifact(
+        path=path,
+        suffix=suffix,
+        memory_rep=memory_rep,
     )
-    if isinstance(
-        return
+    if isinstance(stat_or_artifact, Artifact):
+        return stat_or_artifact, None
     else:
-        hash, hash_type =
+        size, hash, hash_type, n_objects = stat_or_artifact

     check_path_in_storage = False
     if use_existing_storage_key:
         inferred_key = get_relative_path_to_directory(
-            path=
+            path=path, directory=storage.root_as_path()
         ).as_posix()
         if key is None:
             key = inferred_key
@@ -363,10 +394,11 @@ def get_file_kwargs_from_data(
         key=key,
         uid=provisional_uid,
         suffix=suffix,
+        is_dir=n_objects is not None,
     )

     # do we use a virtual or an actual storage key?
-    key_is_virtual = settings.
+    key_is_virtual = settings.artifact_use_virtual_keys

     # if the file is already in storage, independent of the default
     # we use an actual storage key
@@ -383,17 +415,24 @@ def get_file_kwargs_from_data(
         # passing both the id and the object
         # to make them both available immediately
         # after object creation
+        n_objects=n_objects,
+        n_observations=None,  # to implement
         run_id=run.id if run is not None else None,
         run=run,
         key_is_virtual=key_is_virtual,
     )
+    if not isinstance(path, LocalPathClasses):
+        local_filepath = None
+        cloud_filepath = path
+    else:
+        local_filepath = path
+        cloud_filepath = None
     privates = dict(
         local_filepath=local_filepath,
         cloud_filepath=cloud_filepath,
         memory_rep=memory_rep,
         check_path_in_storage=check_path_in_storage,
     )
-
     return kwargs, privates


@@ -404,6 +443,7 @@ def log_storage_hint(
     key: Optional[str],
     uid: str,
     suffix: str,
+    is_dir: bool,
 ) -> None:
     hint = ""
     if check_path_in_storage:
@@ -415,11 +455,11 @@ def log_storage_hint(
         if check_path_is_child_of_root(root_path, Path.cwd()):
             # only display the relative path, not the fully resolved path
             display_root = root_path.relative_to(Path.cwd())
-        hint += f"
+        hint += f"path in storage '{display_root}'"  # type: ignore
     else:
-        hint += "
+        hint += "path content will be copied to default storage upon `save()`"
     if key is None:
-        storage_key =
+        storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
         hint += f" with key `None` ('{storage_key}')"
     else:
         hint += f" with key '{key}'"
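The settings referenced in the hunks above follow the rename: the hash-collision behavior and the virtual-key switch are now artifact-scoped. A hedged sketch of the new spellings, using only the values named above:

```python
import lamindb as ln

# raise if an artifact with the same hash already exists ...
ln.settings.upon_artifact_create_if_hash_exists = "error"
# ... or create a new record anyway and only warn
ln.settings.upon_artifact_create_if_hash_exists = "warn_create_new"

# storage keys derived from the uid (via auto_storage_key_from_artifact_uid)
# rather than mirroring a semantic `key`
ln.settings.artifact_use_virtual_keys = True
```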
@@ -447,17 +487,17 @@ def data_is_mudata(data: DataLike):  # pragma: no cover
     return False


-def __init__(file: File, *args, **kwargs):
+def __init__(artifact: Artifact, *args, **kwargs):
     # Below checks for the Django-internal call in from_db()
-    # it'd be better if we could avoid this, but not being able to create a
+    # it'd be better if we could avoid this, but not being able to create a Artifact
     # from data with the default constructor renders the central class of the API
     # essentially useless
     # The danger below is not that a user might pass as many args (12 of it), but rather
     # that at some point the Django API might change; on the other hand, this
     # condition of for calling the constructor based on kwargs should always
     # stay robust
-    if len(args) == len(
-        super(
+    if len(args) == len(artifact._meta.concrete_fields):
+        super(Artifact, artifact).__init__(*args, **kwargs)
         return None
     # now we proceed with the user-facing constructor
     if len(args) > 1:
@@ -468,7 +508,7 @@ def __init__(file: File, *args, **kwargs):
     description: Optional[str] = (
         kwargs.pop("description") if "description" in kwargs else None
     )
-    is_new_version_of: Optional[
+    is_new_version_of: Optional[Artifact] = (
         kwargs.pop("is_new_version_of") if "is_new_version_of" in kwargs else None
     )
     initial_version_id: Optional[int] = (
@@ -495,8 +535,8 @@ def __init__(file: File, *args, **kwargs):
     if is_new_version_of is None:
         provisional_uid = init_uid(version=version, n_full_id=20)
     else:
-        if not isinstance(is_new_version_of,
-            raise TypeError("is_new_version_of has to be of type ln.
+        if not isinstance(is_new_version_of, Artifact):
+            raise TypeError("is_new_version_of has to be of type ln.Artifact")
         provisional_uid, initial_version_id, version = get_ids_from_old_version(
             is_new_version_of, version, n_full_id=20
         )
@@ -507,9 +547,9 @@ def __init__(file: File, *args, **kwargs):
         if initial_version_id is None:
             logger.info(
                 "initializing versioning for this file! create future versions of it"
-                " using ln.
+                " using ln.Artifact(..., is_new_version_of=old_file)"
             )
-
+    kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
         data=data,
         key=key,
         run=run,
@@ -519,14 +559,14 @@ def __init__(file: File, *args, **kwargs):
     )

     # an object with the same hash already exists
-    if isinstance(
+    if isinstance(kwargs_or_artifact, Artifact):
         from ._registry import init_self_from_db

-        #
-        init_self_from_db(
+        # kwargs_or_artifact is an existing file
+        init_self_from_db(artifact, kwargs_or_artifact)
         return None
     else:
-        kwargs =
+        kwargs = kwargs_or_artifact

     if isinstance(data, pd.DataFrame):
         if log_hint:
@@ -551,7 +591,7 @@ def __init__(file: File, *args, **kwargs):
     kwargs["description"] = description
     kwargs["visibility"] = visibility
     # this check needs to come down here because key might be populated from an
-    # existing file path during
+    # existing file path during get_artifact_kwargs_from_data()
     if (
         kwargs["key"] is None
         and kwargs["description"] is None
@@ -562,16 +602,16 @@ def __init__(file: File, *args, **kwargs):
         add_transform_to_kwargs(kwargs, kwargs["run"])

     if data is not None:
-
-
-
-
+        artifact._local_filepath = privates["local_filepath"]
+        artifact._cloud_filepath = privates["cloud_filepath"]
+        artifact._memory_rep = privates["memory_rep"]
+        artifact._to_store = not privates["check_path_in_storage"]

-    super(
+    super(Artifact, artifact).__init__(**kwargs)


 @classmethod  # type: ignore
-@doc_args(
+@doc_args(Artifact.from_df.__doc__)
 def from_df(
     cls,
     df: "pd.DataFrame",
@@ -580,11 +620,11 @@ def from_df(
     description: Optional[str] = None,
     run: Optional[Run] = None,
     version: Optional[str] = None,
-    is_new_version_of: Optional["
+    is_new_version_of: Optional["Artifact"] = None,
     **kwargs,
-) -> "
+) -> "Artifact":
     """{}"""
-
+    artifact = Artifact(
         data=df,
         key=key,
         run=run,
@@ -595,10 +635,10 @@ def from_df(
     )
     feature_set = FeatureSet.from_df(df, field=field, **kwargs)
     if feature_set is not None:
-
+        artifact._feature_sets = {"columns": feature_set}
     else:
-
-    return
+        artifact._feature_sets = {}
+    return artifact


 def parse_feature_sets_from_anndata(
@@ -646,7 +686,7 @@ def parse_feature_sets_from_anndata(


 @classmethod  # type: ignore
-@doc_args(
+@doc_args(Artifact.from_anndata.__doc__)
 def from_anndata(
     cls,
     adata: "AnnDataLike",
@@ -655,11 +695,11 @@ def from_anndata(
     description: Optional[str] = None,
     run: Optional[Run] = None,
     version: Optional[str] = None,
-    is_new_version_of: Optional["
+    is_new_version_of: Optional["Artifact"] = None,
     **kwargs,
-) -> "
+) -> "Artifact":
     """{}"""
-
+    artifact = Artifact(
         data=adata,
         key=key,
         run=run,
@@ -668,20 +708,24 @@ def from_anndata(
         is_new_version_of=is_new_version_of,
         log_hint=False,
     )
-
-    return
+    artifact._feature_sets = parse_feature_sets_from_anndata(adata, field, **kwargs)
+    return artifact


 @classmethod  # type: ignore
-@doc_args(
+@doc_args(Artifact.from_dir.__doc__)
 def from_dir(
     cls,
     path: PathLike,
     key: Optional[str] = None,
     *,
     run: Optional[Run] = None,
-) -> List["
+) -> List["Artifact"]:
     """{}"""
+    logger.warning(
+        "this creates one artifact per file in the directory - you might simply call"
+        " ln.Artifact(dir) to get one artifact for the entire directory"
+    )
     folderpath: UPath = create_path(path)  # returns Path for local
     storage, use_existing_storage = process_pathlike(folderpath)
     folder_key_path: Union[PurePath, Path]
@@ -703,7 +747,7 @@ def from_dir(
     # always sanitize by stripping a trailing slash
     folder_key = folder_key_path.as_posix().rstrip("/")

-    # TODO: (non-local) UPath doesn't list the first level
+    # TODO: (non-local) UPath doesn't list the first level artifacts and dirs with "*"
     pattern = "" if not isinstance(folderpath, LocalPathClasses) else "*"

     # silence fine-grained logging
@@ -711,51 +755,59 @@ def from_dir(
     verbosity_int = settings._verbosity_int
     if verbosity_int >= 1:
         settings.verbosity = "warning"
-
+    artifacts_dict = {}
     for filepath in folderpath.rglob(pattern):
         if filepath.is_file():
             relative_path = get_relative_path_to_directory(filepath, folderpath)
-
+            artifact_key = folder_key + "/" + relative_path.as_posix()
             # if creating from rglob, we don't need to check for existence
-
-
+            artifact = Artifact(
+                filepath, run=run, key=artifact_key, skip_check_exists=True
+            )
+            artifacts_dict[artifact.uid] = artifact
     settings.verbosity = verbosity

     # run sanity check on hashes
-    hashes = [
-
+    hashes = [
+        artifact.hash
+        for artifact in artifacts_dict.values()
+        if artifact.hash is not None
+    ]
+    uids = artifacts_dict.keys()
     if len(set(hashes)) == len(hashes):
-
+        artifacts = list(artifacts_dict.values())
     else:
         # consider exact duplicates (same id, same hash)
-        # below can't happen anymore because
+        # below can't happen anymore because artifacts is a dict now
         # if len(set(uids)) == len(set(hashes)):
-        #     logger.warning("dropping duplicate records in list of
-        #
+        #     logger.warning("dropping duplicate records in list of artifact records")
+        #     artifacts = list(set(uids))
         # consider false duplicates (different id, same hash)
         if not len(set(uids)) == len(set(hashes)):
             seen_hashes = set()
-
-                hash:
-                for hash,
-                if
+            non_unique_artifacts = {
+                hash: artifact
+                for hash, artifact in artifacts_dict.items()
+                if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash)  # type: ignore # noqa
             }
-            display_non_unique = "\n ".join(
+            display_non_unique = "\n ".join(
+                f"{artifact}" for artifact in non_unique_artifacts
+            )
             logger.warning(
-                "there are multiple
-                f" {len(
-                f" {display_non_unique}"
+                "there are multiple artifact uids with the same hashes, dropping"
+                f" {len(non_unique_artifacts)} duplicates out of"
+                f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
             )
-
-
-                for
-                if
+            artifacts = [
+                artifact
+                for artifact in artifacts_dict.values()
+                if artifact not in non_unique_artifacts.values()
            ]
     logger.success(
-        f"created {len(
+        f"created {len(artifacts)} artifacts from directory using storage"
         f" {storage.root} and key = {folder_key}/"
     )
-    return
+    return artifacts


 # docstring handled through attach_func_to_class_method
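`from_dir()` still creates one artifact per file and now warns that registering the directory as a single artifact is often preferable; the directory branch of `get_stat_or_artifact()` then supplies the aggregate size, hash, and `n_objects`. A minimal usage sketch — the `./fastqs/` path is made up:

```python
import lamindb as ln

# one Artifact per file in the directory (this classmethod, now with a warning)
artifacts = ln.Artifact.from_dir("./fastqs/")
ln.save(artifacts)

# one Artifact for the entire directory, as the new warning suggests
artifact = ln.Artifact("./fastqs/", description="all fastq files")
artifact.save()
artifact.n_objects  # number of files folded into the directory-level hash
```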
@@ -765,7 +817,7 @@ def replace(
     run: Optional[Run] = None,
     format: Optional[str] = None,
 ) -> None:
-    kwargs, privates =
+    kwargs, privates = get_artifact_kwargs_from_data(
         provisional_uid=self.uid,
         data=data,
         key=self.key,
@@ -773,7 +825,7 @@ def replace(
         format=format,
     )

-    # this
+    # this artifact already exists
     if privates is None:
         return kwargs

@@ -793,8 +845,11 @@ def replace(
             f" and delete '{key_path}' upon `save()`"
         )
     else:
-        old_storage =
-
+        old_storage = auto_storage_key_from_artifact(self)
+        is_dir = self.n_objects is not None
+        new_storage = auto_storage_key_from_artifact_uid(
+            self.uid, kwargs["suffix"], is_dir
+        )
         if old_storage != new_storage:
             self._clear_storagekey = old_storage
             if self.key is not None:
@@ -822,8 +877,8 @@ def backed(
     suffixes = (".h5", ".hdf5", ".h5ad", ".zrad", ".zarr")
     if self.suffix not in suffixes:
         raise ValueError(
-            "
-            " one of the following suffixes for the object name:"
+            "Artifact should have a zarr or h5 object as the underlying data, please"
+            " use one of the following suffixes for the object name:"
             f" {', '.join(suffixes)}."
         )

@@ -831,7 +886,7 @@ def backed(

     _track_run_input(self, is_run_input)

-    filepath =
+    filepath = filepath_from_artifact(self)
     # consider the case where an object is already locally cached
     localpath = setup_settings.instance.storage.cloud_to_local_no_update(filepath)
     if localpath.exists():
@@ -847,7 +902,7 @@ def load(
     _track_run_input(self, is_run_input)
     if hasattr(self, "_memory_rep") and self._memory_rep is not None:
         return self._memory_rep
-    return load_to_memory(
+    return load_to_memory(filepath_from_artifact(self), stream=stream, **kwargs)


 # docstring handled through attach_func_to_class_method
@@ -856,7 +911,7 @@ def stage(self, is_run_input: Optional[bool] = None) -> Path:
         raise RuntimeError("zarr object can't be staged, please use load() or stream()")
     _track_run_input(self, is_run_input)

-    filepath =
+    filepath = filepath_from_artifact(self)
     return setup_settings.instance.storage.cloud_to_local(filepath, print_progress=True)


@@ -864,21 +919,21 @@ def stage(self, is_run_input: Optional[bool] = None) -> Path:
 def delete(
     self, permanent: Optional[bool] = None, storage: Optional[bool] = None
 ) -> None:
-    # by default, we only move
+    # by default, we only move artifacts into the trash
     if self.visibility > VisibilityChoice.trash.value and permanent is not True:
         if storage is not None:
-            logger.warning("moving
+            logger.warning("moving artifact to trash, storage arg is ignored")
         # move to trash
         self.visibility = VisibilityChoice.trash.value
         self.save()
-        logger.warning("moved
+        logger.warning("moved artifact to trash")
         return

-    # if the
+    # if the artifact is already in the trash
     # permanent delete skips the trash
     if permanent is None:
         response = input(
-            "
+            "Artifact record is already in trash! Are you sure you want to permanently"
             " delete it? (y/n) You can't undo this action."
         )
         delete_record = response == "y"
@@ -896,7 +951,7 @@ def delete(
         if storage is not None:
             logger.warning("storage arg is ignored if storage key is non-semantic")
     else:
-        # for
+        # for artifacts with non-virtual semantic storage keys (key is not None)
         # ask for extra-confirmation
         if storage is None:
             response = input(
@@ -913,8 +968,8 @@ def delete(
         logger.success(f"deleted {colors.yellow(f'{filepath}')}")


-def _delete_skip_storage(
-    super(
+def _delete_skip_storage(artifact, *args, **kwargs) -> None:
+    super(Artifact, artifact).delete(*args, **kwargs)


 # docstring handled through attach_func_to_class_method
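`delete()` keeps its two-stage semantics, now worded for artifacts: a visible record is first moved to the trash, and only a permanent delete removes the record and, after confirmation, the storage object. Roughly:

```python
artifact.delete()                # visibility -> trash; record and storage are kept
artifact.restore()               # bring it back, as the hint above suggests
artifact.delete(permanent=True)  # prompts about storage unless `storage=...` is passed
```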
@@ -933,15 +988,15 @@ def save(self, *args, **kwargs) -> None:

 def _save_skip_storage(file, *args, **kwargs) -> None:
     save_feature_sets(file)
-    super(
+    super(Artifact, file).save(*args, **kwargs)
     save_feature_set_links(file)


 @property  # type: ignore
-@doc_args(
+@doc_args(Artifact.path.__doc__)
 def path(self) -> Union[Path, UPath]:
     """{}"""
-    return
+    return filepath_from_artifact(self)


 @classmethod  # type: ignore
@@ -990,17 +1045,17 @@ if _TESTING:
     from inspect import signature

     SIGS = {
-        name: signature(getattr(
+        name: signature(getattr(Artifact, name))
         for name in METHOD_NAMES
         if name != "__init__"
     }

 for name in METHOD_NAMES:
-    attach_func_to_class_method(name,
+    attach_func_to_class_method(name, Artifact, globals())

 # privates currently dealt with separately
-
-
-setattr(
+Artifact._delete_skip_storage = _delete_skip_storage
+Artifact._save_skip_storage = _save_skip_storage
+setattr(Artifact, "path", path)
 # this seems a Django-generated function
-delattr(
+delattr(Artifact, "get_visibility_display")