lamindb 0.63.5__py3-none-any.whl → 0.64.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,5 +1,5 @@
  from pathlib import Path, PurePath, PurePosixPath
- from typing import Any, List, Optional, Tuple, Union
+ from typing import Any, Dict, List, Optional, Tuple, Union
 
  import anndata as ad
  import fsspec
@@ -13,7 +13,7 @@ from lamindb_setup.dev import StorageSettings
  from lamindb_setup.dev._docs import doc_args
  from lamindb_setup.dev._hub_utils import get_storage_region
  from lamindb_setup.dev.upath import create_path, extract_suffix_from_path
- from lnschema_core import Feature, FeatureSet, File, Run, Storage
+ from lnschema_core import Artifact, Feature, FeatureSet, Run, Storage
  from lnschema_core.models import IsTree
  from lnschema_core.types import (
      AnnDataLike,
@@ -26,7 +26,7 @@ from lnschema_core.types import (
  from lamindb._utils import attach_func_to_class_method
  from lamindb.dev._data import _track_run_input
  from lamindb.dev._settings import settings
- from lamindb.dev.hashing import b16_to_b64, hash_file
+ from lamindb.dev.hashing import b16_to_b64, hash_file, hash_md5s_from_dir
  from lamindb.dev.storage import (
      LocalPathClasses,
      UPath,
@@ -38,9 +38,9 @@ from lamindb.dev.storage import (
  )
  from lamindb.dev.storage._backed_access import AnnDataAccessor, BackedAccessor
  from lamindb.dev.storage.file import (
-     auto_storage_key_from_file,
-     auto_storage_key_from_id_suffix,
-     filepath_from_file,
+     auto_storage_key_from_artifact,
+     auto_storage_key_from_artifact_uid,
+     filepath_from_artifact,
  )
  from lamindb.dev.versioning import get_ids_from_old_version, init_uid
 
@@ -113,11 +113,11 @@ def process_data(
      """Serialize a data object that's provided as file or in memory."""
      # if not overwritten, data gets stored in default storage
      if isinstance(data, (str, Path, UPath)): # PathLike, spelled out
-         filepath = create_path(data)
+         path = create_path(data)
          storage, use_existing_storage_key = process_pathlike(
-             filepath, skip_existence_check=skip_existence_check
+             path, skip_existence_check=skip_existence_check
          )
-         suffix = extract_suffix_from_path(filepath)
+         suffix = extract_suffix_from_path(path)
          memory_rep = None
      elif isinstance(data, (pd.DataFrame, AnnData)): # DataLike, spelled out
          storage = lamindb_setup.settings.storage.record
@@ -136,157 +136,190 @@ def process_data(
                  f" be '{suffix}'."
              )
          cache_name = f"{provisional_uid}{suffix}"
-         filepath = lamindb_setup.settings.storage.cache_dir / cache_name
+         path = lamindb_setup.settings.storage.cache_dir / cache_name
          # Alex: I don't understand the line below
-         if filepath.suffixes == []:
-             filepath = filepath.with_suffix(suffix)
+         if path.suffixes == []:
+             path = path.with_suffix(suffix)
          if suffix not in {".zarr", ".zrad"}:
-             write_to_file(data, filepath)
+             write_to_file(data, path)
          use_existing_storage_key = False
      else:
          raise NotImplementedError(
-             f"Do not know how to create a file object from {data}, pass a filepath"
+             f"Do not know how to create a artifact object from {data}, pass a path"
              " instead!"
          )
-     return memory_rep, filepath, suffix, storage, use_existing_storage_key
+     return memory_rep, path, suffix, storage, use_existing_storage_key
+
+
+ def get_stat_file_cloud(stat: Dict) -> Tuple[int, str, str]:
+     size = stat["size"]
+     # small files
+     if "-" not in stat["ETag"]:
+         # only store hash for non-multipart uploads
+         # we can't rapidly validate multi-part uploaded files client-side
+         # we can add more logic later down-the-road
+         hash = b16_to_b64(stat["ETag"])
+         hash_type = "md5"
+     else:
+         stripped_etag, suffix = stat["ETag"].split("-")
+         suffix = suffix.strip('"')
+         hash = f"{b16_to_b64(stripped_etag)}-{suffix}"
+         hash_type = "md5-n" # this is the S3 chunk-hashing strategy
+     return size, hash, hash_type
+
+
+ def get_stat_dir_s3(path: UPath) -> Tuple[int, str, str, int]:
+     import boto3
+     from lamindb_setup.dev.upath import AWS_CREDENTIALS_PRESENT
 
+     if not AWS_CREDENTIALS_PRESENT:
+         # passing the following param directly to Session() doesn't
+         # work, unfortunately: botocore_session=path.fs.session
+         from botocore import UNSIGNED
+         from botocore.config import Config
 
- def get_hash(
-     filepath: UPath,
-     suffix,
-     filepath_stat=None,
+         config = Config(signature_version=UNSIGNED)
+         s3 = boto3.session.Session().resource("s3", config=config)
+     else:
+         s3 = boto3.session.Session().resource("s3")
+     bucket, key, _ = path.fs.split_path(path.as_posix())
+     # assuming this here is the fastest way of querying for many objects
+     objects = s3.Bucket(bucket).objects.filter(Prefix=key)
+     size = sum([object.size for object in objects])
+     md5s = [
+         # skip leading and trailing quotes
+         object.e_tag[1:-1]
+         for object in objects
+     ]
+     n_objects = len(md5s)
+     hash, hash_type = hash_md5s_from_dir(md5s)
+     return size, hash, hash_type, n_objects
+
+
+ def get_stat_dir_gs(path: UPath) -> Tuple[int, str, str, int]:
+     import google.cloud.storage as gc_storage
+
+     bucket, key, _ = path.fs.split_path(path.as_posix())
+     # assuming this here is the fastest way of querying for many objects
+     client = gc_storage.Client(
+         credentials=path.fs.credentials.credentials, project=path.fs.project
+     )
+     objects = client.Bucket(bucket).list_blobs(prefix=key)
+     sizes, md5s = [], []
+     for object in objects:
+         sizes.append(object.size)
+         md5s.append(object.md5_hash)
+     n_objects = len(md5s)
+     hash, hash_type = hash_md5s_from_dir(md5s)
+     return sum(sizes), hash, hash_type, n_objects
+
+
+ def get_stat_or_artifact(
+     path: UPath,
+     suffix: str,
+     memory_rep: Optional[Any] = None,
      check_hash: bool = True,
- ) -> Union[Tuple[Optional[str], Optional[str]], File]:
-     if suffix in {".zarr", ".zrad"}:
-         return None
-     if not isinstance(filepath, LocalPathClasses):
-         stat = filepath_stat
-         if stat is not None and "ETag" in stat:
-             # small files
-             if "-" not in stat["ETag"]:
-                 # only store hash for non-multipart uploads
-                 # we can't rapidly validate multi-part uploaded files client-side
-                 # we can add more logic later down-the-road
-                 hash = b16_to_b64(stat["ETag"])
-                 hash_type = "md5"
-             else:
-                 stripped_etag, suffix = stat["ETag"].split("-")
-                 suffix = suffix.strip('"')
-                 hash = f"{b16_to_b64(stripped_etag)}-{suffix}"
-                 hash_type = "md5-n" # this is the S3 chunk-hashing strategy
-         else:
-             logger.warning(f"did not add hash for {filepath}")
-             return None, None
+ ) -> Union[Tuple[int, Optional[str], Optional[str], Optional[int]], Artifact]:
+     n_objects = None
+     if settings.upon_file_create_skip_size_hash:
+         return None, None, None, n_objects
+     if (
+         suffix in {".zarr", ".zrad"}
+         and memory_rep is not None
+         and isinstance(memory_rep, AnnData)
+     ):
+         size = size_adata(memory_rep)
+         return size, None, None, n_objects
+     stat = path.stat() # one network request
+     if not isinstance(path, LocalPathClasses):
+         size, hash, hash_type = None, None, None
+         if stat is not None:
+             if "ETag" in stat: # is file
+                 size, hash, hash_type = get_stat_file_cloud(stat)
+             elif path.is_dir():
+                 if path.protocol == "s3":
+                     size, hash, hash_type, n_objects = get_stat_dir_s3(path)
+                 elif path.protocol == "gs":
+                     size, hash, hash_type, n_objects = get_stat_dir_gs(path)
+         if hash is None:
+             logger.warning(f"did not add hash for {path}")
+         return size, hash, hash_type, n_objects
      else:
-         hash, hash_type = hash_file(filepath)
+         if path.is_dir():
+             md5s = []
+             size = 0
+             for subpath in path.rglob("*"):
+                 if not subpath.is_file():
+                     continue
+                 size += subpath.stat().st_size
+                 md5s.append(hash_file(subpath)[0])
+             hash, hash_type = hash_md5s_from_dir(md5s)
+             n_objects = len(md5s)
+         else:
+             hash, hash_type = hash_file(path)
+             size = stat.st_size
      if not check_hash:
-         return hash, hash_type
+         return size, hash, hash_type, n_objects
      # also checks hidden and trashed files
-     result = File.filter(hash=hash, visibility=None).list()
+     result = Artifact.filter(hash=hash, visibility=None).list()
      if len(result) > 0:
-         if settings.upon_file_create_if_hash_exists == "error":
-             msg = f"file with same hash exists: {result[0]}"
+         if settings.upon_artifact_create_if_hash_exists == "error":
+             msg = f"artifact with same hash exists: {result[0]}"
              hint = (
                  "💡 you can make this error a warning:\n"
-                 " ln.settings.upon_file_create_if_hash_exists"
+                 " ln.settings.upon_artifact_create_if_hash_exists"
              )
              raise RuntimeError(f"{msg}\n{hint}")
-         elif settings.upon_file_create_if_hash_exists == "warn_create_new":
+         elif settings.upon_artifact_create_if_hash_exists == "warn_create_new":
              logger.warning(
-                 "creating new File object despite existing file with same hash:"
+                 "creating new Artifact object despite existing artifact with same hash:"
                  f" {result[0]}"
             )
-             return hash, hash_type
+             return size, hash, hash_type, n_objects
          else:
-             logger.warning(f"returning existing file with same hash: {result[0]}")
+             logger.warning(f"returning existing artifact with same hash: {result[0]}")
              if result[0].visibility < 1:
                  if result[0].visibility == -1:
                      visibility_text = "in the trash"
                  elif result[0].visibility == 0:
                      visibility_text = "hidden"
                  logger.warning(
-                     f"the existing file is {visibility_text}, restore it before use:"
-                     " `file.restore()`"
+                     f"the existing artifact is {visibility_text}, restore it before"
+                     " use: `artifact.restore()`"
                  )
              return result[0]
      else:
-         return hash, hash_type
+         return size, hash, hash_type, n_objects
 
 
- def get_path_size_hash(
-     filepath: UPath,
-     memory_rep: Optional[Union[pd.DataFrame, AnnData]],
-     suffix: str,
-     check_hash: bool = True,
- ):
-     cloudpath = None
-     localpath = None
-     hash_and_type: Tuple[Optional[str], Optional[str]]
-
-     if suffix in {".zarr", ".zrad"}:
-         if memory_rep is not None:
-             size = size_adata(memory_rep)
-         else:
-             if not isinstance(filepath, LocalPathClasses):
-                 cloudpath = filepath
-                 # todo: properly calculate size
-                 size = 0
-             else:
-                 localpath = filepath
-                 size = sum(
-                     f.stat().st_size for f in filepath.rglob("*") if f.is_file() # type: ignore # noqa
-                 )
-         hash_and_type = None, None
-     else:
-         # to accelerate ingesting high numbers of files
-         if settings.upon_file_create_skip_size_hash:
-             size = None
-             hash_and_type = None, None
-         else:
-             filepath_stat = filepath.stat()
-             if not isinstance(filepath, LocalPathClasses):
-                 size = filepath_stat["size"]
-                 cloudpath = filepath
-                 hash_and_type = None, None
-             else:
-                 size = filepath_stat.st_size # type: ignore
-                 localpath = filepath
-                 hash_and_type = get_hash(
-                     filepath, suffix, filepath_stat=filepath_stat, check_hash=check_hash
-                 )
-     return localpath, cloudpath, size, hash_and_type
-
-
- def check_path_in_existing_storage(
-     filepath: Union[Path, UPath]
- ) -> Union[Storage, bool]:
+ def check_path_in_existing_storage(path: Union[Path, UPath]) -> Union[Storage, bool]:
      for storage in Storage.filter().all():
          # if path is part of storage, return it
-         if check_path_is_child_of_root(filepath, root=create_path(storage.root)):
+         if check_path_is_child_of_root(path, root=create_path(storage.root)):
              return storage
      return False
 
 
  def check_path_is_child_of_root(
-     filepath: Union[Path, UPath], root: Optional[Union[Path, UPath]] = None
+     path: Union[Path, UPath], root: Optional[Union[Path, UPath]] = None
  ) -> bool:
      if root is None:
          root = lamindb_setup.settings.storage.root
 
-     filepath = UPath(str(filepath)) if not isinstance(filepath, UPath) else filepath
+     path = UPath(str(path)) if not isinstance(path, UPath) else path
      root = UPath(str(root)) if not isinstance(root, UPath) else root
 
      # the following comparisons can fail if types aren't comparable
-     if not isinstance(filepath, LocalPathClasses) and not isinstance(
+     if not isinstance(path, LocalPathClasses) and not isinstance(
          root, LocalPathClasses
      ):
          # the following tests equivalency of two UPath objects
         # via string representations; otherwise
          # S3Path('s3://lndb-storage/') and S3Path('s3://lamindb-ci/')
          # test as equivalent
-         return list(filepath.parents)[-1].as_posix() == root.as_posix()
-     elif isinstance(filepath, LocalPathClasses) and isinstance(root, LocalPathClasses):
-         return root.resolve() in filepath.resolve().parents
+         return list(path.parents)[-1].as_posix() == root.as_posix()
+     elif isinstance(path, LocalPathClasses) and isinstance(root, LocalPathClasses):
+         return root.resolve() in path.resolve().parents
      else:
          return False
 
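Note on the new hashing helpers above: `get_stat_file_cloud` picks the hash type from the S3 ETag format — a plain ETag is the object's MD5, while a multipart-upload ETag has the form `<digest>-<n_parts>` and is recorded as `md5-n`. A minimal, self-contained sketch of that branching (the `b16_to_b64` stand-in and the example ETags are illustrative, not lamindb's actual implementation):

```python
import base64
from typing import Dict, Tuple


def b16_to_b64_sketch(hex_digest: str) -> str:
    # rough stand-in for lamindb.dev.hashing.b16_to_b64:
    # re-encode a hex (base16) MD5 digest as base64
    return base64.b64encode(bytes.fromhex(hex_digest)).decode()


def etag_to_hash(stat: Dict) -> Tuple[int, str, str]:
    # mirrors the branching in get_stat_file_cloud: single-part uploads carry a
    # plain MD5 ETag, multipart uploads carry "<md5-of-part-md5s>-<n_parts>"
    size, etag = stat["size"], stat["ETag"].strip('"')
    if "-" not in etag:
        return size, b16_to_b64_sketch(etag), "md5"
    digest, n_parts = etag.split("-")
    return size, f"{b16_to_b64_sketch(digest)}-{n_parts}", "md5-n"


# a single-part vs. a 5-part multipart upload (made-up values)
print(etag_to_hash({"size": 1024, "ETag": '"9e107d9d372bb6826bd81d3542a419d6"'}))
print(etag_to_hash({"size": 50_000_000, "ETag": '"a54357aff0632cce46d942af68356b38-5"'}))
```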
@@ -297,7 +330,7 @@ def get_relative_path_to_directory(
      if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
          # UPath.relative_to() is not behaving as it should (2023-04-07)
          # need to lstrip otherwise inconsistent behavior across trailing slashes
-         # see test_file.py: test_get_relative_path_to_directory
+         # see test_artifact.py: test_get_relative_path_to_directory
          relpath = PurePath(
              path.as_posix().replace(directory.as_posix(), "").lstrip("/")
          )
@@ -310,7 +343,7 @@ def get_relative_path_to_directory(
      return relpath
 
 
- def get_file_kwargs_from_data(
+ def get_artifact_kwargs_from_data(
      *,
      data: Union[Path, UPath, str, pd.DataFrame, AnnData],
      key: Optional[str],
@@ -320,25 +353,23 @@ def get_file_kwargs_from_data(
      skip_check_exists: bool = False,
  ):
      run = get_run(run)
-     memory_rep, filepath, suffix, storage, use_existing_storage_key = process_data(
+     memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
          provisional_uid, data, format, key, skip_check_exists
      )
-     # the following will return a localpath that is not None if filepath is local
-     # it will return a cloudpath that is not None if filepath is on the cloud
-     local_filepath, cloud_filepath, size, hash_and_type = get_path_size_hash(
-         filepath,
-         memory_rep,
-         suffix,
+     stat_or_artifact = get_stat_or_artifact(
+         path=path,
+         suffix=suffix,
+         memory_rep=memory_rep,
      )
-     if isinstance(hash_and_type, File):
-         return hash_and_type, None
+     if isinstance(stat_or_artifact, Artifact):
+         return stat_or_artifact, None
      else:
-         hash, hash_type = hash_and_type
+         size, hash, hash_type, n_objects = stat_or_artifact
 
      check_path_in_storage = False
      if use_existing_storage_key:
          inferred_key = get_relative_path_to_directory(
-             path=filepath, directory=storage.root_as_path()
+             path=path, directory=storage.root_as_path()
          ).as_posix()
          if key is None:
              key = inferred_key
@@ -363,10 +394,11 @@
          key=key,
          uid=provisional_uid,
          suffix=suffix,
+         is_dir=n_objects is not None,
      )
 
      # do we use a virtual or an actual storage key?
-     key_is_virtual = settings.file_use_virtual_keys
+     key_is_virtual = settings.artifact_use_virtual_keys
 
      # if the file is already in storage, independent of the default
      # we use an actual storage key
@@ -383,17 +415,24 @@
          # passing both the id and the object
          # to make them both available immediately
          # after object creation
+         n_objects=n_objects,
+         n_observations=None, # to implement
          run_id=run.id if run is not None else None,
          run=run,
          key_is_virtual=key_is_virtual,
      )
+     if not isinstance(path, LocalPathClasses):
+         local_filepath = None
+         cloud_filepath = path
+     else:
+         local_filepath = path
+         cloud_filepath = None
      privates = dict(
          local_filepath=local_filepath,
          cloud_filepath=cloud_filepath,
          memory_rep=memory_rep,
          check_path_in_storage=check_path_in_storage,
      )
-
      return kwargs, privates
 
 
@@ -404,6 +443,7 @@ def log_storage_hint(
      key: Optional[str],
      uid: str,
      suffix: str,
+     is_dir: bool,
  ) -> None:
      hint = ""
      if check_path_in_storage:
@@ -415,11 +455,11 @@
              if check_path_is_child_of_root(root_path, Path.cwd()):
                  # only display the relative path, not the fully resolved path
                  display_root = root_path.relative_to(Path.cwd())
-         hint += f"file in storage '{display_root}'" # type: ignore
+         hint += f"path in storage '{display_root}'" # type: ignore
      else:
-         hint += "file will be copied to default storage upon `save()`"
+         hint += "path content will be copied to default storage upon `save()`"
      if key is None:
-         storage_key = auto_storage_key_from_id_suffix(uid, suffix)
+         storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
          hint += f" with key `None` ('{storage_key}')"
      else:
          hint += f" with key '{key}'"
@@ -447,17 +487,17 @@ def data_is_mudata(data: DataLike): # pragma: no cover
      return False
 
 
- def __init__(file: File, *args, **kwargs):
+ def __init__(artifact: Artifact, *args, **kwargs):
      # Below checks for the Django-internal call in from_db()
-     # it'd be better if we could avoid this, but not being able to create a File
+     # it'd be better if we could avoid this, but not being able to create a Artifact
      # from data with the default constructor renders the central class of the API
      # essentially useless
      # The danger below is not that a user might pass as many args (12 of it), but rather
      # that at some point the Django API might change; on the other hand, this
      # condition of for calling the constructor based on kwargs should always
      # stay robust
-     if len(args) == len(file._meta.concrete_fields):
-         super(File, file).__init__(*args, **kwargs)
+     if len(args) == len(artifact._meta.concrete_fields):
+         super(Artifact, artifact).__init__(*args, **kwargs)
          return None
      # now we proceed with the user-facing constructor
      if len(args) > 1:
@@ -468,7 +508,7 @@ def __init__(file: File, *args, **kwargs):
      description: Optional[str] = (
          kwargs.pop("description") if "description" in kwargs else None
      )
-     is_new_version_of: Optional[File] = (
+     is_new_version_of: Optional[Artifact] = (
          kwargs.pop("is_new_version_of") if "is_new_version_of" in kwargs else None
      )
      initial_version_id: Optional[int] = (
@@ -495,8 +535,8 @@ def __init__(file: File, *args, **kwargs):
      if is_new_version_of is None:
          provisional_uid = init_uid(version=version, n_full_id=20)
      else:
-         if not isinstance(is_new_version_of, File):
-             raise TypeError("is_new_version_of has to be of type ln.File")
+         if not isinstance(is_new_version_of, Artifact):
+             raise TypeError("is_new_version_of has to be of type ln.Artifact")
          provisional_uid, initial_version_id, version = get_ids_from_old_version(
              is_new_version_of, version, n_full_id=20
          )
@@ -507,9 +547,9 @@
          if initial_version_id is None:
              logger.info(
                  "initializing versioning for this file! create future versions of it"
-                 " using ln.File(..., is_new_version_of=old_file)"
+                 " using ln.Artifact(..., is_new_version_of=old_file)"
              )
-     kwargs_or_file, privates = get_file_kwargs_from_data(
+     kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
          data=data,
          key=key,
          run=run,
@@ -519,14 +559,14 @@
      )
 
      # an object with the same hash already exists
-     if isinstance(kwargs_or_file, File):
+     if isinstance(kwargs_or_artifact, Artifact):
          from ._registry import init_self_from_db
 
-         # kwargs_or_file is an existing file
-         init_self_from_db(file, kwargs_or_file)
+         # kwargs_or_artifact is an existing file
+         init_self_from_db(artifact, kwargs_or_artifact)
          return None
      else:
-         kwargs = kwargs_or_file
+         kwargs = kwargs_or_artifact
 
      if isinstance(data, pd.DataFrame):
          if log_hint:
@@ -551,7 +591,7 @@
      kwargs["description"] = description
      kwargs["visibility"] = visibility
      # this check needs to come down here because key might be populated from an
-     # existing file path during get_file_kwargs_from_data()
+     # existing file path during get_artifact_kwargs_from_data()
      if (
          kwargs["key"] is None
          and kwargs["description"] is None
@@ -562,16 +602,16 @@
      add_transform_to_kwargs(kwargs, kwargs["run"])
 
      if data is not None:
-         file._local_filepath = privates["local_filepath"]
-         file._cloud_filepath = privates["cloud_filepath"]
-         file._memory_rep = privates["memory_rep"]
-         file._to_store = not privates["check_path_in_storage"]
+         artifact._local_filepath = privates["local_filepath"]
+         artifact._cloud_filepath = privates["cloud_filepath"]
+         artifact._memory_rep = privates["memory_rep"]
+         artifact._to_store = not privates["check_path_in_storage"]
 
-     super(File, file).__init__(**kwargs)
+     super(Artifact, artifact).__init__(**kwargs)
 
 
  @classmethod # type: ignore
- @doc_args(File.from_df.__doc__)
+ @doc_args(Artifact.from_df.__doc__)
  def from_df(
      cls,
      df: "pd.DataFrame",
@@ -580,11 +620,11 @@ def from_df(
      description: Optional[str] = None,
      run: Optional[Run] = None,
      version: Optional[str] = None,
-     is_new_version_of: Optional["File"] = None,
+     is_new_version_of: Optional["Artifact"] = None,
      **kwargs,
- ) -> "File":
+ ) -> "Artifact":
      """{}"""
-     file = File(
+     artifact = Artifact(
          data=df,
          key=key,
          run=run,
@@ -595,10 +635,10 @@
      )
      feature_set = FeatureSet.from_df(df, field=field, **kwargs)
      if feature_set is not None:
-         file._feature_sets = {"columns": feature_set}
+         artifact._feature_sets = {"columns": feature_set}
      else:
-         file._feature_sets = {}
-     return file
+         artifact._feature_sets = {}
+     return artifact
 
 
  def parse_feature_sets_from_anndata(
@@ -646,7 +686,7 @@ def parse_feature_sets_from_anndata(
 
 
  @classmethod # type: ignore
- @doc_args(File.from_anndata.__doc__)
+ @doc_args(Artifact.from_anndata.__doc__)
  def from_anndata(
      cls,
      adata: "AnnDataLike",
@@ -655,11 +695,11 @@ def from_anndata(
      description: Optional[str] = None,
      run: Optional[Run] = None,
      version: Optional[str] = None,
-     is_new_version_of: Optional["File"] = None,
+     is_new_version_of: Optional["Artifact"] = None,
      **kwargs,
- ) -> "File":
+ ) -> "Artifact":
      """{}"""
-     file = File(
+     artifact = Artifact(
          data=adata,
          key=key,
          run=run,
@@ -668,20 +708,24 @@
          is_new_version_of=is_new_version_of,
          log_hint=False,
      )
-     file._feature_sets = parse_feature_sets_from_anndata(adata, field, **kwargs)
-     return file
+     artifact._feature_sets = parse_feature_sets_from_anndata(adata, field, **kwargs)
+     return artifact
 
 
  @classmethod # type: ignore
- @doc_args(File.from_dir.__doc__)
+ @doc_args(Artifact.from_dir.__doc__)
  def from_dir(
      cls,
      path: PathLike,
      key: Optional[str] = None,
      *,
      run: Optional[Run] = None,
- ) -> List["File"]:
+ ) -> List["Artifact"]:
      """{}"""
+     logger.warning(
+         "this creates one artifact per file in the directory - you might simply call"
+         " ln.Artifact(dir) to get one artifact for the entire directory"
+     )
      folderpath: UPath = create_path(path) # returns Path for local
      storage, use_existing_storage = process_pathlike(folderpath)
      folder_key_path: Union[PurePath, Path]
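Note on the warning added to `from_dir` above: with directory-level hashing now available (`hash_md5s_from_dir`, `n_objects`), there are two ways to register a folder. A hedged usage sketch — the bucket path is a placeholder and the snippet assumes the public `lamindb` namespace exposes these entry points as shown in this diff:

```python
import lamindb as ln

# one Artifact record per file under the prefix — triggers the warning added above
per_file_artifacts = ln.Artifact.from_dir("s3://my-bucket/scrnaseq/batch1/")
ln.save(per_file_artifacts)

# one Artifact record for the entire directory, sized and hashed via the
# aggregated per-object MD5s introduced in this release (n_objects is populated)
whole_dir = ln.Artifact("s3://my-bucket/scrnaseq/batch1/", description="batch1 raw files")
whole_dir.save()
```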
@@ -703,7 +747,7 @@
      # always sanitize by stripping a trailing slash
      folder_key = folder_key_path.as_posix().rstrip("/")
 
-     # TODO: (non-local) UPath doesn't list the first level files and dirs with "*"
+     # TODO: (non-local) UPath doesn't list the first level artifacts and dirs with "*"
      pattern = "" if not isinstance(folderpath, LocalPathClasses) else "*"
 
      # silence fine-grained logging
@@ -711,51 +755,59 @@
      verbosity_int = settings._verbosity_int
      if verbosity_int >= 1:
          settings.verbosity = "warning"
-     files_dict = {}
+     artifacts_dict = {}
      for filepath in folderpath.rglob(pattern):
          if filepath.is_file():
              relative_path = get_relative_path_to_directory(filepath, folderpath)
-             file_key = folder_key + "/" + relative_path.as_posix()
+             artifact_key = folder_key + "/" + relative_path.as_posix()
              # if creating from rglob, we don't need to check for existence
-             file = File(filepath, run=run, key=file_key, skip_check_exists=True)
-             files_dict[file.uid] = file
+             artifact = Artifact(
+                 filepath, run=run, key=artifact_key, skip_check_exists=True
+             )
+             artifacts_dict[artifact.uid] = artifact
      settings.verbosity = verbosity
 
      # run sanity check on hashes
-     hashes = [file.hash for file in files_dict.values() if file.hash is not None]
-     uids = files_dict.keys()
+     hashes = [
+         artifact.hash
+         for artifact in artifacts_dict.values()
+         if artifact.hash is not None
+     ]
+     uids = artifacts_dict.keys()
      if len(set(hashes)) == len(hashes):
-         files = list(files_dict.values())
+         artifacts = list(artifacts_dict.values())
      else:
          # consider exact duplicates (same id, same hash)
-         # below can't happen anymore because files is a dict now
+         # below can't happen anymore because artifacts is a dict now
          # if len(set(uids)) == len(set(hashes)):
-         # logger.warning("dropping duplicate records in list of file records")
-         # files = list(set(uids))
+         # logger.warning("dropping duplicate records in list of artifact records")
+         # artifacts = list(set(uids))
          # consider false duplicates (different id, same hash)
          if not len(set(uids)) == len(set(hashes)):
              seen_hashes = set()
-             non_unique_files = {
-                 hash: file
-                 for hash, file in files_dict.items()
-                 if file.hash in seen_hashes or seen_hashes.add(file.hash) # type: ignore # noqa
+             non_unique_artifacts = {
+                 hash: artifact
+                 for hash, artifact in artifacts_dict.items()
+                 if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore # noqa
              }
-             display_non_unique = "\n ".join(f"{file}" for file in non_unique_files)
+             display_non_unique = "\n ".join(
+                 f"{artifact}" for artifact in non_unique_artifacts
+             )
              logger.warning(
-                 "there are multiple file uids with the same hashes, dropping"
-                 f" {len(non_unique_files)} duplicates out of {len(files_dict)} files:\n"
-                 f" {display_non_unique}"
+                 "there are multiple artifact uids with the same hashes, dropping"
+                 f" {len(non_unique_artifacts)} duplicates out of"
+                 f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
              )
-             files = [
-                 file
-                 for file in files_dict.values()
-                 if file not in non_unique_files.values()
+             artifacts = [
+                 artifact
+                 for artifact in artifacts_dict.values()
+                 if artifact not in non_unique_artifacts.values()
              ]
      logger.success(
-         f"created {len(files)} files from directory using storage"
+         f"created {len(artifacts)} artifacts from directory using storage"
          f" {storage.root} and key = {folder_key}/"
      )
-     return files
+     return artifacts
 
 
  # docstring handled through attach_func_to_class_method
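Note on the duplicate handling above: it relies on the `x in seen or seen.add(x)` idiom — `set.add` returns `None`, so the comprehension keeps only the second and later occurrences of a hash. A standalone illustration with plain strings:

```python
hashes = ["aaa", "bbb", "aaa", "ccc", "bbb"]
seen = set()
# keeps only repeat occurrences, exactly what non_unique_artifacts collects above
repeats = [h for h in hashes if h in seen or seen.add(h)]
print(repeats)  # ['aaa', 'bbb']
```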
@@ -765,7 +817,7 @@ def replace(
      run: Optional[Run] = None,
      format: Optional[str] = None,
  ) -> None:
-     kwargs, privates = get_file_kwargs_from_data(
+     kwargs, privates = get_artifact_kwargs_from_data(
          provisional_uid=self.uid,
          data=data,
          key=self.key,
@@ -773,7 +825,7 @@
          format=format,
      )
 
-     # this file already exists
+     # this artifact already exists
      if privates is None:
          return kwargs
 
@@ -793,8 +845,11 @@ def replace(
              f" and delete '{key_path}' upon `save()`"
          )
      else:
-         old_storage = auto_storage_key_from_file(self)
-         new_storage = auto_storage_key_from_id_suffix(self.uid, kwargs["suffix"])
+         old_storage = auto_storage_key_from_artifact(self)
+         is_dir = self.n_objects is not None
+         new_storage = auto_storage_key_from_artifact_uid(
+             self.uid, kwargs["suffix"], is_dir
+         )
          if old_storage != new_storage:
              self._clear_storagekey = old_storage
              if self.key is not None:
@@ -822,8 +877,8 @@ def backed(
      suffixes = (".h5", ".hdf5", ".h5ad", ".zrad", ".zarr")
      if self.suffix not in suffixes:
          raise ValueError(
-             "File should have a zarr or h5 object as the underlying data, please use"
-             " one of the following suffixes for the object name:"
+             "Artifact should have a zarr or h5 object as the underlying data, please"
+             " use one of the following suffixes for the object name:"
              f" {', '.join(suffixes)}."
          )
 
@@ -831,7 +886,7 @@
 
      _track_run_input(self, is_run_input)
 
-     filepath = filepath_from_file(self)
+     filepath = filepath_from_artifact(self)
      # consider the case where an object is already locally cached
      localpath = setup_settings.instance.storage.cloud_to_local_no_update(filepath)
      if localpath.exists():
@@ -847,7 +902,7 @@ def load(
      _track_run_input(self, is_run_input)
      if hasattr(self, "_memory_rep") and self._memory_rep is not None:
          return self._memory_rep
-     return load_to_memory(filepath_from_file(self), stream=stream, **kwargs)
+     return load_to_memory(filepath_from_artifact(self), stream=stream, **kwargs)
 
 
  # docstring handled through attach_func_to_class_method
@@ -856,7 +911,7 @@ def stage(self, is_run_input: Optional[bool] = None) -> Path:
          raise RuntimeError("zarr object can't be staged, please use load() or stream()")
      _track_run_input(self, is_run_input)
 
-     filepath = filepath_from_file(self)
+     filepath = filepath_from_artifact(self)
      return setup_settings.instance.storage.cloud_to_local(filepath, print_progress=True)
 
 
@@ -864,21 +919,21 @@
  def delete(
      self, permanent: Optional[bool] = None, storage: Optional[bool] = None
  ) -> None:
-     # by default, we only move files into the trash
+     # by default, we only move artifacts into the trash
      if self.visibility > VisibilityChoice.trash.value and permanent is not True:
          if storage is not None:
-             logger.warning("moving file to trash, storage arg is ignored")
+             logger.warning("moving artifact to trash, storage arg is ignored")
          # move to trash
          self.visibility = VisibilityChoice.trash.value
          self.save()
-         logger.warning("moved file to trash")
+         logger.warning("moved artifact to trash")
          return
 
-     # if the file is already in the trash
+     # if the artifact is already in the trash
      # permanent delete skips the trash
      if permanent is None:
          response = input(
-             "File record is already in trash! Are you sure you want to permanently"
+             "Artifact record is already in trash! Are you sure you want to permanently"
              " delete it? (y/n) You can't undo this action."
          )
          delete_record = response == "y"
@@ -896,7 +951,7 @@ def delete(
              if storage is not None:
                  logger.warning("storage arg is ignored if storage key is non-semantic")
          else:
-             # for files with non-virtual semantic storage keys (key is not None)
+             # for artifacts with non-virtual semantic storage keys (key is not None)
              # ask for extra-confirmation
              if storage is None:
                  response = input(
@@ -913,8 +968,8 @@
              logger.success(f"deleted {colors.yellow(f'{filepath}')}")
 
 
- def _delete_skip_storage(file, *args, **kwargs) -> None:
-     super(File, file).delete(*args, **kwargs)
+ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
+     super(Artifact, artifact).delete(*args, **kwargs)
 
 
  # docstring handled through attach_func_to_class_method
@@ -933,15 +988,15 @@ def save(self, *args, **kwargs) -> None:
 
  def _save_skip_storage(file, *args, **kwargs) -> None:
      save_feature_sets(file)
-     super(File, file).save(*args, **kwargs)
+     super(Artifact, file).save(*args, **kwargs)
      save_feature_set_links(file)
 
 
  @property # type: ignore
- @doc_args(File.path.__doc__)
+ @doc_args(Artifact.path.__doc__)
  def path(self) -> Union[Path, UPath]:
      """{}"""
-     return filepath_from_file(self)
+     return filepath_from_artifact(self)
 
 
  @classmethod # type: ignore
@@ -990,17 +1045,17 @@ if _TESTING:
      from inspect import signature
 
      SIGS = {
-         name: signature(getattr(File, name))
+         name: signature(getattr(Artifact, name))
          for name in METHOD_NAMES
          if name != "__init__"
      }
 
  for name in METHOD_NAMES:
-     attach_func_to_class_method(name, File, globals())
+     attach_func_to_class_method(name, Artifact, globals())
 
  # privates currently dealt with separately
- File._delete_skip_storage = _delete_skip_storage
- File._save_skip_storage = _save_skip_storage
- setattr(File, "path", path)
+ Artifact._delete_skip_storage = _delete_skip_storage
+ Artifact._save_skip_storage = _save_skip_storage
+ setattr(Artifact, "path", path)
  # this seems a Django-generated function
- delattr(File, "get_visibility_display")
+ delattr(Artifact, "get_visibility_display")
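Note: taken together, the changes in this file amount to renaming the central `File` registry to `Artifact`, along with its settings and storage helpers (`upon_file_create_if_hash_exists` → `upon_artifact_create_if_hash_exists`, `file_use_virtual_keys` → `artifact_use_virtual_keys`, `filepath_from_file` → `filepath_from_artifact`, and so on), plus directory-aware size/hash bookkeeping (`n_objects`, `n_observations`). A before/after sketch of user-facing code, assuming the public `lamindb` namespace mirrors these renames:

```python
import lamindb as ln
import pandas as pd

df = pd.DataFrame({"cell_id": ["c1", "c2"], "n_genes": [120, 98]})

# lamindb 0.63.x (old API):
#   file = ln.File.from_df(df, description="toy counts")
#   file.save()

# lamindb 0.64.x:
ln.settings.upon_artifact_create_if_hash_exists = "warn_create_new"  # was upon_file_create_if_hash_exists
artifact = ln.Artifact.from_df(df, description="toy counts")
artifact.save()
```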