lamindb 0.45.0__py3-none-any.whl → 0.46a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_file.py CHANGED
@@ -1,33 +1,53 @@
1
+ from itertools import islice
1
2
  from pathlib import Path, PurePath, PurePosixPath
2
- from typing import Any, Optional, Tuple, Union
3
+ from typing import Any, Dict, List, Optional, Tuple, Union, overload # noqa
3
4
 
4
5
  import lamindb_setup
5
6
  import pandas as pd
6
7
  from anndata import AnnData
7
8
  from appdirs import AppDirs
8
- from lamin_logger import logger
9
- from lnschema_core import File, Run, ids
9
+ from django.db.models.query_utils import DeferredAttribute as Field
10
+ from lamin_logger import colors, logger
11
+ from lamindb_setup import settings as setup_settings
12
+ from lamindb_setup.dev._docs import doc_args
13
+ from lnschema_core import FeatureSet, File, Run, ids
10
14
  from lnschema_core.types import DataLike, PathLike
11
15
 
12
- from lamindb._file_access import auto_storage_key_from_file
16
+ from lamindb._context import context
13
17
  from lamindb.dev._settings import settings
14
- from lamindb.dev.hashing import hash_file
15
- from lamindb.dev.storage import UPath, infer_suffix, size_adata, write_to_file
18
+ from lamindb.dev.hashing import b16_to_b64, hash_file
19
+ from lamindb.dev.storage import (
20
+ UPath,
21
+ delete_storage,
22
+ infer_suffix,
23
+ load_to_memory,
24
+ size_adata,
25
+ write_to_file,
26
+ )
27
+ from lamindb.dev.storage.file import auto_storage_key_from_file, filepath_from_file
28
+ from lamindb.dev.utils import attach_func_to_class_method
16
29
 
17
- from ._file_access import AUTO_KEY_PREFIX
30
+ from . import _TESTING
31
+ from .dev.storage.file import AUTO_KEY_PREFIX
18
32
 
19
- DIRS = AppDirs("lamindb", "laminlabs")
33
+ try:
34
+ from lamindb.dev.storage._backed_access import AnnDataAccessor, BackedAccessor
35
+ except ImportError:
36
+
37
+ class AnnDataAccessor: # type: ignore
38
+ pass
20
39
 
21
- NO_NAME_ERROR = """\
22
- Pass a name or key in `ln.File(...)`.
23
- """
40
+ class BackedAccessor: # type: ignore
41
+ pass
42
+
43
+
44
+ DIRS = AppDirs("lamindb", "laminlabs")
24
45
 
25
46
 
26
47
  def serialize(
48
+ provisional_id: str,
27
49
  data: Union[Path, UPath, str, pd.DataFrame, AnnData],
28
- name: Optional[str],
29
50
  format,
30
- key: Optional[str],
31
51
  ) -> Tuple[Any, Union[Path, UPath], str]:
32
52
  """Serialize a data object that's provided as file or in memory."""
33
53
  # Convert str to either Path or UPath
@@ -60,13 +80,7 @@ def serialize(
60
80
  elif isinstance(data, (pd.DataFrame, AnnData)):
61
81
  memory_rep = data
62
82
  suffix = infer_suffix(data, format)
63
- # the following filepath is always local
64
- if name is None and key is None:
65
- raise ValueError(NO_NAME_ERROR)
66
- elif name is not None:
67
- cache_name = name
68
- else:
69
- cache_name = Path(key).name
83
+ cache_name = f"{provisional_id}{suffix}"
70
84
  if lamindb_setup.settings.storage.cache_dir is not None:
71
85
  filepath = lamindb_setup.settings.storage.cache_dir / cache_name
72
86
  else:
@@ -74,21 +88,30 @@ def serialize(
74
88
  cache_dir = Path(DIRS.user_cache_dir)
75
89
  cache_dir.mkdir(parents=True, exist_ok=True)
76
90
  filepath = cache_dir / cache_name
91
+ # Alex: I don't understand the line below
77
92
  if filepath.suffixes == []:
78
93
  filepath = filepath.with_suffix(suffix)
79
94
  if suffix != ".zarr":
80
95
  write_to_file(data, filepath)
81
96
  else:
82
- raise NotImplementedError("Recording not yet implemented for this type.")
97
+ raise NotImplementedError(
98
+ f"Do not know how to create a file object from {data}, pass a filepath"
99
+ " instead!"
100
+ )
83
101
  return memory_rep, filepath, suffix
84
102
 
85
103
 
86
- def get_hash(
87
- local_filepath, suffix, check_hash: bool = True
88
- ) -> Optional[Union[str, File]]:
104
+ def get_hash(filepath, suffix, check_hash: bool = True) -> Optional[Union[str, File]]:
89
105
  if suffix in {".zarr", ".zrad"}:
90
106
  return None
91
- hash = hash_file(local_filepath)
107
+ if isinstance(filepath, UPath):
108
+ stat = filepath.stat()
109
+ if "ETag" in stat:
110
+ hash = b16_to_b64(stat["ETag"])
111
+ else:
112
+ logger.warning(f"Did not find hash for filepath {filepath}")
113
+ else:
114
+ hash = hash_file(filepath)
92
115
  if not check_hash:
93
116
  return hash
94
117
  result = File.select(hash=hash).list()
@@ -165,7 +188,7 @@ def get_path_size_hash(
165
188
  else:
166
189
  size = filepath.stat().st_size # type: ignore
167
190
  localpath = filepath
168
- hash = get_hash(filepath, suffix, check_hash=check_hash)
191
+ hash = get_hash(filepath, suffix, check_hash=check_hash)
169
192
 
170
193
  return localpath, cloudpath, size, hash
171
194
 
@@ -213,17 +236,17 @@ def get_relative_path_to_root(
213
236
  return get_relative_path_to_directory(path, root)
214
237
 
215
238
 
216
- # expose to user via ln.File
239
+ # TODO: integrate this whole function into __init__
217
240
  def get_file_kwargs_from_data(
218
- data: Union[Path, UPath, str, pd.DataFrame, AnnData],
219
241
  *,
220
- name: Optional[str] = None,
221
- key: Optional[str] = None,
222
- run: Optional[Run] = None,
223
- format: Optional[str] = None,
242
+ data: Union[Path, UPath, str, pd.DataFrame, AnnData],
243
+ key: Optional[str],
244
+ run: Optional[Run],
245
+ format: Optional[str],
246
+ provisional_id: str,
224
247
  ):
225
248
  run = get_run(run)
226
- memory_rep, filepath, suffix = serialize(data, name, format, key)
249
+ memory_rep, filepath, suffix = serialize(provisional_id, data, format)
227
250
  # the following will return a localpath that is not None if filepath is local
228
251
  # it will return a cloudpath that is not None if filepath is on the cloud
229
252
  local_filepath, cloud_filepath, size, hash = get_path_size_hash(
@@ -238,14 +261,10 @@ def get_file_kwargs_from_data(
238
261
  if memory_rep is None and key is None and check_path_in_storage:
239
262
  key = get_relative_path_to_root(path=filepath).as_posix()
240
263
 
241
- if name is None and key is None:
242
- raise ValueError(NO_NAME_ERROR)
243
-
244
264
  if key is not None and key.startswith(AUTO_KEY_PREFIX):
245
265
  raise ValueError(f"Key cannot start with {AUTO_KEY_PREFIX}")
246
266
 
247
267
  kwargs = dict(
248
- name=name,
249
268
  suffix=suffix,
250
269
  hash=hash,
251
270
  key=key,
@@ -282,7 +301,7 @@ def log_storage_hint(
282
301
  logger.hint(hint)
283
302
 
284
303
 
285
- def init_file(file: File, *args, **kwargs):
304
+ def __init__(file: File, *args, **kwargs):
286
305
  # Below checks for the Django-internal call in from_db()
287
306
  # it'd be better if we could avoid this, but not being able to create a File
288
307
  # from data with the default constructor renders the central class of the API
@@ -297,18 +316,51 @@ def init_file(file: File, *args, **kwargs):
297
316
  # now we proceed with the user-facing constructor
298
317
  if len(args) > 1:
299
318
  raise ValueError("Only one non-keyword arg allowed: data")
300
- data: Union[PathLike, DataLike] = kwargs["data"] if len(args) == 0 else args[0]
301
- key: Optional[str] = kwargs["key"] if "key" in kwargs else None
302
- name: Optional[str] = kwargs["name"] if "name" in kwargs else None
303
- run: Optional[Run] = kwargs["run"] if "run" in kwargs else None
304
- format = kwargs["format"] if "format" in kwargs else None
305
-
319
+ data: Union[PathLike, DataLike] = kwargs.pop("data") if len(args) == 0 else args[0]
320
+ key: Optional[str] = kwargs.pop("key") if "key" in kwargs else None
321
+ run: Optional[Run] = kwargs.pop("run") if "run" in kwargs else None
322
+ description: Optional[str] = (
323
+ kwargs.pop("description") if "description" in kwargs else None
324
+ )
325
+ feature_sets: Optional[List[FeatureSet]] = (
326
+ kwargs.pop("feature_sets") if "feature_sets" in kwargs else None
327
+ )
328
+ name: Optional[str] = kwargs.pop("name") if "name" in kwargs else None
329
+ var_ref: Optional[Field] = kwargs.pop("var_ref") if "var_ref" in kwargs else None
330
+ format = kwargs.pop("format") if "format" in kwargs else None
331
+
332
+ if not len(kwargs) == 0:
333
+ raise ValueError("Only data, key, run, name & feature_sets can be passed.")
334
+
335
+ if name is not None and description is not None:
336
+ raise ValueError("Only pass description, do not pass a name")
337
+ if name is not None:
338
+ logger.warning("Argument `name` is deprecated, please use `description`")
339
+ description = name
340
+
341
+ if feature_sets is None:
342
+ # if var_ref is None:
343
+ # response = input(
344
+ # "Are you sure you want to create a feature_set without reference?"
345
+ # " (y/n)\n If n: please rerun by providing reference to `var_ref=`"
346
+ # )
347
+ # if response != "y":
348
+ # return None
349
+ feature_sets = []
350
+ if isinstance(data, pd.DataFrame):
351
+ feature_set = FeatureSet.from_values(data.columns)
352
+ feature_sets.append(feature_set)
353
+ elif isinstance(data, AnnData) and var_ref is not None:
354
+ feature_sets.append(FeatureSet.from_values(data.var.index, var_ref))
355
+ feature_sets.append(FeatureSet.from_values(data.obs.columns))
356
+
357
+ provisional_id = ids.base62_20()
306
358
  kwargs, privates = get_file_kwargs_from_data(
307
359
  data=data,
308
- name=name,
309
360
  key=key,
310
361
  run=run,
311
362
  format=format,
363
+ provisional_id=provisional_id,
312
364
  )
313
365
  # an object with the same hash already exists
314
366
  if isinstance(kwargs, File):
@@ -318,9 +370,12 @@ def init_file(file: File, *args, **kwargs):
318
370
  getattr(kwargs, field.attname) for field in file._meta.concrete_fields
319
371
  ]
320
372
  super(File, file).__init__(*new_args)
373
+ file._state.adding = False
374
+ file._state.db = "default"
321
375
  return None
322
376
 
323
- kwargs["id"] = ids.base62_20()
377
+ kwargs["id"] = provisional_id
378
+ kwargs["description"] = description
324
379
  log_storage_hint(
325
380
  check_path_in_storage=privates["check_path_in_storage"],
326
381
  key=kwargs["key"],
@@ -346,16 +401,22 @@ def init_file(file: File, *args, **kwargs):
346
401
  file._cloud_filepath = privates["cloud_filepath"]
347
402
  file._memory_rep = privates["memory_rep"]
348
403
  file._to_store = not privates["check_path_in_storage"]
404
+ file._feature_sets = (
405
+ feature_sets if isinstance(feature_sets, list) else [feature_sets]
406
+ )
349
407
 
350
408
  super(File, file).__init__(**kwargs)
351
409
 
352
410
 
411
+ @classmethod # type: ignore
412
+ @doc_args(File.from_dir.__doc__)
353
413
  def from_dir(
354
- path: Union[Path, UPath, str],
414
+ cls,
415
+ path: PathLike,
355
416
  *,
356
417
  run: Optional[Run] = None,
357
- ):
358
- """Create file records from a directory."""
418
+ ) -> List["File"]:
419
+ """{}"""
359
420
  folderpath = UPath(path)
360
421
  check_path_in_storage = get_check_path_in_storage(folderpath)
361
422
 
@@ -388,51 +449,231 @@ def from_dir(
388
449
  return files
389
450
 
390
451
 
391
- def replace_file(
392
- file: File,
393
- data: Union[PathLike, DataLike] = None,
452
+ def replace(
453
+ self,
454
+ data: Union[PathLike, DataLike],
394
455
  run: Optional[Run] = None,
395
456
  format: Optional[str] = None,
396
- ):
457
+ ) -> None:
397
458
  kwargs, privates = get_file_kwargs_from_data(
459
+ provisional_id=self.id,
398
460
  data=data,
399
- name=file.name,
400
- key=file.key,
461
+ key=self.key,
401
462
  run=run,
402
463
  format=format,
403
464
  )
404
- if file.key is not None:
405
- key_path = PurePosixPath(file.key)
406
- if isinstance(data, (Path, str)) and kwargs["name"] is not None:
407
- new_name = kwargs["name"] # use the name from the data filepath
408
- else:
409
- # do not change the key stem to file.name
410
- new_name = key_path.stem # use the stem of the key for in-memory data
411
- if PurePosixPath(new_name).suffixes == []:
412
- new_name = f"{new_name}{kwargs['suffix']}"
413
- if key_path.name != new_name:
414
- file._clear_storagekey = file.key
415
- file.key = str(key_path.with_name(new_name))
465
+ if self.key is not None:
466
+ key_path = PurePosixPath(self.key)
467
+ new_filename = f"{key_path.stem}{kwargs['suffix']}"
468
+ # the following will only be true if the suffix changes!
469
+ if key_path.name != new_filename:
470
+ self._clear_storagekey = self.key
471
+ self.key = str(key_path.with_name(new_filename))
416
472
  logger.warning(
417
- f"Replacing the file will replace key '{key_path}' with '{file.key}'"
473
+ f"Replacing the file will replace key '{key_path}' with '{self.key}'"
418
474
  f" and delete '{key_path}' upon `save()`"
419
475
  )
420
476
  else:
421
- file.key = kwargs["key"]
422
- old_storage = auto_storage_key_from_file(file)
477
+ self.key = kwargs["key"]
478
+ old_storage = auto_storage_key_from_file(self)
423
479
  new_storage = (
424
- file.key if file.key is not None else f"{file.id}{kwargs['suffix']}"
480
+ self.key if self.key is not None else f"{self.id}{kwargs['suffix']}"
425
481
  )
426
482
  if old_storage != new_storage:
427
- file._clear_storagekey = old_storage
428
-
429
- file.suffix = kwargs["suffix"]
430
- file.size = kwargs["size"]
431
- file.hash = kwargs["hash"]
432
- file.run = kwargs["run"]
433
- file._local_filepath = privates["local_filepath"]
434
- file._cloud_filepath = privates["cloud_filepath"]
435
- file._memory_rep = privates["memory_rep"]
436
- file._to_store = not privates[
483
+ self._clear_storagekey = old_storage
484
+
485
+ self.suffix = kwargs["suffix"]
486
+ self.size = kwargs["size"]
487
+ self.hash = kwargs["hash"]
488
+ self.run = kwargs["run"]
489
+ self._local_filepath = privates["local_filepath"]
490
+ self._cloud_filepath = privates["cloud_filepath"]
491
+ self._memory_rep = privates["memory_rep"]
492
+ self._to_store = not privates[
437
493
  "check_path_in_storage"
438
494
  ] # no need to upload if new file is already in storage
495
+
496
+
497
+ def backed(
498
+ self, is_run_input: Optional[bool] = None
499
+ ) -> Union["AnnDataAccessor", "BackedAccessor"]:
500
+ suffixes = (".h5", ".hdf5", ".h5ad", ".zrad", ".zarr")
501
+ if self.suffix not in suffixes:
502
+ raise ValueError(
503
+ "File should have a zarr or h5 object as the underlying data, please use"
504
+ " one of the following suffixes for the object name:"
505
+ f" {', '.join(suffixes)}."
506
+ )
507
+ _track_run_input(self, is_run_input)
508
+ from lamindb.dev.storage._backed_access import backed_access
509
+
510
+ return backed_access(self)
511
+
512
+
513
+ def _track_run_input(file: File, is_run_input: Optional[bool] = None):
514
+ if is_run_input is None:
515
+ if context.run is not None and not settings.track_run_inputs:
516
+ logger.hint("Track this file as a run input by passing `is_run_input=True`")
517
+ track_run_input = settings.track_run_inputs
518
+ else:
519
+ track_run_input = is_run_input
520
+ if track_run_input:
521
+ if context.run is None:
522
+ raise ValueError(
523
+ "No global run context set. Call ln.context.track() or link input to a"
524
+ " run object via `run.inputs.append(file)`"
525
+ )
526
+ if not file.input_of.contains(context.run):
527
+ context.run.save()
528
+ file.input_of.add(context.run)
529
+
530
+
531
+ def load(self, is_run_input: Optional[bool] = None, stream: bool = False) -> DataLike:
532
+ _track_run_input(self, is_run_input)
533
+ return load_to_memory(filepath_from_file(self), stream=stream)
534
+
535
+
536
+ def stage(self, is_run_input: Optional[bool] = None) -> Path:
537
+ if self.suffix in (".zrad", ".zarr"):
538
+ raise RuntimeError("zarr object can't be staged, please use load() or stream()")
539
+ _track_run_input(self, is_run_input)
540
+ return setup_settings.instance.storage.cloud_to_local(filepath_from_file(self))
541
+
542
+
543
+ def delete(self, storage: Optional[bool] = None) -> None:
544
+ if storage is None:
545
+ response = input(f"Are you sure you want to delete {self} from storage? (y/n)")
546
+ delete_in_storage = response == "y"
547
+ else:
548
+ delete_in_storage = storage
549
+
550
+ if delete_in_storage:
551
+ filepath = self.path()
552
+ delete_storage(filepath)
553
+ logger.success(f"Deleted stored object {colors.yellow(f'{filepath}')}")
554
+ self._delete_skip_storage()
555
+
556
+
557
+ def _delete_skip_storage(file, *args, **kwargs) -> None:
558
+ super(File, file).delete(*args, **kwargs)
559
+
560
+
561
+ def save(self, *args, **kwargs) -> None:
562
+ self._save_skip_storage(*args, **kwargs)
563
+ from lamindb._save import check_and_attempt_clearing, check_and_attempt_upload
564
+
565
+ exception = check_and_attempt_upload(self)
566
+ if exception is not None:
567
+ self._delete_skip_storage()
568
+ raise RuntimeError(exception)
569
+ exception = check_and_attempt_clearing(self)
570
+ if exception is not None:
571
+ raise RuntimeError(exception)
572
+
573
+
574
+ def _save_skip_storage(file, *args, **kwargs) -> None:
575
+ if file.transform is not None:
576
+ file.transform.save()
577
+ if file.run is not None:
578
+ file.run.save()
579
+ if hasattr(file, "_feature_sets"):
580
+ for feature_set in file._feature_sets:
581
+ feature_set.save()
582
+ super(File, file).save(*args, **kwargs)
583
+ if hasattr(file, "_feature_sets"):
584
+ file.feature_sets.set(file._feature_sets)
585
+
586
+
587
+ def path(self) -> Union[Path, UPath]:
588
+ return filepath_from_file(self)
589
+
590
+
591
+ # adapted from: https://stackoverflow.com/questions/9727673/list-directory-tree-structure-in-python # noqa
592
+ @classmethod # type: ignore
593
+ @doc_args(File.tree.__doc__)
594
+ def tree(
595
+ cls: File,
596
+ prefix: Optional[str] = None,
597
+ *,
598
+ level: int = -1,
599
+ limit_to_directories: bool = False,
600
+ length_limit: int = 1000,
601
+ ):
602
+ """{}"""
603
+ space = " "
604
+ branch = "│ "
605
+ tee = "├── "
606
+ last = "└── "
607
+
608
+ if prefix is None:
609
+ dir_path = settings.storage
610
+ else:
611
+ dir_path = settings.storage / prefix
612
+ files = 0
613
+ directories = 0
614
+
615
+ def inner(dir_path: Union[Path, UPath], prefix: str = "", level=-1):
616
+ nonlocal files, directories
617
+ if not level:
618
+ return # 0, stop iterating
619
+ stripped_dir_path = dir_path.as_posix().rstrip("/")
620
+ # do not iterate through zarr directories
621
+ if stripped_dir_path.endswith((".zarr", ".zrad")):
622
+ return
623
+ # this is needed so that the passed folder is not listed
624
+ contents = [
625
+ i
626
+ for i in dir_path.iterdir()
627
+ if i.as_posix().rstrip("/") != stripped_dir_path
628
+ ]
629
+ if limit_to_directories:
630
+ contents = [d for d in contents if d.is_dir()]
631
+ pointers = [tee] * (len(contents) - 1) + [last]
632
+ for pointer, path in zip(pointers, contents):
633
+ if path.is_dir():
634
+ yield prefix + pointer + path.name
635
+ directories += 1
636
+ extension = branch if pointer == tee else space
637
+ yield from inner(path, prefix=prefix + extension, level=level - 1)
638
+ elif not limit_to_directories:
639
+ yield prefix + pointer + path.name
640
+ files += 1
641
+
642
+ folder_tree = f"{dir_path.name}"
643
+ iterator = inner(dir_path, level=level)
644
+ for line in islice(iterator, length_limit):
645
+ folder_tree += f"\n{line}"
646
+ if next(iterator, None):
647
+ folder_tree += f"... length_limit, {length_limit}, reached, counted:"
648
+ print(folder_tree)
649
+ print(f"\n{directories} directories" + (f", {files} files" if files else ""))
650
+
651
+
652
+ METHOD_NAMES = [
653
+ "__init__",
654
+ "backed",
655
+ "stage",
656
+ "load",
657
+ "delete",
658
+ "save",
659
+ "replace",
660
+ "path",
661
+ "from_dir",
662
+ "tree",
663
+ ]
664
+
665
+ if _TESTING:
666
+ from inspect import signature
667
+
668
+ SIGS = {
669
+ name: signature(getattr(File, name))
670
+ for name in METHOD_NAMES
671
+ if name != "__init__"
672
+ }
673
+
674
+ for name in METHOD_NAMES:
675
+ attach_func_to_class_method(name, File, globals())
676
+
677
+ # privates currently dealt with separately
678
+ File._delete_skip_storage = _delete_skip_storage
679
+ File._save_skip_storage = _save_skip_storage