lamindb 0.76.1__py3-none-any.whl → 0.76.3__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
lamindb/__init__.py CHANGED
@@ -1,6 +1,7 @@
 """A data framework for biology.
 
-Registries:
+Records
+=======
 
 .. autosummary::
    :toctree: .
@@ -16,18 +17,20 @@ Registries:
    FeatureSet
    Param
 
-Key functionality:
+Key functionality
+=================
 
 .. autosummary::
    :toctree: .
 
    context
    connect
-   Curate
+   Curator
    view
    save
 
-Modules & settings:
+Modules & settings
+==================
 
 .. autosummary::
    :toctree: .
@@ -41,7 +44,7 @@ Modules & settings:
 """
 
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "0.76.1"
+__version__ = "0.76.3"
 
 import os as _os
 
@@ -91,7 +94,7 @@ if _check_instance_setup(from_lamindb=True):
         _ulabel,
         integrations,
     )
-    from ._curate import Curate
+    from ._curate import Curator
     from ._save import save
     from ._view import view
     from .core._context import context
@@ -107,6 +110,7 @@ if _check_instance_setup(from_lamindb=True):
 
     track = context.track  # backward compat
     finish = context.finish  # backward compat
+    Curate = Curator  # backward compat
     settings.__doc__ = """Global :class:`~lamindb.core.Settings`."""
     context.__doc__ = """Global :class:`~lamindb.core.Context`."""
    from django.db.models import Q
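
Net effect of the `__init__.py` changes: the curation entry point is renamed from `Curate` to `Curator`, and a module-level alias keeps the old name importable. A minimal sketch of what that means for user code (assuming an instance is connected so that lamindb's lazy imports run):

    import lamindb as ln

    # the renamed entry point
    curator_cls = ln.Curator

    # the old name still resolves through the backward-compat alias `Curate = Curator`
    assert ln.Curate is ln.Curator
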
lamindb/_artifact.py CHANGED
@@ -9,6 +9,7 @@ import fsspec
 import lamindb_setup as ln_setup
 import pandas as pd
 from anndata import AnnData
+from django.db.models import Q, QuerySet
 from lamin_utils import colors, logger
 from lamindb_setup import settings as setup_settings
 from lamindb_setup._init_instance import register_storage_in_instance
@@ -44,7 +45,10 @@ from lamindb.core.storage.paths import (
     check_path_is_child_of_root,
     filepath_from_artifact,
 )
-from lamindb.core.versioning import get_uid_from_old_version, init_uid
+from lamindb.core.versioning import (
+    create_uid,
+    message_update_key_in_version_family,
+)
 
 from .core._data import (
     add_transform_to_kwargs,
@@ -192,12 +196,14 @@ def process_data(
 
 def get_stat_or_artifact(
     path: UPath,
+    key: str | None = None,
     check_hash: bool = True,
-    using_key: str | None = None,
-) -> tuple[int, str | None, str | None, int | None] | Artifact:
+    is_replace: bool = False,
+    instance: str | None = None,
+) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
     n_objects = None
     if settings.creation.artifact_skip_size_hash:
-        return None, None, None, n_objects
+        return None, None, None, n_objects, None
     stat = path.stat()  # one network request
     if not isinstance(path, LocalPathClasses):
         size, hash, hash_type = None, None, None
@@ -210,7 +216,7 @@ def get_stat_or_artifact(
                 size, hash, hash_type, n_objects = get_stat_dir_cloud(path)
         if hash is None:
             logger.warning(f"did not add hash for {path}")
-        return size, hash, hash_type, n_objects
+        return size, hash, hash_type, n_objects, None
     else:
         if path.is_dir():
             size, hash, hash_type, n_objects = hash_dir(path)
@@ -218,17 +224,26 @@ def get_stat_or_artifact(
             hash, hash_type = hash_file(path)
             size = stat.st_size
     if not check_hash:
-        return size, hash, hash_type, n_objects
-    # also checks hidden and trashed files
-    # in Alex's mind the following two lines should be equivalent
-    # but they aren't according to pytest tests/test_artifact.py::test_from_dir_single_artifact
-    if using_key is None:
-        result = Artifact.filter(hash=hash, visibility=None).all()
+        return size, hash, hash_type, n_objects, None
+    previous_artifact_version = None
+    if key is None or is_replace:
+        result = Artifact.objects.using(instance).filter(hash=hash).all()
+        artifact_with_same_hash_exists = len(result) > 0
     else:
+        storage_id = settings.storage.id
         result = (
-            Artifact.objects.using(using_key).filter(hash=hash, visibility=None).all()
+            Artifact.objects.using(instance)
+            .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
+            .order_by("-created_at")
+            .all()
         )
-    if len(result) > 0:
+        artifact_with_same_hash_exists = len(result.filter(hash=hash).all()) > 0
+    if not artifact_with_same_hash_exists and len(result) > 0:
+        logger.important(
+            f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
+        )
+        previous_artifact_version = result[0]
+    if artifact_with_same_hash_exists:
         if settings.creation.artifact_if_hash_exists == "error":
             msg = f"artifact with same hash exists: {result[0]}"
             hint = (
@@ -241,7 +256,7 @@ def get_stat_or_artifact(
                 "creating new Artifact object despite existing artifact with same hash:"
                 f" {result[0]}"
             )
-            return size, hash, hash_type, n_objects
+            return size, hash, hash_type, n_objects, None
         else:
             if result[0].visibility == -1:
                 raise FileExistsError(
@@ -251,11 +266,11 @@ def get_stat_or_artifact(
             logger.important(f"returning existing artifact with same hash: {result[0]}")
             return result[0]
     else:
-        return size, hash, hash_type, n_objects
+        return size, hash, hash_type, n_objects, previous_artifact_version
 
 
 def check_path_in_existing_storage(
-    path: Path | UPath, using_key: str | None
+    path: Path | UPath, using_key: str | None = None
 ) -> Storage | bool:
     for storage in Storage.objects.using(using_key).filter().all():
         # if path is part of storage, return it
@@ -290,8 +305,10 @@ def get_artifact_kwargs_from_data(
     run: Run | None,
     format: str | None,
     provisional_uid: str,
+    version: str | None,
     default_storage: Storage,
     using_key: str | None = None,
+    is_replace: bool = False,
     skip_check_exists: bool = False,
 ):
     run = get_run(run)
@@ -306,7 +323,9 @@ def get_artifact_kwargs_from_data(
         )
     stat_or_artifact = get_stat_or_artifact(
         path=path,
-        using_key=using_key,
+        key=key,
+        instance=using_key,
+        is_replace=is_replace,
     )
     if isinstance(stat_or_artifact, Artifact):
         artifact = stat_or_artifact
@@ -321,7 +340,12 @@ def get_artifact_kwargs_from_data(
             stat_or_artifact.transform = run.transform
         return artifact, None
     else:
-        size, hash, hash_type, n_objects = stat_or_artifact
+        size, hash, hash_type, n_objects, revises = stat_or_artifact
+
+    if revises is not None:  # update provisional_uid
+        provisional_uid, revises = create_uid(revises=revises, version=version)
+        if path.as_posix().startswith(settings._storage_settings.cache_dir.as_posix()):
+            path = path.rename(f"{provisional_uid}{suffix}")
 
     check_path_in_storage = False
     if use_existing_storage_key:
@@ -365,6 +389,7 @@ def get_artifact_kwargs_from_data(
         key_is_virtual = False
 
     kwargs = {
+        "uid": provisional_uid,
         "suffix": suffix,
         "hash": hash,
         "_hash_type": hash_type,
@@ -509,9 +534,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
     description: str | None = (
         kwargs.pop("description") if "description" in kwargs else None
     )
-    is_new_version_of: Artifact | None = (
-        kwargs.pop("is_new_version_of") if "is_new_version_of" in kwargs else None
-    )
+    revises: Artifact | None = kwargs.pop("revises") if "revises" in kwargs else None
     version: str | None = kwargs.pop("version") if "version" in kwargs else None
     visibility: int | None = (
         kwargs.pop("visibility")
@@ -522,6 +545,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
     skip_check_exists = (
         kwargs.pop("skip_check_exists") if "skip_check_exists" in kwargs else False
     )
+    _uid = kwargs.pop("_uid", None)
     if "default_storage" in kwargs:
         default_storage = kwargs.pop("default_storage")
     else:
@@ -534,28 +558,43 @@ def __init__(artifact: Artifact, *args, **kwargs):
     )
     accessor = kwargs.pop("_accessor") if "_accessor" in kwargs else None
     accessor = _check_accessor_artifact(data=data, accessor=accessor)
+    if "is_new_version_of" in kwargs:
+        logger.warning("`is_new_version_of` will be removed soon, please use `revises`")
+        revises = kwargs.pop("is_new_version_of")
+    assert not (  # noqa: S101
+        revises is not None and _uid is not None
+    ), "Can not init with both `revises` and `_uid`"
     if not len(kwargs) == 0:
         raise ValueError(
-            "Only data, key, run, description, version, is_new_version_of, visibility"
+            "Only data, key, run, description, version, revises, visibility"
             f" can be passed, you passed: {kwargs}"
         )
-
-    if is_new_version_of is None:
-        provisional_uid = init_uid(version=version, n_full_id=20)
-    else:
-        if not isinstance(is_new_version_of, Artifact):
-            raise TypeError("is_new_version_of has to be of type ln.Artifact")
-        provisional_uid, version = get_uid_from_old_version(
-            is_new_version_of, version, using_key
+    if revises is not None and key is not None and revises.key != key:
+        note = message_update_key_in_version_family(
+            suid=revises.stem_uid,
+            existing_key=revises.key,
+            new_key=key,
+            registry="Artifact",
         )
+        raise ValueError(
+            f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
+        )
+    if _uid is not None:
+        provisional_uid, revises = _uid, None
+    else:
+        provisional_uid, revises = create_uid(revises=revises, version=version)
+    if revises is not None:
+        if not isinstance(revises, Artifact):
+            raise TypeError("`revises` has to be of type `Artifact`")
         if description is None:
-            description = is_new_version_of.description
+            description = revises.description
     kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
         data=data,
         key=key,
         run=run,
         format=format,
         provisional_uid=provisional_uid,
+        version=version,
         default_storage=default_storage,
         using_key=using_key,
         skip_check_exists=skip_check_exists,
@@ -576,25 +615,23 @@ def __init__(artifact: Artifact, *args, **kwargs):
     else:
         kwargs = kwargs_or_artifact
 
+    # only set key now so that we don't do a look-up on it in case revises is passed
+    if revises is not None:
+        kwargs["key"] = revises.key
     # in case we have a new version of a folder with a different hash, print a
     # warning that the old version can't be recovered
-    if (
-        is_new_version_of is not None
-        and is_new_version_of.n_objects is not None
-        and is_new_version_of.n_objects > 1
-    ):
+    if revises is not None and revises.n_objects is not None and revises.n_objects > 1:
         logger.warning(
-            f"artifact version {version} will _update_ the state of folder {is_new_version_of.path} - "
-            "to _retain_ the old state by duplicating the entire folder, do _not_ pass `is_new_version_of`"
+            f"artifact version {version} will _update_ the state of folder {revises.path} - "
+            "to _retain_ the old state by duplicating the entire folder, do _not_ pass `revises`"
         )
 
     kwargs["type"] = type
-    kwargs["uid"] = provisional_uid
     kwargs["version"] = version
     kwargs["description"] = description
     kwargs["visibility"] = visibility
     kwargs["_accessor"] = accessor
-    kwargs["is_new_version_of"] = is_new_version_of
+    kwargs["revises"] = revises
     # this check needs to come down here because key might be populated from an
     # existing file path during get_artifact_kwargs_from_data()
     if (
@@ -623,8 +660,7 @@ def from_df(
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
-    version: str | None = None,
-    is_new_version_of: Artifact | None = None,
+    revises: Artifact | None = None,
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
@@ -633,8 +669,7 @@ def from_df(
         key=key,
         run=run,
         description=description,
-        version=version,
-        is_new_version_of=is_new_version_of,
+        revises=revises,
         _accessor="DataFrame",
         type="dataset",
         **kwargs,
@@ -650,8 +685,7 @@ def from_anndata(
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
-    version: str | None = None,
-    is_new_version_of: Artifact | None = None,
+    revises: Artifact | None = None,
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
@@ -662,8 +696,7 @@ def from_anndata(
         key=key,
         run=run,
         description=description,
-        version=version,
-        is_new_version_of=is_new_version_of,
+        revises=revises,
         _accessor="AnnData",
         type="dataset",
         **kwargs,
@@ -679,8 +712,7 @@ def from_mudata(
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
-    version: str | None = None,
-    is_new_version_of: Artifact | None = None,
+    revises: Artifact | None = None,
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
@@ -689,8 +721,7 @@ def from_mudata(
         key=key,
         run=run,
         description=description,
-        version=version,
-        is_new_version_of=is_new_version_of,
+        revises=revises,
         _accessor="MuData",
         type="dataset",
         **kwargs,
@@ -815,6 +846,8 @@ def replace(
         run=run,
         format=format,
         default_storage=default_storage,
+        version=None,
+        is_replace=True,
     )
 
     # this artifact already exists
@@ -913,7 +946,7 @@ def open(
             logger.warning(
                 "The hash of the tiledbsoma store has changed, creating a new version of the artifact."
             )
-            new_version = Artifact(filepath, is_new_version_of=self).save()
+            new_version = Artifact(filepath, revises=self).save()
             init_self_from_db(self, new_version)
 
         if localpath != filepath and localpath.exists():
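
The `_artifact.py` changes replace `is_new_version_of` with `revises` in the constructor and the `from_df`/`from_anndata`/`from_mudata` factories (passing `is_new_version_of` still works but triggers a deprecation warning), and `get_stat_or_artifact` now also matches on `key`, so saving different content under an existing key revises that artifact automatically. A minimal sketch of the resulting behavior, assuming a connected instance and a hypothetical DataFrame:

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})  # hypothetical data

    # first save under a key
    v1 = ln.Artifact.from_df(df, key="examples/data.parquet").save()

    # saving different content under the same key now creates a new version of v1
    v2 = ln.Artifact.from_df(df.assign(b=[4, 5, 6]), key="examples/data.parquet").save()

    # explicit versioning: `revises` replaces `is_new_version_of`
    v3 = ln.Artifact.from_df(df.assign(c=[7, 8, 9]), revises=v2).save()
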
lamindb/_can_validate.py CHANGED
@@ -332,7 +332,9 @@ def _standardize(
         # here, we can safely import bionty
         from bionty._bionty import create_or_get_organism_record
 
-        organism_record = create_or_get_organism_record(organism=organism, orm=registry)
+        organism_record = create_or_get_organism_record(
+            organism=organism, registry=registry
+        )
         organism = (
             organism_record.name if organism_record is not None else organism_record
         )
@@ -403,7 +405,10 @@ def _standardize(
                 logger.warning(warn_msg)
 
             mapper.update(std_names_bt_mapper)
-            result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()
+            if pd.api.types.is_categorical_dtype(std_names_db):
+                result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist()
+            else:
+                result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist()
             return _return(result=result, mapper=mapper)
 
         else:
@@ -514,7 +519,9 @@ def _filter_query_based_on_organism(
         # here, we can safely import bionty
         from bionty._bionty import create_or_get_organism_record
 
-        organism_record = create_or_get_organism_record(organism=organism, orm=registry)
+        organism_record = create_or_get_organism_record(
+            organism=organism, registry=registry
+        )
         if organism_record is not None:
             queryset = queryset.filter(organism__name=organism_record.name)
 
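
The `_can_validate.py` changes adapt to bionty's renamed `registry` argument for `create_or_get_organism_record` and make `standardize` handle categorical columns: a categorical is now relabeled via `cat.rename_categories` rather than going through `pd.Series.replace`, which can misbehave on categorical data. A rough illustration of the pandas pattern involved (symbols and mapping are hypothetical):

    import pandas as pd

    values = pd.Series(["FOO1", "BAR2", "FOO1"], dtype="category")  # hypothetical symbols
    mapper = {"FOO1": "Foo1"}  # synonym -> standardized name

    # relabel the categories directly; every occurrence is updated in one pass
    standardized = values.cat.rename_categories(mapper).tolist()
    # ['Foo1', 'BAR2', 'Foo1']
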
lamindb/_collection.py CHANGED
@@ -11,7 +11,6 @@ from typing import (
 import anndata as ad
 import lamindb_setup as ln_setup
 import pandas as pd
-from anndata import AnnData
 from lamin_utils import logger
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core.hashing import hash_set
@@ -27,7 +26,7 @@ from lamindb._artifact import update_attributes
 from lamindb._utils import attach_func_to_class_method
 from lamindb.core._data import _track_run_input
 from lamindb.core._mapped_collection import MappedCollection
-from lamindb.core.versioning import get_uid_from_old_version, init_uid
+from lamindb.core.versioning import process_revises
 
 from . import Artifact, Run
 from ._record import init_self_from_db
@@ -37,10 +36,10 @@ from .core._data import (
     save_feature_set_links,
     save_feature_sets,
 )
+from .core._settings import settings
 
 if TYPE_CHECKING:
     from lamindb.core.storage import UPath
-    from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
 
     from ._query_set import QuerySet
 
@@ -72,9 +71,7 @@ def __init__(
         kwargs.pop("reference_type") if "reference_type" in kwargs else None
     )
     run: Run | None = kwargs.pop("run") if "run" in kwargs else None
-    is_new_version_of: Collection | None = (
-        kwargs.pop("is_new_version_of") if "is_new_version_of" in kwargs else None
-    )
+    revises: Collection | None = kwargs.pop("revises") if "revises" in kwargs else None
     version: str | None = kwargs.pop("version") if "version" in kwargs else None
     visibility: int | None = (
         kwargs.pop("visibility")
@@ -84,18 +81,16 @@ def __init__(
     feature_sets: dict[str, FeatureSet] = (
         kwargs.pop("feature_sets") if "feature_sets" in kwargs else {}
     )
+    if "is_new_version_of" in kwargs:
+        logger.warning("`is_new_version_of` will be removed soon, please use `revises`")
+        revises = kwargs.pop("is_new_version_of")
     if not len(kwargs) == 0:
         raise ValueError(
             f"Only artifacts, name, run, description, reference, reference_type, visibility can be passed, you passed: {kwargs}"
         )
-    if is_new_version_of is None:
-        provisional_uid = init_uid(version=version, n_full_id=20)
-    else:
-        if not isinstance(is_new_version_of, Collection):
-            raise TypeError("is_new_version_of has to be of type ln.Collection")
-        provisional_uid, version = get_uid_from_old_version(is_new_version_of, version)
-        if name is None:
-            name = is_new_version_of.name
+    provisional_uid, version, name, revises = process_revises(
+        revises, version, name, Collection
+    )
     run = get_run(run)
     if isinstance(artifacts, Artifact):
         artifacts = [artifacts]
@@ -147,6 +142,9 @@ def __init__(
     else:
         kwargs = {}
     add_transform_to_kwargs(kwargs, run)
+    search_names_setting = settings.creation.search_names
+    if revises is not None and name == revises.name:
+        settings.creation.search_names = False
     super(Collection, collection).__init__(
         uid=provisional_uid,
         name=name,
@@ -158,14 +156,15 @@ def __init__(
         run=run,
         version=version,
         visibility=visibility,
-        is_new_version_of=is_new_version_of,
+        revises=revises,
         **kwargs,
     )
+    settings.creation.search_names = search_names_setting
     collection._artifacts = artifacts
     collection._feature_sets = feature_sets
     # register provenance
-    if is_new_version_of is not None:
-        _track_run_input(is_new_version_of, run=run)
+    if revises is not None:
+        _track_run_input(revises, run=run)
     _track_run_input(artifacts, run=run)
 
 
@@ -192,7 +191,7 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
     feature_sets_union = {}
     logger.debug("union")
     for slot, feature_set_ids_slot in feature_sets_by_slots.items():
-        feature_set_1 = FeatureSet.filter(id=feature_set_ids_slot[0]).one()
+        feature_set_1 = FeatureSet.get(id=feature_set_ids_slot[0])
         related_name = feature_set_1._get_related_name()
         features_registry = getattr(FeatureSet, related_name).field.model
         start_time = logger.debug("run filter")
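
`_collection.py` follows the same pattern: versioning goes through `revises` and the shared `process_revises` helper, and when a new version keeps the previous name, `settings.creation.search_names` is temporarily switched off so the constructor skips the duplicate-name lookup. A minimal sketch of the user-facing API, assuming a connected instance and two already saved artifacts under hypothetical keys:

    import lamindb as ln

    # hypothetical, already saved artifacts
    artifact_a = ln.Artifact.filter(key="examples/a.parquet").one()
    artifact_b = ln.Artifact.filter(key="examples/b.parquet").one()

    c1 = ln.Collection(artifact_a, name="my-collection").save()

    # `revises` replaces `is_new_version_of`; the old keyword still works but warns
    c2 = ln.Collection([artifact_a, artifact_b], name="my-collection", revises=c1).save()
    assert c2.stem_uid == c1.stem_uid  # both versions share one version family
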