lamindb 0.64.2__py3-none-any.whl → 0.65.1__py3-none-any.whl

This diff compares the contents of two package versions as publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
lamindb/_transform.py CHANGED
@@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Optional
 
 from lnschema_core.models import TRANSFORM_TYPE_DEFAULT, Transform
 
-from .dev.versioning import get_ids_from_old_version, init_uid
+from .dev.versioning import get_uid_from_old_version, init_uid
 
 if TYPE_CHECKING:
     from lnschema_core.types import TransformType
@@ -19,9 +19,7 @@ def __init__(transform: Transform, *args, **kwargs):
     is_new_version_of: Optional[Transform] = (
         kwargs.pop("is_new_version_of") if "is_new_version_of" in kwargs else None
     )
-    initial_version_id: Optional[int] = (
-        kwargs.pop("initial_version_id") if "initial_version_id" in kwargs else None
-    )
+    (kwargs.pop("initial_version_id") if "initial_version_id" in kwargs else None)
     version: Optional[str] = kwargs.pop("version") if "version" in kwargs else None
     type: Optional[TransformType] = (
         kwargs.pop("type") if "type" in kwargs else TRANSFORM_TYPE_DEFAULT
@@ -37,12 +35,12 @@ def __init__(transform: Transform, *args, **kwargs):
             f" but you passed: {kwargs}"
         )
     if is_new_version_of is None:
-        new_uid = init_uid(version=version, n_full_id=14)
+        new_uid = init_uid(version=version, n_full_id=Transform._len_full_uid)
     else:
         if not isinstance(is_new_version_of, Transform):
             raise TypeError("is_new_version_of has to be of type ln.Transform")
-        new_uid, initial_version_id, version = get_ids_from_old_version(
-            is_new_version_of, version, n_full_id=14
+        new_uid, version = get_uid_from_old_version(
+            is_new_version_of, version, n_full_id=Transform._len_full_uid
         )
         if name is None:
             name = is_new_version_of.name
@@ -60,7 +58,6 @@ def __init__(transform: Transform, *args, **kwargs):
         short_name=short_name,
         type=type,
         version=version,
-        initial_version_id=initial_version_id,
         reference=reference,
         _has_consciously_provided_uid=has_consciously_provided_uid,
     )
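The versioning refactor above replaces the `initial_version_id` foreign key with uid-based lineage: `get_uid_from_old_version` derives the new uid from the old record, and the uid length now comes from `Transform._len_full_uid` instead of a hard-coded 14. A minimal sketch of the calling pattern this touches, assuming a configured lamindb instance (the transform name is illustrative, not from the diff):

```python
import lamindb as ln

# first version: init_uid generates a fresh uid for the given version
transform_v1 = ln.Transform(name="preprocess", version="1")
transform_v1.save()

# follow-up version: the new uid shares its stem with transform_v1's uid,
# so a separate initial_version_id column is no longer needed
transform_v2 = ln.Transform(is_new_version_of=transform_v1, version="2")
transform_v2.save()  # name falls back to transform_v1.name when not passed
```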
lamindb/dev/__init__.py CHANGED
@@ -24,6 +24,7 @@ Functionality of data registries:
    FeatureManager
    LabelManager
    IsTree
+   IsVersioned
 
 Functionality of metadata registries:
 
@@ -47,11 +48,18 @@ Auxiliary tools:
    Settings
    types
    exceptions
-   MappedDataset
+   MappedCollection
 """
 
 from lamin_utils._inspect import InspectResult
-from lnschema_core.models import CanValidate, Data, HasParents, IsTree, Registry
+from lnschema_core.models import (
+    CanValidate,
+    Data,
+    HasParents,
+    IsTree,
+    IsVersioned,
+    Registry,
+)
 
 from lamindb._query_manager import QueryManager
 from lamindb._query_set import QuerySet
@@ -59,6 +67,6 @@ from lamindb.dev._feature_manager import FeatureManager
 from lamindb.dev._label_manager import LabelManager
 
 from . import _data, datasets, exceptions, fields, types
-from ._mapped_dataset import MappedDataset
+from ._mapped_collection import MappedCollection
 from ._run_context import run_context
 from ._settings import Settings
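For downstream code, the visible effect of this module is its import surface: `MappedDataset` is renamed to `MappedCollection` and `IsVersioned` is newly re-exported alongside the other model mixins. A migration sketch, assuming these are the only import sites to touch:

```python
# lamindb 0.64.x
# from lamindb.dev import MappedDataset

# lamindb 0.65.x
from lamindb.dev import IsVersioned, MappedCollection

# IsVersioned is the mixin carrying uid/version bookkeeping on registries
# such as Transform; MappedCollection is the renamed map-style accessor
# formerly called MappedDataset
```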
lamindb/dev/_data.py CHANGED
@@ -5,8 +5,8 @@ from lamin_utils import colors, logger
 from lamindb_setup.dev._docs import doc_args
 from lnschema_core.models import (
     Artifact,
+    Collection,
     Data,
-    Dataset,
     Feature,
     FeatureSet,
     Registry,
@@ -54,7 +54,7 @@ def add_transform_to_kwargs(kwargs: Dict[str, Any], run: Run):
     kwargs["transform"] = run.transform
 
 
-def save_feature_sets(self: Union[Artifact, Dataset]) -> None:
+def save_feature_sets(self: Union[Artifact, Collection]) -> None:
     if hasattr(self, "_feature_sets"):
         saved_feature_sets = {}
         for key, feature_set in self._feature_sets.items():
@@ -72,7 +72,7 @@ def save_feature_sets(self: Union[Artifact, Dataset]) -> None:
         )
 
 
-def save_feature_set_links(self: Union[Artifact, Dataset]) -> None:
+def save_feature_set_links(self: Union[Artifact, Collection]) -> None:
     from lamindb._save import bulk_create
 
     Data = self.__class__
@@ -113,10 +113,9 @@ def describe(self: Data):
         "created_by": "👤",
         "transform": _transform_emoji(self.transform),
         "run": "👣",
-        "initial_version": "🔖",
-        "file": "📄",
+        "artifact": "📄",
     }
-    if len(foreign_key_fields) > 0:  # always True for Artifact and Dataset
+    if len(foreign_key_fields) > 0:  # always True for Artifact and Collection
         record_msg = f"{colors.green(model_name)}{__repr__(self, include_foreign_keys=False).lstrip(model_name)}"
         msg += f"{record_msg}\n\n"
 
@@ -209,7 +208,7 @@ def add_labels(
 ) -> None:
     """{}."""
     if self._state.adding:
-        raise ValueError("Please save the file/dataset before adding a label!")
+        raise ValueError("Please save the artifact/collection before adding a label!")
 
     if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
         records = records.list()
@@ -331,7 +330,7 @@ def add_labels(
                 id=old_feature_set_link.feature_set_id
             ).one()
             logger.info(
-                "no file links to it anymore, deleting feature set"
+                "nothing links to it anymore, deleting feature set"
                 f" {old_feature_set}"
             )
             old_feature_set.delete()
@@ -368,7 +367,7 @@ def _track_run_input(
     if run is None:
         if settings.track_run_inputs:
             logger.hint(
-                "you can auto-track this file as a run input by calling"
+                "you can auto-track these data as a run input by calling"
                 " `ln.track()`"
             )
     # assume we have a run record
@@ -390,7 +389,7 @@ def _track_run_input(
             track_run_input = True
         else:
             logger.hint(
-                "track this file as a run input by passing `is_run_input=True`"
+                "track these data as a run input by passing `is_run_input=True`"
             )
     else:
         track_run_input = is_run_input
@@ -409,9 +408,10 @@ def _track_run_input(
                 for data_id in input_data_ids
             ]
         else:
-            LinkORM = run.input_datasets.through
+            LinkORM = run.input_collections.through
             links = [
-                LinkORM(run_id=run.id, dataset_id=data_id) for data_id in input_data_ids
+                LinkORM(run_id=run.id, collection_id=data_id)
+                for data_id in input_data_ids
             ]
         LinkORM.objects.bulk_create(links, ignore_conflicts=True)
         # generalize below for more than one data batch
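The hint texts and link tables above carry the dataset-to-collection rename into run-input tracking: inputs are now linked through `run.input_collections` with a `collection_id` column. A hedged sketch of how the two code paths in `_track_run_input` are reached from user code; the collection name is illustrative, and `load(is_run_input=True)` forwarding to this helper is an assumption:

```python
import lamindb as ln

collection = ln.Collection.filter(name="my-collection").one()  # illustrative

# without a run context, request tracking explicitly
df = collection.load(is_run_input=True)

# with a run context, loading is auto-tracked whenever
# ln.settings.track_run_inputs is enabled
ln.track()
df = collection.load()
```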
lamindb/dev/_feature_manager.py CHANGED
@@ -1,8 +1,8 @@
+from itertools import compress
 from typing import Dict, Union
 
-import numpy as np
-from lamin_utils import colors
-from lnschema_core.models import Artifact, Data, Dataset, Feature
+from lamin_utils import colors, logger
+from lnschema_core.models import Artifact, Collection, Data, Feature
 
 from lamindb._feature_set import FeatureSet
 from lamindb._query_set import QuerySet
@@ -15,15 +15,15 @@ from lamindb._registry import (
 from lamindb._save import save
 
 
-def get_host_id_field(host: Union[Artifact, Dataset]) -> str:
+def get_host_id_field(host: Union[Artifact, Collection]) -> str:
     if isinstance(host, Artifact):
         host_id_field = "artifact_id"
     else:
-        host_id_field = "dataset_id"
+        host_id_field = "collection_id"
     return host_id_field
 
 
-def get_accessor_by_orm(host: Union[Artifact, Dataset]) -> Dict:
+def get_accessor_by_orm(host: Union[Artifact, Collection]) -> Dict:
     dictionary = {
         field.related_model.__get_name_with_schema__(): field.name
         for field in host._meta.related_objects
@@ -56,7 +56,7 @@ def get_feature_set_by_slot(host) -> Dict:
 
 
 def get_label_links(
-    host: Union[Artifact, Dataset], registry: str, feature: Feature
+    host: Union[Artifact, Collection], registry: str, feature: Feature
 ) -> QuerySet:
     host_id_field = get_host_id_field(host)
     kwargs = {host_id_field: host.id, "feature_id": feature.id}
@@ -68,7 +68,7 @@ def get_label_links(
     return link_records
 
 
-def get_feature_set_links(host: Union[Artifact, Dataset]) -> QuerySet:
+def get_feature_set_links(host: Union[Artifact, Collection]) -> QuerySet:
     host_id_field = get_host_id_field(host)
     kwargs = {host_id_field: host.id}
     feature_set_links = host.feature_sets.through.objects.filter(**kwargs)
@@ -124,7 +124,7 @@ class FeatureManager:
     See :class:`~lamindb.dev.Data` for more information.
     """
 
-    def __init__(self, host: Union[Artifact, Dataset]):
+    def __init__(self, host: Union[Artifact, Collection]):
         self._host = host
         self._feature_set_by_slot = get_feature_set_by_slot(host)
         self._accessor_by_orm = get_accessor_by_orm(host)
@@ -160,7 +160,7 @@ class FeatureManager:
         """
         if self._host._state.adding:
             raise ValueError(
-                "Please save the artifact or dataset before adding a feature set!"
+                "Please save the artifact or collection before adding a feature set!"
             )
         host_db = self._host._state.db
         feature_set.save(using=host_db)
@@ -179,31 +179,53 @@ class FeatureManager:
         self._host.feature_sets.through(**kwargs).save(using=host_db)
         self._feature_set_by_slot[slot] = feature_set
 
-    def _add_from(self, data: Data):
-        """Transfer features from a artifact or dataset."""
+    def _add_from(self, data: Data, parents: bool = True):
+        """Transfer features from a artifact or collection."""
         for slot, feature_set in data.features._feature_set_by_slot.items():
             members = feature_set.members
+            if members.count() == 0:
+                continue
             registry = members[0].__class__
             # note here the features are transferred based on an unique field
             field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
-            member_uids = np.array([getattr(member, field) for member in members])
-            validated = registry.objects.using(self._host._state.db).validate(
-                member_uids, field=field, mute=True
-            )
-            new_features = [members[int(i)] for i in np.argwhere(~validated).flatten()]
-            if len(new_features) > 0:
-                mute = True if len(new_features) > 10 else False
+            if hasattr(registry, "ontology_id") and parents:
+                field = "ontology_id"
+            if registry.__get_name_with_schema__() == "bionty.Organism":
+                parents = False
+            # this will be e.g. be a list of ontology_ids or uids
+            member_uids = list(members.values_list(field, flat=True))
+            # create records from ontology_id in order to populate parents
+            if field == "ontology_id" and len(member_uids) > 0:
+                # create from bionty
+                records = registry.from_values(member_uids, field=field)
+                if len(records) > 0:
+                    save(records, parents=parents)
+            validated = registry.validate(member_uids, field=field, mute=True)
+            new_members_uids = list(compress(member_uids, ~validated))
+            new_members = members.filter(**{f"{field}__in": new_members_uids}).all()
+            if new_members.count() > 0:
+                mute = True if new_members.count() > 10 else False
                 # transfer foreign keys needs to be run before transfer to default db
-                transfer_fk_to_default_db_bulk(new_features)
-                for feature in new_features:
+                transfer_fk_to_default_db_bulk(new_members)
+                for feature in new_members:
                     # not calling save=True here as in labels, because want to
                     # bulk save below
                     transfer_to_default_db(feature, mute=mute)
-                save(new_features)
+                logger.info(
+                    f"saving {new_members.count()} new {registry.__name__} records"
+                )
+                save(new_members, parents=parents)
 
             # create a new feature set from feature values using the same uid
             feature_set_self = FeatureSet.from_values(
                 member_uids, field=getattr(registry, field)
             )
+            if feature_set_self is None:
+                if hasattr(registry, "organism"):
+                    logger.warning(
+                        f"FeatureSet is not transferred, check if organism is set correctly: {feature_set}"
+                    )
                continue
             feature_set_self.uid = feature_set.uid
+            logger.info(f"saving {slot} featureset: {feature_set_self}")
             self._host.features.add_feature_set(feature_set_self, slot)
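The `_add_from` rewrite drops the NumPy index arithmetic in favor of `itertools.compress` over an inverted boolean mask, and, for ontology-backed registries, transfers by `ontology_id` so parent terms can be populated. The mask selection itself is plain stdlib plus NumPy and can be checked in isolation (the uids below are made up):

```python
from itertools import compress

import numpy as np

member_uids = ["ENSG00000139618", "ENSG00000141510", "ENSG00000012048"]
# registry.validate(...) returns a boolean array marking the uids that
# already exist in the target instance
validated = np.array([True, False, True])

# ~validated flips the NumPy mask (a plain Python list would not support ~);
# compress keeps the uids whose selector is truthy, i.e. the new ones
new_members_uids = list(compress(member_uids, ~validated))
print(new_members_uids)  # ['ENSG00000141510']
```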
lamindb/dev/_label_manager.py CHANGED
@@ -2,7 +2,7 @@ from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from lamin_utils import colors, logger
-from lnschema_core.models import Artifact, Data, Dataset, Feature, Registry
+from lnschema_core.models import Artifact, Collection, Data, Feature, Registry
 
 from lamindb._feature_set import dict_related_model_to_related_name
 from lamindb._from_values import _print_values
@@ -23,11 +23,12 @@ def get_labels_as_dict(self: Data):
     ).items():
         if related_name in {
             "feature_sets",
-            "files",
+            "artifacts",
             "input_of",
-            "datasets",
+            "collections",
             "source_of",
             "report_of",
+            "environment_of",
         }:
             continue
         if self.id is not None:
@@ -49,13 +50,16 @@ def print_labels(self: Data):
         return ""
 
 
-def transfer_add_labels(labels, features_lookup_self, self, row):
+def transfer_add_labels(labels, features_lookup_self, self, row, parents: bool = True):
     def transfer_single_registry(validated_labels, new_labels):
         # here the new labels are transferred to the self db
         if len(new_labels) > 0:
             transfer_fk_to_default_db_bulk(new_labels)
             for label in new_labels:
                 transfer_to_default_db(label, mute=True)
+            # not saving parents for Organism during transfer
+            registry = new_labels[0].__class__
+            logger.info(f"saving {len(new_labels)} new {registry.__name__} records")
             save(new_labels)
         # link labels records from self db
         self._host.labels.add(
@@ -64,7 +68,7 @@ def transfer_add_labels(labels, features_lookup_self, self, row):
         )
 
     # validate labels on the default db
-    result = validate_labels(labels)
+    result = validate_labels(labels, parents=parents)
     if isinstance(result, Dict):
         for _, (validated_labels, new_labels) in result.items():
             transfer_single_registry(validated_labels, new_labels)
@@ -72,17 +76,34 @@ def transfer_add_labels(labels, features_lookup_self, self, row):
         transfer_single_registry(*result)
 
 
-def validate_labels(labels: Union[QuerySet, List, Dict]):
+def validate_labels(labels: Union[QuerySet, List, Dict], parents: bool = True):
     def validate_labels_registry(
-        labels: Union[QuerySet, List, Dict],
+        labels: Union[QuerySet, List, Dict], parents: bool = True
     ) -> Tuple[List[str], List[str]]:
         if len(labels) == 0:
             return [], []
         registry = labels[0].__class__
         field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
+        if hasattr(registry, "ontology_id") and parents:
+            field = "ontology_id"
+        if registry.__get_name_with_schema__() == "bionty.Organism":
+            parents = False
+        # if the field value is None, use uid field
         label_uids = np.array(
             [getattr(label, field) for label in labels if label is not None]
         )
+        # save labels from ontology_ids so that parents are populated
+        if field == "ontology_id" and len(label_uids) > 0:
+            try:
+                records = registry.from_values(label_uids, field=field)
+                if len(records) > 0:
+                    save(records, parents=parents)
+            except Exception:
+                pass
+            field = "uid"
+            label_uids = np.array(
+                [getattr(label, field) for label in labels if label is not None]
+            )
         validated = registry.validate(label_uids, field=field, mute=True)
         validated_uids = label_uids[validated]
         validated_labels = registry.filter(**{f"{field}__in": validated_uids}).list()
@@ -92,9 +113,11 @@ def validate_labels(labels: Union[QuerySet, List, Dict]):
     if isinstance(labels, Dict):
         result = {}
         for registry, labels_registry in labels.items():
-            result[registry] = validate_labels_registry(labels_registry)
+            result[registry] = validate_labels_registry(
+                labels_registry, parents=parents
+            )
     else:
-        return validate_labels_registry(labels)
+        return validate_labels_registry(labels, parents=parents)
 
 
 class LabelManager:
@@ -107,7 +130,7 @@ class LabelManager:
     See :class:`~lamindb.dev.Data` for more information.
     """
 
-    def __init__(self, host: Union[Artifact, Dataset]):
+    def __init__(self, host: Union[Artifact, Collection]):
         self._host = host
 
     def __repr__(self) -> str:
@@ -150,8 +173,8 @@ class LabelManager:
 
         return get_labels(self._host, feature=feature, mute=mute, flat_names=flat_names)
 
-    def add_from(self, data: Data):
-        """Transfer labels from a file or dataset.
+    def add_from(self, data: Data, parents: bool = True):
+        """Transfer labels from a file or collection.
 
         Examples:
             >>> file1 = ln.Artifact(pd.DataFrame(index=[0, 1]))
@@ -185,13 +208,15 @@ class LabelManager:
             labels = labels.all()
             if len(labels) == 0:
                 continue
-            validated_labels, new_labels = validate_labels(labels.all())
+            validated_labels, new_labels = validate_labels(
+                labels.all(), parents=parents
+            )
             if len(new_labels) > 0:
                 transfer_fk_to_default_db_bulk(new_labels)
                 for label in new_labels:
                     transfer_to_default_db(label, mute=True)
-                save(new_labels)
-            # this should not occur as file and dataset should have the same attributes
+                save(new_labels, parents=parents)
+            # this should not occur as file and collection should have the same attributes
             # but this might not be true for custom schema
             labels_list = validated_labels + new_labels
            if hasattr(self._host, related_name):
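Together, `validate_labels` and `add_from` now thread a `parents` flag through label transfer: ontology-backed labels are re-created from their `ontology_id` so parent terms come along, except for `bionty.Organism`, where parents are always skipped. A sketch in the style of the docstring example above, assuming an initialized instance (data and descriptions are illustrative):

```python
import pandas as pd

import lamindb as ln

file1 = ln.Artifact(pd.DataFrame(index=[0, 1]), description="file1")
file1.save()
file2 = ln.Artifact(pd.DataFrame(index=[0, 1]), description="file2")
file2.save()

# transfer file1's labels to file2, pulling in ontology parents (default)
file2.labels.add_from(file1)

# or skip fetching parent records
file2.labels.add_from(file1, parents=False)
```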