lamindb 0.48a3__py3-none-any.whl → 0.48.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/_from_values.py CHANGED
@@ -2,6 +2,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
2
2
 
3
3
  import pandas as pd
4
4
  from django.core.exceptions import FieldDoesNotExist
5
+ from django.db.models import Case, When
5
6
  from django.db.models.query_utils import DeferredAttribute as Field
6
7
  from lamin_utils import colors, logger
7
8
  from lnschema_core.models import ORM, Feature
@@ -17,7 +18,7 @@ def get_or_create_records(
17
18
  *,
18
19
  from_bionty: bool = False,
19
20
  **kwargs,
20
- ) -> List:
21
+ ) -> List[ORM]:
21
22
  """Get or create records from iterables."""
22
23
  upon_create_search_names = settings.upon_create_search_names
23
24
  settings.upon_create_search_names = False
@@ -59,7 +60,7 @@ def get_or_create_records(
59
60
  params["type"] = str(types[value])
60
61
  records.append(ORM(**params, **kwargs))
61
62
  s = "" if len(unmapped_values) == 1 else "s"
62
- print_unmapped_values = ", ".join(unmapped_values[:7])
63
+ print_unmapped_values = ", ".join(unmapped_values[:10])
63
64
  if len(unmapped_values) > 10:
64
65
  print_unmapped_values += ", ..."
65
66
  additional_info = " "
@@ -69,6 +70,27 @@ def get_or_create_records(
69
70
  f"Created {colors.yellow(f'{len(unmapped_values)} {ORM.__name__} record{s}')} for{additional_info}" # noqa
70
71
  f"{colors.yellow(f'{field_name}{s}')}: {print_unmapped_values}" # noqa
71
72
  )
73
+ if ORM.__module__.startswith("lnschema_bionty."):
74
+ if isinstance(iterable, pd.Series):
75
+ feature = iterable.name
76
+ else:
77
+ logger.warning(
78
+ "Did not receive values as pd.Series, inferring feature from"
79
+ f" reference ORM: {ORM.__name__}"
80
+ )
81
+ feature = ORM.__name__.lower()
82
+ if isinstance(feature, str):
83
+ feature_name = feature
84
+ feature = Feature.select(name=feature).one_or_none()
85
+ elif feature is not None:
86
+ feature_name = feature.name
87
+ if feature is not None:
88
+ for record in records:
89
+ record._feature = feature
90
+ if feature_name is not None:
91
+ for record in records:
92
+ record._feature = feature_name
93
+ logger.info(f"Mapping records to feature '{feature_name}'")
72
94
  return records
73
95
  finally:
74
96
  settings.upon_create_search_names = upon_create_search_names
@@ -97,10 +119,14 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
97
119
  syn_msg = ""
98
120
  if len(syn_mapper) > 0:
99
121
  s = "" if len(syn_mapper) == 1 else "s"
122
+ names = list(syn_mapper.keys())
123
+ print_values = ", ".join(names[:10])
124
+ if len(names) > 10:
125
+ print_values += ", ..."
100
126
  syn_msg = (
101
127
  "Loaded"
102
128
  f" {colors.green(f'{len(syn_mapper)} {model.__name__} record{s}')} that" # noqa
103
- f" matched {colors.green('synonyms')}"
129
+ f" matched {colors.green('synonyms')}: {print_values}"
104
130
  )
105
131
  iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
106
132
 
@@ -112,22 +138,37 @@ def get_existing_records(iterable_idx: pd.Index, field: Field, kwargs: Dict = {}
112
138
 
113
139
  from ._select import select
114
140
 
115
- stmt = select(model, **condition)
141
+ query_set = select(model, **condition)
142
+
143
+ # now we have to sort the list of queried records
144
+ preserved = Case(
145
+ *[
146
+ When(**{field_name: value}, then=pos)
147
+ for pos, value in enumerate(iterable_idx)
148
+ ]
149
+ )
150
+ records = query_set.order_by(preserved).list()
116
151
 
117
- records = stmt.list() # existing records
118
152
  n_name = len(records) - len(syn_mapper)
153
+ names = [getattr(record, field_name) for record in records]
154
+ names = [name for name in names if name not in syn_mapper.values()]
119
155
  if n_name > 0:
120
156
  s = "" if n_name == 1 else "s"
157
+ print_values = ", ".join(names[:10])
158
+ if len(names) > 10:
159
+ print_values += ", ..."
121
160
  logger.info(
122
161
  "Loaded"
123
162
  f" {colors.green(f'{n_name} {model.__name__} record{s}')} that"
124
- f" matched {colors.green(f'{field_name}')}"
163
+ f" matched {colors.green(f'{field_name}')}: {print_values}"
125
164
  )
126
165
  # make sure that synonyms logging appears after the field logging
127
166
  if len(syn_msg) > 0:
128
167
  logger.info(syn_msg)
129
168
 
130
- existing_values = iterable_idx.intersection(stmt.values_list(field_name, flat=True))
169
+ existing_values = iterable_idx.intersection(
170
+ query_set.values_list(field_name, flat=True)
171
+ )
131
172
  nonexist_values = iterable_idx.difference(existing_values)
132
173
 
133
174
  return records, nonexist_values
@@ -161,10 +202,14 @@ def create_records_from_bionty(
161
202
  msg_syn: str = ""
162
203
  if len(syn_mapper) > 0:
163
204
  s = "" if len(syn_mapper) == 1 else "s"
205
+ names = list(syn_mapper.keys())
206
+ print_values = ", ".join(names[:10])
207
+ if len(names) > 10:
208
+ print_values += ", ..."
164
209
  msg_syn = (
165
210
  "Loaded"
166
211
  f" {colors.purple(f'{len(syn_mapper)} {model.__name__} record{s} from Bionty')} that" # noqa
167
- f" matched {colors.purple('synonyms')}"
212
+ f" matched {colors.purple('synonyms')}: {print_values}"
168
213
  )
169
214
 
170
215
  iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
@@ -179,26 +224,24 @@ def create_records_from_bionty(
179
224
  for bk in bionty_kwargs:
180
225
  records.append(model(**bk, **kwargs))
181
226
 
182
- # logging of BiontySource linking
183
- source_msg = (
184
- ""
185
- if kwargs.get("bionty_source") is None
186
- else f" (bionty_source_id={kwargs.get('bionty_source').id})" # type:ignore # noqa
187
- )
188
-
189
227
  # number of records that matches field (not synonyms)
190
228
  n_name = len(records) - len(syn_mapper)
229
+ names = [getattr(record, field_name) for record in records]
230
+ names = [name for name in names if name not in syn_mapper.values()]
191
231
  if n_name > 0:
192
232
  s = "" if n_name == 1 else "s"
233
+ print_values = ", ".join(names[:10])
234
+ if len(names) > 10:
235
+ print_values += ", ..."
193
236
  msg = (
194
237
  "Loaded"
195
238
  f" {colors.purple(f'{n_name} {model.__name__} record{s} from Bionty')} that" # noqa
196
- f" matched {colors.purple(f'{field_name}')}"
239
+ f" matched {colors.purple(f'{field_name}')}: {print_values}"
197
240
  )
198
- logger.info(msg + source_msg)
241
+ logger.info(msg)
199
242
  # make sure that synonyms logging appears after the field logging
200
243
  if len(msg_syn) > 0:
201
- logger.info(msg_syn + source_msg)
244
+ logger.info(msg_syn)
202
245
  # warning about multi matches
203
246
  if len(multi_msg) > 0:
204
247
  logger.warning(multi_msg)
lamindb/_label.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from typing import List, Optional, Union
2
2
 
3
3
  import pandas as pd
4
+ from lamin_utils import logger
4
5
  from lamindb_setup.dev._docs import doc_args
5
6
  from lnschema_core import Feature, Label
6
7
  from lnschema_core.types import ListLike
@@ -11,6 +12,40 @@ from . import _TESTING
11
12
  from ._from_values import get_or_create_records, index_iterable
12
13
 
13
14
 
15
+ def __init__(self, *args, **kwargs):
16
+ if len(args) == len(self._meta.concrete_fields):
17
+ super(Label, self).__init__(*args, **kwargs)
18
+ return None
19
+ # now we proceed with the user-facing constructor
20
+ if len(args) > 0:
21
+ raise ValueError("Only one non-keyword arg allowed")
22
+ name: Optional[str] = kwargs.pop("name") if "name" in kwargs else None
23
+ description: Optional[str] = (
24
+ kwargs.pop("description") if "description" in kwargs else None
25
+ )
26
+ feature: Optional[str] = kwargs.pop("feature") if "feature" in kwargs else None
27
+ feature_id: Optional[str] = (
28
+ kwargs.pop("feature_id") if "feature_id" in kwargs else None
29
+ )
30
+ if len(kwargs) > 0:
31
+ raise ValueError("Only name, description, feature are valid keyword arguments")
32
+ # continue
33
+ if feature is None and feature_id is None:
34
+ logger.warning("Consider passing a corresponding feature for your label!")
35
+ if isinstance(feature, str):
36
+ feature = Feature.select(name=feature).one_or_none()
37
+ if feature is None:
38
+ raise ValueError(
39
+ f"Feature with name {feature} does not exist, please create it:"
40
+ f" ln.Feature(name={feature}, type='float')"
41
+ )
42
+ else:
43
+ feature_id = feature.id
44
+ super(Label, self).__init__(
45
+ name=name, description=description, feature_id=feature_id
46
+ )
47
+
48
+
14
49
  @classmethod # type:ignore
15
50
  @doc_args(Label.from_values.__doc__)
16
51
  def from_values(
@@ -33,6 +68,7 @@ def from_values(
33
68
 
34
69
 
35
70
  METHOD_NAMES = [
71
+ "__init__",
36
72
  "from_values",
37
73
  ]
38
74
 
lamindb/_manager.py CHANGED
@@ -43,9 +43,9 @@ class Manager(models.Manager):
43
43
  else:
44
44
  return [item for item in self.values_list(field, flat=True)]
45
45
 
46
- def df(self):
46
+ def df(self, **kwargs):
47
47
  """Convert to DataFrame."""
48
- return self.all().df()
48
+ return self.all().df(**kwargs)
49
49
 
50
50
 
51
51
  setattr(models.Manager, "list", Manager.list)
lamindb/_orm.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
5
5
  from django.core.exceptions import FieldDoesNotExist
6
6
  from django.db.models import Manager, QuerySet
7
7
  from django.db.models.query_utils import DeferredAttribute as Field
8
- from lamin_utils import logger
8
+ from lamin_utils import colors, logger
9
9
  from lamin_utils._lookup import Lookup
10
10
  from lamin_utils._search import search as base_search
11
11
  from lamindb_setup.dev._docs import doc_args
@@ -16,7 +16,9 @@ from lnschema_core.types import ListLike, StrField
16
16
  from lamindb.dev.utils import attach_func_to_class_method
17
17
 
18
18
  from . import _TESTING
19
+ from ._feature_manager import create_features_df
19
20
  from ._from_values import _has_species_field, get_or_create_records
21
+ from .dev._settings import settings
20
22
 
21
23
  IPYTHON = getattr(builtins, "__IPYTHON__", False)
22
24
 
@@ -395,9 +397,40 @@ def map_synonyms(
395
397
  )
396
398
 
397
399
 
400
+ def _labels_with_feature_names(labels: Union[QuerySet, Manager]) -> Dict:
401
+ from django.db.models import F
402
+
403
+ df = labels.annotate(feature_name=F("feature__name")).df()
404
+ return df.groupby("feature_name")["name"].apply(list).to_dict()
405
+
406
+
398
407
  def describe(self):
399
- model_name = self.__class__.__name__
408
+ model_name = colors.green(self.__class__.__name__)
400
409
  msg = ""
410
+
411
+ def dict_related_model_to_related_name(orm):
412
+ d: Dict = {
413
+ f"{i.related_model.__get_schema_name__()}.{i.related_model.__name__}": (
414
+ i.related_name
415
+ )
416
+ for i in orm._meta.related_objects
417
+ if i.related_name is not None
418
+ }
419
+ d.update(
420
+ {
421
+ f"{i.related_model.__get_schema_name__()}.{i.related_model.__name__}": (
422
+ i.name
423
+ )
424
+ for i in orm._meta.many_to_many
425
+ if i.name is not None
426
+ }
427
+ )
428
+
429
+ return d
430
+
431
+ file_related_models = dict_related_model_to_related_name(self)
432
+
433
+ # Display the file record
401
434
  fields = self._meta.fields
402
435
  direct_fields = []
403
436
  foreign_key_fields = []
@@ -406,42 +439,126 @@ def describe(self):
406
439
  foreign_key_fields.append(f.name)
407
440
  else:
408
441
  direct_fields.append(f.name)
442
+
443
+ # Display Provenance
409
444
  # display line by line the foreign key fields
445
+ emojis = {"storage": "💾", "created_by": "👤", "transform": "💫", "run": "🚗"}
410
446
  if len(foreign_key_fields) > 0:
411
447
  record_msg = f"{model_name}({''.join([f'{i}={self.__getattribute__(i)}, ' for i in direct_fields])})" # noqa
412
448
  msg += f"{record_msg.rstrip(', )')})\n\n"
413
449
 
414
- msg += "One/Many-to-One:\n "
450
+ msg += f"{colors.green('Provenance')}:\n "
415
451
  related_msg = "".join(
416
- [f"🔗 {i}: {self.__getattribute__(i)}\n " for i in foreign_key_fields]
452
+ [
453
+ f"{emojis.get(i, '📎')} {i}: {self.__getattribute__(i)}\n "
454
+ for i in foreign_key_fields
455
+ ]
417
456
  )
418
457
  msg += related_msg
458
+ # input of
459
+ if self.input_of.exists():
460
+ values = [format_datetime(i.run_at) for i in self.input_of.all()]
461
+ msg += f"⬇️ input_of ({colors.italic('core.Run')}): {values}\n "
419
462
  msg = msg.rstrip(" ")
420
463
 
421
- # display many-to-many relationship objects
422
- # fields in the model definition
423
- related_names = [i.name for i in self._meta.many_to_many]
424
- # fields back linked
425
- related_names += [i.related_name for i in self._meta.related_objects]
426
- msg += "Many-to-Many:\n"
427
- for related_name in related_names:
428
- related_objects = self.__getattribute__(related_name)
429
- count = related_objects.count()
430
- if count > 0:
431
- try:
432
- field = get_default_str_field(related_objects)
433
- except ValueError:
434
- field = "id"
435
- objects_list = list(related_objects.values_list(field, flat=True)[:10])
436
- if field == "created_at":
437
- objects_list = [format_datetime(i) for i in objects_list]
438
- msg_objects = f" 🔗 {related_name} ({count}): {objects_list}\n"
439
- if count > 10:
440
- msg_objects = msg_objects.replace("]", " ... ]")
441
- msg += msg_objects
464
+ if not self.feature_sets.exists():
465
+ print(msg)
466
+ return
467
+ else:
468
+ feature_sets_related_models = dict_related_model_to_related_name(
469
+ self.feature_sets.first()
470
+ )
471
+ # Display Features by slot
472
+ msg += f"{colors.green('Features')}:\n"
473
+ # var
474
+ feature_sets = self.feature_sets.exclude(ref_orm="Feature")
475
+ if feature_sets.exists():
476
+ for feature_set in feature_sets.all():
477
+ key = f"{feature_set.ref_schema}.{feature_set.ref_orm}"
478
+ related_name = feature_sets_related_models.get(key)
479
+ values = (
480
+ feature_set.__getattribute__(related_name)
481
+ .all()[:5]
482
+ .list(feature_set.ref_field)
483
+ )
484
+ slots = self.feature_sets.through.objects.filter(
485
+ file=self, feature_set=feature_set
486
+ ).list("slot")
487
+ for slot in slots:
488
+ if slot == "var":
489
+ slot += " (X)"
490
+ msg += f" 🗺️ {colors.bold(slot)}:\n"
491
+ ref = colors.italic(f"{key}.{feature_set.ref_field}")
492
+ msg += f" 🔗 index ({feature_set.n}, {ref}): {values}\n".replace(
493
+ "]", "...]"
494
+ )
495
+
496
+ # obs
497
+ # ref_orm=Feature, combine all features into one dataframe
498
+ feature_sets = self.feature_sets.filter(ref_orm="Feature").all()
499
+ if feature_sets.exists():
500
+ features_df = create_features_df(
501
+ file=self, feature_sets=feature_sets.all(), exclude=True
502
+ )
503
+ for slot in features_df["slot"].unique():
504
+ df_slot = features_df[features_df.slot == slot]
505
+ if slot == "obs":
506
+ slot += " (metadata)"
507
+ msg += f" 🗺️ {colors.bold(slot)}:\n"
508
+ df_label_index = df_slot[
509
+ (df_slot["labels_orm"] == "Label")
510
+ & (df_slot["labels_schema"] == "core")
511
+ ].index
512
+
513
+ # for labels
514
+ if len(df_label_index) > 0:
515
+ labels_schema = "core"
516
+ labels_orm = "Label"
517
+ key = f"{labels_schema}.{labels_orm}"
518
+ related_name = file_related_models.get(key)
519
+ related_objects = self.__getattribute__(related_name)
520
+ labels = _labels_with_feature_names(related_objects)
521
+ msg_objects = ""
522
+ for k, v in labels.items():
523
+ msg_objects_k = (
524
+ f" 🔗 {k} ({len(v)}, {colors.italic(key)}): {v[:5]}\n"
525
+ )
526
+ if len(v) > 5:
527
+ msg_objects_k = msg_objects_k.replace("]", " ... ]")
528
+ msg_objects += msg_objects_k
529
+ msg += msg_objects
530
+
531
+ # for non-labels
532
+ nonlabel_index = df_slot.index.difference(df_label_index)
533
+ if len(nonlabel_index) == 0:
534
+ continue
535
+ df_nonlabels = df_slot.loc[nonlabel_index]
536
+ df_nonlabels = (
537
+ df_nonlabels.groupby(["labels_schema", "labels_orm"], group_keys=False)[
538
+ "name"
539
+ ]
540
+ .apply(lambda x: "|".join(x))
541
+ .reset_index()
542
+ )
543
+ for _, row in df_nonlabels.iterrows():
544
+ key = f"{row.labels_schema}.{row.labels_orm}"
545
+ related_name = file_related_models.get(key)
546
+ related_objects = self.__getattribute__(related_name)
547
+ count = related_objects.count()
548
+ count_str = f"{count}, {colors.italic(f'{key}')}"
549
+ try:
550
+ field = get_default_str_field(related_objects)
551
+ except ValueError:
552
+ field = "id"
553
+ values = list(related_objects.values_list(field, flat=True)[:5])
554
+ msg_objects = f" 🔗 {row['name']} ({count_str}): {values}\n"
555
+ msg += msg_objects
442
556
  msg = msg.rstrip("\n")
443
- msg = msg.rstrip("Many-to-Many:")
444
- print(msg)
557
+ msg = msg.rstrip("Features:")
558
+ verbosity = settings.verbosity
559
+ settings.verbosity = 2
560
+ logger.info(msg)
561
+ settings.verbosity = verbosity
445
562
 
446
563
 
447
564
  def set_abbr(self, value: str):
lamindb/_queryset.py CHANGED
@@ -100,6 +100,8 @@ class QuerySet(models.QuerySet):
100
100
  df.run_at = format_and_convert_to_local_time(df.run_at)
101
101
  if "id" in df.columns:
102
102
  df = df.set_index("id")
103
+ if len(df) == 0:
104
+ return df
103
105
  if include is not None:
104
106
  if isinstance(include, str):
105
107
  include = [include]
@@ -121,7 +123,7 @@ class QuerySet(models.QuerySet):
121
123
  if field.field.model != ORM
122
124
  else field.field.related_model
123
125
  )
124
- if field.field.model == related_ORM:
126
+ if ORM == related_ORM:
125
127
  left_side_link_model = f"from_{ORM.__name__.lower()}"
126
128
  values_expression = f"to_{ORM.__name__.lower()}__{lookup_str}"
127
129
  else:
@@ -135,7 +137,7 @@ class QuerySet(models.QuerySet):
135
137
  link_groupby = link_df.groupby(left_side_link_model)[
136
138
  values_expression
137
139
  ].apply(list)
138
- df = pd.concat((link_groupby, df), axis=1)
140
+ df = pd.concat((link_groupby, df), axis=1, join="inner")
139
141
  df.rename(columns={values_expression: expression}, inplace=True)
140
142
  return df
141
143
 
lamindb/_save.py CHANGED
@@ -76,13 +76,23 @@ def save(records: Iterable[ORM], **kwargs) -> None: # type: ignore
76
76
  non_files_with_parents = {r for r in non_files if hasattr(r, "_parents")}
77
77
 
78
78
  if len(non_files_with_parents) > 0 and kwargs.get("parents") is not False:
79
- # save the record with parents one by one
80
- logger.warning(
81
- "Now recursing through parents: "
82
- "this only happens once, but is much slower than bulk saving"
83
- )
84
- for record in non_files_with_parents:
85
- record._save_ontology_parents()
79
+ # this can only happen within lnschema_bionty right now!!
80
+ # we might extend to core lamindb later
81
+ import lnschema_bionty as lb
82
+
83
+ if kwargs.get("parents") or (
84
+ kwargs.get("parents") is None and lb.settings.auto_save_parents
85
+ ):
86
+ # save the record with parents one by one
87
+ logger.warning(
88
+ "Now recursing through parents: "
89
+ "this only happens once, but is much slower than bulk saving"
90
+ )
91
+ logger.hint(
92
+ "You can switch this off via: lb.settings.auto_save_parents = False"
93
+ )
94
+ for record in non_files_with_parents:
95
+ record._save_ontology_parents()
86
96
 
87
97
  if files:
88
98
  with transaction.atomic():
lamindb/dev/__init__.py CHANGED
@@ -6,16 +6,20 @@
6
6
  ORM
7
7
  QuerySet
8
8
  Manager
9
+ FeatureManager
9
10
  datasets
10
11
  hashing
11
12
  storage
12
13
  Settings
14
+ run_context
13
15
  """
14
16
 
15
17
  from lnschema_core.models import ORM
16
18
 
19
+ from lamindb._feature_manager import FeatureManager
17
20
  from lamindb._manager import Manager
18
21
  from lamindb._queryset import QuerySet
19
22
 
23
+ from .._context import run_context
20
24
  from . import datasets # noqa
21
25
  from ._settings import Settings