lamindb 0.76.15__py3-none-any.whl → 0.77.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/__init__.py CHANGED
@@ -43,7 +43,7 @@ Modules and settings.
  """

  # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
- __version__ = "0.76.15"
+ __version__ = "0.77.0"

  import os as _os

@@ -79,7 +79,7 @@ if _check_instance_setup(from_module="lnschema_core"):
  from . import core # isort: split
  from . import (
  _artifact,
- _can_validate,
+ _can_curate,
  _collection,
  _curate,
  _feature,
lamindb/_artifact.py CHANGED
@@ -111,7 +111,12 @@ def process_pathlike(
  # for the storage root: the bucket
  if not isinstance(filepath, LocalPathClasses):
  # for a cloud path, new_root is always the bucket name
- new_root = list(filepath.parents)[-1]
+ if filepath.protocol == "hf":
+ hf_path = filepath.fs.resolve_path(filepath.as_posix())
+ hf_path.path_in_repo = ""
+ new_root = "hf://" + hf_path.unresolve()
+ else:
+ new_root = list(filepath.parents)[-1]
  # do not register remote storage locations on hub if the current instance
  # is not managed on the hub
  storage_settings, _ = init_storage(
@@ -213,9 +218,9 @@ def get_stat_or_artifact(
  if stat is not None:
  # convert UPathStatResult to fsspec info dict
  stat = stat.as_info()
- if "ETag" in stat: # is file
+ if (store_type := stat["type"]) == "file":
  size, hash, hash_type = get_stat_file_cloud(stat)
- elif stat["type"] == "directory":
+ elif store_type == "directory":
  size, hash, hash_type, n_objects = get_stat_dir_cloud(path)
  if hash is None:
  logger.warning(f"did not add hash for {path}")
@@ -240,7 +245,7 @@ def get_stat_or_artifact(
  .order_by("-created_at")
  .all()
  )
- artifact_with_same_hash_exists = len(result.filter(hash=hash).all()) > 0
+ artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
  if not artifact_with_same_hash_exists and len(result) > 0:
  logger.important(
  f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
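The new `hf://` branch in `process_pathlike` derives the storage root for a Hugging Face path by resolving it and clearing `path_in_repo`, so only the repo itself is registered as the root. A rough standalone sketch of the resulting mapping, with plain string handling standing in for the actual `resolve_path()`/`unresolve()` calls (`hf_storage_root` is a hypothetical helper, not lamindb API):

    def hf_storage_root(path: str) -> str:
        # approximate the root that the new branch computes for an hf:// path
        assert path.startswith("hf://")
        parts = path[len("hf://"):].split("/")
        # "datasets"/"spaces" prefix is optional; model repos have none
        n = 3 if parts[0] in ("datasets", "spaces") else 2
        return "hf://" + "/".join(parts[:n])

    print(hf_storage_root("hf://datasets/acme/cell-atlas/data/part-0.parquet"))
    # -> hf://datasets/acme/cell-atlas

The other change in this file, `result.filter(hash=hash).count()`, lets the database count matching rows instead of materializing the whole queryset via `.all()` just to take its length.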
lamindb/_can_validate.py → lamindb/_can_curate.py RENAMED
@@ -8,7 +8,7 @@ import pandas as pd
  from django.core.exceptions import FieldDoesNotExist
  from lamin_utils import colors, logger
  from lamindb_setup.core._docs import doc_args
- from lnschema_core import CanValidate, Record
+ from lnschema_core import CanCurate, Record

  from ._from_values import _has_organism_field, _print_values, get_or_create_records
  from ._record import _queryset, get_name_field
@@ -23,7 +23,7 @@ if TYPE_CHECKING:

  # from_values doesn't apply for QuerySet or Manager
  @classmethod # type:ignore
- @doc_args(CanValidate.from_values.__doc__)
+ @doc_args(CanCurate.from_values.__doc__)
  def from_values(
  cls,
  values: ListLike,
@@ -49,7 +49,7 @@ def from_values(


  @classmethod # type: ignore
- @doc_args(CanValidate.inspect.__doc__)
+ @doc_args(CanCurate.inspect.__doc__)
  def inspect(
  cls,
  values: ListLike,
@@ -71,7 +71,7 @@ def inspect(


  @classmethod # type: ignore
- @doc_args(CanValidate.validate.__doc__)
+ @doc_args(CanCurate.validate.__doc__)
  def validate(
  cls,
  values: ListLike,
@@ -108,14 +108,14 @@ def _check_organism_db(organism: Record, using_key: str | None):

  def _concat_lists(values: ListLike) -> list[str]:
  """Concatenate a list of lists of strings into a single list."""
- if len(values) > 0 and isinstance(values, (list, pd.Series)):
- try:
- if isinstance(values[0], list):
- if isinstance(values, pd.Series):
- values = values.tolist()
- values = sum([v for v in values if isinstance(v, list)], [])
- except KeyError:
- pass
+ if isinstance(values, (list, pd.Series)) and len(values) > 0:
+ first_item = values[0] if isinstance(values, list) else values.iloc[0]
+ if isinstance(first_item, list):
+ if isinstance(values, pd.Series):
+ values = values.tolist()
+ values = [
+ v for sublist in values if isinstance(sublist, list) for v in sublist
+ ]
  return values


@@ -250,7 +250,7 @@ def _validate(
  f"Your {cls.__name__} registry is empty, consider populating it first!"
  )
  if hasattr(cls, "source_id"):
- msg += "\n → use `.import_from_source()` to import records from a source, e.g. a public ontology"
+ msg += "\n → use `.import_source()` to import records from a source, e.g. a public ontology"
  logger.warning(msg)
  return np.array([False] * len(values))

@@ -268,7 +268,7 @@ def _validate(


  @classmethod # type: ignore
- @doc_args(CanValidate.standardize.__doc__)
+ @doc_args(CanCurate.standardize.__doc__)
  def standardize(
  cls,
  values: ListLike,
@@ -388,7 +388,11 @@ def _standardize(

  try:
  registry._meta.get_field(synonyms_field)
- fields = {i for i in [field, return_field, synonyms_field] if i is not None}
+ fields = {
+ field_name
+ for field_name in [field, return_field, synonyms_field]
+ if field_name is not None
+ }
  df = _filter_query_based_on_organism(
  queryset=queryset,
  field=field,
@@ -445,14 +449,19 @@ def _standardize(
  if len(std_names_bt_mapper) > 0 and not mute:
  s = "" if len(std_names_bt_mapper) == 1 else "s"
  field_print = "synonym" if field == return_field else field
- warn_msg = (
- f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty:"
- f" {list(std_names_bt_mapper.keys())}"
+
+ reduced_mapped_keys_str = f"{list(std_names_bt_mapper.keys())[:10] + ['...'] if len(std_names_bt_mapper) > 10 else list(std_names_bt_mapper.keys())}"
+ truncated_note = (
+ " (output truncated)" if len(std_names_bt_mapper) > 10 else ""
  )
- warn_msg += (
- f"\n please add corresponding {registry._meta.model.__name__} records via"
- f" `.from_values({list(set(std_names_bt_mapper.values()))})`"
+
+ warn_msg = (
+ f"found {len(std_names_bt_mapper)} {field_print}{s} in Bionty{truncated_note}:"
+ f" {reduced_mapped_keys_str}\n"
+ f" please add corresponding {registry._meta.model.__name__} records via{truncated_note}:"
+ f" `.from_values({reduced_mapped_keys_str})`"
  )
+
  logger.warning(warn_msg)

  mapper.update(std_names_bt_mapper)
@@ -612,10 +621,10 @@ if ln_setup._TESTING: # type: ignore
  from inspect import signature

  SIGS = {
- name: signature(getattr(CanValidate, name))
+ name: signature(getattr(CanCurate, name))
  for name in METHOD_NAMES
  if not name.startswith("__")
  }

  for name in METHOD_NAMES:
- attach_func_to_class_method(name, CanValidate, globals())
+ attach_func_to_class_method(name, CanCurate, globals())
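The rewritten `_concat_lists` drops the broad `try/except KeyError` and the repeated-concatenation `sum(..., [])` in favor of an explicit check of the first element, so a `pd.Series` of lists is flattened the same way as a plain list of lists. A small standalone check of that logic (same body as the new version, minus the lamindb `ListLike` alias):

    import pandas as pd

    def concat_lists(values):
        # flatten one level only when the first element is itself a list
        if isinstance(values, (list, pd.Series)) and len(values) > 0:
            first_item = values[0] if isinstance(values, list) else values.iloc[0]
            if isinstance(first_item, list):
                if isinstance(values, pd.Series):
                    values = values.tolist()
                values = [
                    v for sublist in values if isinstance(sublist, list) for v in sublist
                ]
        return values

    print(concat_lists([["CD4", "CD8A"], ["FOXP3"]]))    # ['CD4', 'CD8A', 'FOXP3']
    print(concat_lists(pd.Series([["a"], ["b", "c"]])))  # ['a', 'b', 'c']
    print(concat_lists(["plain", "strings"]))            # unchanged: ['plain', 'strings']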
lamindb/_curate.py CHANGED
@@ -20,6 +20,7 @@ from .core.exceptions import ValidationError

  if TYPE_CHECKING:
  from collections.abc import Iterable
+ from typing import Any

  from lamindb_setup.core.types import UPathStr
  from lnschema_core.types import FieldAttr
@@ -226,7 +227,7 @@ class DataFrameCurator(BaseCurator):
  f"the following keys passed to {name} are not allowed: {nonval_keys}"
  )

- def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
+ def _save_columns(self, validated_only: bool = True) -> None:
  """Save column name records."""
  # Always save features specified as the fields keys
  update_registry(
@@ -238,7 +239,7 @@
  validated_only=False,
  source=self._sources.get("columns"),
  exclude=self._exclude.get("columns"),
- **kwargs,
+ **self._kwargs, # type: ignore
  )

  # Save the rest of the columns based on validated_only
@@ -255,7 +256,7 @@
  source=self._sources.get("columns"),
  exclude=self._exclude.get("columns"),
  warning=False, # Do not warn about missing columns, just an info message
- **kwargs,
+ **self._kwargs, # type: ignore
  )

  def add_new_from(self, key: str, organism: str | None = None, **kwargs):
@@ -292,7 +293,7 @@
  f"Feature {categorical} is not part of the fields!"
  )
  update_registry(
- values=flatten_unique(self._df[categorical]),
+ values=_flatten_unique(self._df[categorical]),
  field=self.fields[categorical],
  key=categorical,
  using_key=self._using_key,
@@ -305,7 +306,6 @@
  def _update_registry_all(self, validated_only: bool = True, **kwargs):
  """Save labels for all features."""
  for name in self.fields.keys():
- logger.info(f"saving validated records of '{name}'")
  self._update_registry(name, validated_only=validated_only, **kwargs)

  def validate(self, organism: str | None = None) -> bool:
@@ -436,12 +436,15 @@ class AnnDataCurator(DataFrameCurator):
  ) -> None:
  from lamindb_setup.core import upath

+ if isinstance(var_index, str):
+ raise TypeError("var_index parameter has to be a bionty field")
+
  from ._artifact import data_is_anndata

  if sources is None:
  sources = {}
  if not data_is_anndata(data):
- raise ValueError(
+ raise TypeError(
  "data has to be an AnnData object or a path to AnnData-like"
  )
  if isinstance(data, ad.AnnData):
@@ -451,6 +454,11 @@

  self._adata = backed_access(upath.create_path(data))

+ if "symbol" in str(var_index):
+ logger.warning(
+ "Curating gene symbols is discouraged. See FAQ for more details."
+ )
+
  self._data = data
  self._var_field = var_index
  super().__init__(
@@ -512,10 +520,8 @@

  def _update_registry_all(self, validated_only: bool = True, **kwargs):
  """Save labels for all features."""
- logger.info("saving validated records of 'var_index'")
  self._save_from_var_index(validated_only=validated_only, **self._kwargs)
  for name in self._obs_fields.keys():
- logger.info(f"saving validated terms of '{name}'")
  self._update_registry(name, validated_only=validated_only, **self._kwargs)

  def add_new_from_var_index(self, organism: str | None = None, **kwargs):
@@ -1229,7 +1235,7 @@ def validate_categories(
  if n_non_validated == 0:
  if n_validated == 0:
  logger.indent = ""
- logger.success(f"{key} is validated against {colors.italic(model_field)}")
+ logger.success(f"'{key}' is validated against {colors.italic(model_field)}")
  return True, []
  else:
  # validated values still need to be saved to the current instance
@@ -1434,8 +1440,8 @@ def save_artifact(
  return artifact


- def flatten_unique(series):
- """Flatten a pandas series if it contains lists."""
+ def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]:
+ """Flatten a Pandas series containing lists or single items into a unique list of elements."""
  result = set()

  for item in series:
@@ -1505,9 +1511,14 @@ def update_registry(

  public_records = [r for r in existing_and_public_records if r._state.adding]
  # here we check to only save the public records if they are from the specified source
- # we check the uid because r.source and soruce can be from different instances
+ # we check the uid because r.source and source can be from different instances
  if source:
  public_records = [r for r in public_records if r.source.uid == source.uid]
+
+ if public_records:
+ settings.verbosity = "info"
+ logger.info(f"saving validated records of '{key}'")
+ settings.verbosity = "error"
  ln_save(public_records)
  labels_saved["from public"] = [
  getattr(r, field.field.name) for r in public_records
@@ -1720,7 +1731,7 @@ def _save_organism(name: str): # pragma: no cover

  def _ref_is_name(field: FieldAttr) -> bool | None:
  """Check if the reference field is a name field."""
- from ._can_validate import get_name_field
+ from ._can_curate import get_name_field

  name_field = get_name_field(field.field.model)
  return field.field.name == name_field
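For `AnnDataCurator`, `var_index` must now be a registry field rather than a string, non-AnnData inputs raise `TypeError` instead of `ValueError`, and curating gene symbols triggers a warning. A hedged usage sketch: it assumes the public entry point `ln.Curator.from_anndata` and the `bionty` field accessors shown, neither of which appears in this diff, and it requires a connected lamindb instance:

    import anndata as ad
    import bionty as bt
    import lamindb as ln
    import numpy as np
    import pandas as pd

    adata = ad.AnnData(
        X=np.zeros((2, 2)),
        var=pd.DataFrame(index=["ENSG00000139618", "ENSG00000141510"]),
        obs=pd.DataFrame({"cell_type": ["B cell", "T cell"]}, index=["c1", "c2"]),
    )

    # assumed usage: var_index takes a registry field such as bt.Gene.ensembl_gene_id;
    # passing the string "ensembl_gene_id" now raises TypeError, and pointing it at
    # bt.Gene.symbol would log the new gene-symbol warning
    curator = ln.Curator.from_anndata(
        adata,
        var_index=bt.Gene.ensembl_gene_id,
        categoricals={"cell_type": bt.CellType.name},
        organism="human",
    )
    curator.validate()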
lamindb/_finish.py CHANGED
@@ -103,10 +103,10 @@ def save_context_core(

  # for scripts, things are easy
  is_consecutive = True
- is_notebook = transform.type == "notebook"
+ is_ipynb = filepath.suffix == ".ipynb"
  source_code_path = filepath
  # for notebooks, we need more work
- if is_notebook:
+ if is_ipynb:
  try:
  import jupytext
  from nbproject.dev import (
@@ -198,7 +198,7 @@ def save_context_core(
  run.finished_at = datetime.now(timezone.utc)

  # track report and set is_consecutive
- if not is_notebook:
+ if not is_ipynb:
  run.is_consecutive = True
  run.save()
  else:
@@ -234,8 +234,15 @@
  # finalize
  if not from_cli:
  run_time = run.finished_at - run.started_at
+ days = run_time.days
+ seconds = run_time.seconds
+ hours = seconds // 3600
+ minutes = (seconds % 3600) // 60
+ secs = seconds % 60
+ formatted_run_time = f"{days}d {hours}h {minutes}m {secs}s"
+
  logger.important(
- f"finished Run('{run.uid[:8]}') after {run_time} at {format_field_value(run.finished_at)}"
+ f"finished Run('{run.uid[:8]}') after {formatted_run_time} at {format_field_value(run.finished_at)}"
  )
  if ln_setup.settings.instance.is_on_hub:
  identifier = ln_setup.settings.instance.slug
@@ -244,9 +251,7 @@
  )
  if not from_cli:
  thing, name = (
- ("notebook", "notebook.ipynb")
- if is_notebook
- else ("script", "script.py")
+ ("notebook", "notebook.ipynb") if is_ipynb else ("script", "script.py")
  )
  logger.important(
  f"if you want to update your {thing} without re-running it, use `lamin save {name}`"
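`save_context_core` now reports the run duration as days, hours, minutes, and seconds rather than printing the raw `timedelta`. The same arithmetic in isolation (variable names mirror the diff):

    from datetime import timedelta

    run_time = timedelta(days=1, hours=2, minutes=3, seconds=4)
    days = run_time.days
    seconds = run_time.seconds          # seconds within the last partial day
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    secs = seconds % 60
    print(f"{days}d {hours}h {minutes}m {secs}s")  # -> 1d 2h 3m 4s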
lamindb/_parents.py CHANGED
@@ -19,7 +19,14 @@ if TYPE_CHECKING:
  LAMIN_GREEN_LIGHTER = "#10b981"
  LAMIN_GREEN_DARKER = "#065f46"
  GREEN_FILL = "honeydew"
- TRANSFORM_EMOJIS = {"notebook": "📔", "app": "🖥️", "pipeline": "🧩"}
+ TRANSFORM_EMOJIS = {
+ "notebook": "📔",
+ "upload": "🖥️",
+ "pipeline": "🧩",
+ "script": "📝",
+ "function": "🔧",
+ "glue": "🧲",
+ }
  is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)

lamindb/_query_set.py CHANGED
@@ -1,7 +1,9 @@
  from __future__ import annotations

  from collections import UserList
- from typing import TYPE_CHECKING, NamedTuple
+ from collections.abc import Iterable
+ from collections.abc import Iterable as IterableType
+ from typing import TYPE_CHECKING, Any, NamedTuple

  import pandas as pd
  from django.db import models
@@ -10,7 +12,7 @@ from lamin_utils import colors, logger
  from lamindb_setup.core._docs import doc_args
  from lnschema_core.models import (
  Artifact,
- CanValidate,
+ CanCurate,
  Collection,
  IsVersioned,
  Record,
@@ -69,8 +71,33 @@ def one_helper(self):
  return self[0]


- def process_expressions(registry: Registry, expressions: dict) -> dict:
- if registry in {Artifact, Collection}:
+ def process_expressions(queryset: QuerySet, expressions: dict) -> dict:
+ def _map_databases(value: Any, key: str, target_db: str) -> tuple[str, Any]:
+ if isinstance(value, Record):
+ if value._state.db != target_db:
+ logger.warning(
+ f"passing record from database {value._state.db} to query {target_db}, matching on uid '{value.uid}'"
+ )
+ return f"{key}__uid", value.uid
+ return key, value
+
+ if (
+ key.endswith("__in")
+ and isinstance(value, IterableType)
+ and not isinstance(value, str)
+ ):
+ if any(isinstance(v, Record) and v._state.db != target_db for v in value):
+ logger.warning(
+ f"passing records from another database to query {target_db}, matching on uids"
+ )
+ return key.replace("__in", "__uid__in"), [
+ v.uid if isinstance(v, Record) else v for v in value
+ ]
+ return key, value
+
+ return key, value
+
+ if queryset.model in {Artifact, Collection}:
  # visibility is set to 0 unless expressions contains id or uid equality
  if not (
  "id" in expressions
@@ -87,7 +114,17 @@ def process_expressions(registry: Registry, expressions: dict) -> dict:
  # sense for a non-NULLABLE column
  elif visibility in expressions and expressions[visibility] is None:
  expressions.pop(visibility)
- return expressions
+ if queryset._db is not None:
+ # only check for database mismatch if there is a defined database on the
+ # queryset
+ return dict(
+ (
+ _map_databases(value, key, queryset._db)
+ for key, value in expressions.items()
+ )
+ )
+ else:
+ return expressions


  def get(
@@ -114,7 +151,7 @@ def get(
  return qs.one()
  else:
  assert idlike is None # noqa: S101
- expressions = process_expressions(registry, expressions)
+ expressions = process_expressions(qs, expressions)
  return registry.objects.using(qs.db).get(**expressions)


@@ -282,6 +319,14 @@ class QuerySet(models.QuerySet):
  """Query a single record. Raises error if there are more or none."""
  return get(self, idlike, **expressions)

+ def filter(self, *queries, **expressions) -> QuerySet:
+ """Query a set of records."""
+ expressions = process_expressions(self, expressions)
+ if len(expressions) > 0:
+ return super().filter(*queries, **expressions)
+ else:
+ return self
+
  def one(self) -> Record:
  """Exactly one result. Raises error if there are more or none."""
  return one_helper(self)
@@ -309,7 +354,7 @@ class QuerySet(models.QuerySet):


  # -------------------------------------------------------------------------------------
- # CanValidate
+ # CanCurate
  # -------------------------------------------------------------------------------------


@@ -329,26 +374,26 @@ def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple:
  return _lookup(cls=self, field=field, **kwargs)


- @doc_args(CanValidate.validate.__doc__)
+ @doc_args(CanCurate.validate.__doc__)
  def validate(self, values: ListLike, field: str | StrField | None = None, **kwargs):
  """{}""" # noqa: D415
- from ._can_validate import _validate
+ from ._can_curate import _validate

  return _validate(cls=self, values=values, field=field, **kwargs)


- @doc_args(CanValidate.inspect.__doc__)
+ @doc_args(CanCurate.inspect.__doc__)
  def inspect(self, values: ListLike, field: str | StrField | None = None, **kwargs):
  """{}""" # noqa: D415
- from ._can_validate import _inspect
+ from ._can_curate import _inspect

  return _inspect(cls=self, values=values, field=field, **kwargs)


- @doc_args(CanValidate.standardize.__doc__)
+ @doc_args(CanCurate.standardize.__doc__)
  def standardize(self, values: Iterable, field: str | StrField | None = None, **kwargs):
  """{}""" # noqa: D415
- from ._can_validate import _standardize
+ from ._can_curate import _standardize

  return _standardize(cls=self, values=values, field=field, **kwargs)
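The new `QuerySet.filter` override routes keyword expressions through `process_expressions`, whose `_map_databases` helper rewrites filters that reference a `Record` stored in a different database so they match on `uid` instead. A minimal standalone sketch of that rewriting rule for the simple (non-`__in`) case; `FakeRecord` and `map_expression` are stand-ins for illustration, not lamindb names:

    from typing import Any

    class FakeRecord:
        """Stand-in for lnschema_core.Record, just enough for the sketch."""
        def __init__(self, uid: str, db: str):
            self.uid = uid
            self.db = db  # lamindb reads this from record._state.db

    def map_expression(key: str, value: Any, target_db: str) -> tuple[str, Any]:
        # mirrors the new _map_databases logic for a single record value
        if isinstance(value, FakeRecord) and value.db != target_db:
            return f"{key}__uid", value.uid
        return key, value

    cell_type = FakeRecord(uid="3JkPyVYn", db="laminlabs/cellxgene")
    print(map_expression("cell_types", cell_type, target_db="default"))
    # -> ('cell_types__uid', '3JkPyVYn')

When no keyword expressions remain after processing, the override returns the queryset itself instead of delegating to Django's `filter`.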