lamindb 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +2 -2
- lamindb/_finish.py +1 -1
- lamindb/_tracked.py +3 -15
- lamindb/core/_context.py +45 -19
- lamindb/curators/_legacy.py +1 -1
- lamindb/curators/core.py +51 -21
- lamindb/errors.py +6 -0
- lamindb/examples/datasets/_core.py +1 -1
- lamindb/integrations/__init__.py +0 -18
- lamindb/integrations/lightning.py +13 -10
- lamindb/migrations/0134_run_params.py +17 -0
- lamindb/migrations/{0133_squashed.py → 0134_squashed.py} +93 -90
- lamindb/models/_feature_manager.py +30 -20
- lamindb/models/_label_manager.py +3 -5
- lamindb/models/artifact.py +250 -291
- lamindb/models/artifact_set.py +4 -4
- lamindb/models/block.py +11 -9
- lamindb/models/can_curate.py +1 -1
- lamindb/models/collection.py +16 -17
- lamindb/models/feature.py +2 -2
- lamindb/models/has_parents.py +1 -3
- lamindb/models/query_manager.py +7 -7
- lamindb/models/query_set.py +38 -12
- lamindb/models/run.py +53 -49
- lamindb/models/schema.py +79 -65
- lamindb/models/sqlrecord.py +32 -17
- lamindb/models/transform.py +6 -3
- {lamindb-1.12.0.dist-info → lamindb-1.13.0.dist-info}/METADATA +26 -22
- {lamindb-1.12.0.dist-info → lamindb-1.13.0.dist-info}/RECORD +31 -30
- {lamindb-1.12.0.dist-info → lamindb-1.13.0.dist-info}/LICENSE +0 -0
- {lamindb-1.12.0.dist-info → lamindb-1.13.0.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-"""A data
+"""A data framework for biology.
 
 Data lineage
 ============
@@ -110,7 +110,7 @@ Backwards compatibility.
 
 # ruff: noqa: I001
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.12.0"
+__version__ = "1.13.0"
 
 import warnings as _warnings
 
lamindb/_finish.py
CHANGED
@@ -495,7 +495,7 @@ def save_context_core(
     )
 
     logger.important(
-        f"finished Run('{run.uid
+        f"finished Run('{run.uid}') after {formatted_run_time} at {format_field_value(run.finished_at)}"
     )
     if ln_setup.settings.instance.is_on_hub:
         instance_slug = ln_setup.settings.instance.slug
lamindb/_tracked.py
CHANGED
@@ -4,9 +4,8 @@ from contextvars import ContextVar
 from datetime import datetime, timezone
 from typing import Callable, ParamSpec, TypeVar
 
-from .core._context import context
+from .core._context import context, serialize_params_to_json
 from .models import Run, Transform
-from .models._feature_manager import infer_feature_type_convert_json
 
 P = ParamSpec("P")
 R = TypeVar("R")
@@ -92,26 +91,15 @@ def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]:
         run = Run(transform=transform, initiated_by_run=initiated_by_run)  # type: ignore
         run.started_at = datetime.now(timezone.utc)
         run._status_code = -1  # started
-        run.save()
 
         # Bind arguments to get a mapping of parameter names to values
         bound_args = sig.bind(*args, **kwargs)
         bound_args.apply_defaults()
         params = dict(bound_args.arguments)
 
-        # Remove the run parameter if it exists (we'll inject our own)
-        params.pop("run", None)
-
-        # Deal with non-trivial parameter values
-        filtered_params = {}
-        for key, value in params.items():
-            dtype, _, _ = infer_feature_type_convert_json(key, value)
-            if (dtype == "?" or dtype.startswith("cat")) and dtype != "cat ? str":
-                continue
-            filtered_params[key] = value
-
         # Add parameters to the run
-        run.
+        run.params = serialize_params_to_json(params)
+        run.save()
 
         # Set the run in context and execute function
         token = current_tracked_run.set(run)
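Note: the decorator now serializes bound arguments via `serialize_params_to_json` before the single `run.save()`. A minimal usage sketch (function and argument names are hypothetical):

    import lamindb as ln

    @ln.tracked()
    def subset(input_path: str, fraction: float = 0.5) -> None:
        ...

    subset("data.h5ad", fraction=0.1)  # bound arguments land in run.params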
lamindb/core/_context.py
CHANGED
@@ -23,7 +23,8 @@ from ..errors import (
     TrackNotCalled,
     UpdateContext,
 )
-from ..models import Run, Transform, format_field_value
+from ..models import Run, SQLRecord, Transform, format_field_value
+from ..models._feature_manager import infer_feature_type_convert_json
 from ..models._is_versioned import bump_version as bump_version_function
 from ..models._is_versioned import (
     increment_base62,
@@ -235,6 +236,22 @@ class LogStreamTracker:
         self.original_excepthook(exc_type, exc_value, exc_traceback)
 
 
+def serialize_params_to_json(params: dict) -> dict:
+    serialized_params = {}
+    for key, value in params.items():
+        if isinstance(value, SQLRecord):
+            value = f"{value.__class__.__get_name_with_module__()}[{value.uid}]"
+        else:
+            dtype, _, _ = infer_feature_type_convert_json(key, value)
+            if (dtype == "?" or dtype.startswith("cat")) and dtype != "cat ? str":
+                logger.warning(
+                    f"skipping param {key} because dtype not JSON serializable"
+                )
+                continue
+        serialized_params[key] = value
+    return serialized_params
+
+
 class Context:
     """Run context.
 
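Note: `serialize_params_to_json` keeps JSON-serializable values, replaces `SQLRecord` instances with a stable string reference, and warns about (and drops) values whose inferred dtype is not JSON-serializable. A sketch of the intended behavior, with a hypothetical artifact and uid:

    serialize_params_to_json({"fraction": 0.1, "input": artifact})
    # -> {"fraction": 0.1, "input": "lamindb.Artifact[kBkRYUz5MHTk]"}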
@@ -325,6 +342,7 @@ class Context:
         project: str | Project | None = None,
         space: str | Space | None = None,
         branch: str | Branch | None = None,
+        features: dict | None = None,
         params: dict | None = None,
         new_run: bool | None = None,
         path: str | None = None,
@@ -343,7 +361,8 @@ class Context:
                 Default: the `"all"` space. Note that bionty entities ignore this setting and always get written to the `"all"` space.
                 If you want to manually move entities to a different space, set the `.space` field (:doc:`docs:access`).
             branch: A branch (or its `name` or `uid`) on which to store records.
-
+            features: A dictionary of features & values to track for the run.
+            params: A dictionary of params & values to track for the run.
             new_run: If `False`, loads the latest run of transform
                 (default notebook), if `True`, creates new run (default non-notebook).
             path: Filepath of notebook or script. Only needed if it can't be
@@ -465,10 +484,14 @@ class Context:
             transform_exists = Transform.filter(id=transform.id).first()
             if transform_exists is None:
                 transform.save()
-                self._logging_message_track +=
+                self._logging_message_track += (
+                    f"created Transform('{transform.uid}', key='{transform.key}')"
+                )
                 transform_exists = transform
             else:
-                self._logging_message_track +=
+                self._logging_message_track += (
+                    f"loaded Transform('{transform.uid}', key='{transform.key}')"
+                )
             self._transform = transform_exists
 
         if new_run is None:  # for notebooks, default to loading latest runs
@@ -493,25 +516,26 @@ class Context:
         if run is not None:  # loaded latest run
             run.started_at = datetime.now(timezone.utc)  # update run time
             run._status_code = -2  # re-started
-            self._logging_message_track += f", re-started Run('{run.uid
+            self._logging_message_track += f", re-started Run('{run.uid}') at {format_field_value(run.started_at)}"
 
         if run is None:  # create new run
-            run = Run(
-                transform=self._transform,
-                params=params,
-            )
+            run = Run(transform=self._transform)
             run.started_at = datetime.now(timezone.utc)
             run._status_code = -1  # started
-            self._logging_message_track += f", started new Run('{run.uid
+            self._logging_message_track += f", started new Run('{run.uid}') at {format_field_value(run.started_at)}"
         # can only determine at ln.finish() if run was consecutive in
         # interactive session, otherwise, is consecutive
         run.is_consecutive = True if is_run_from_ipython else None
-        # need to save in all cases
-        run.save()
         if params is not None:
-            run.
+            run.params = serialize_params_to_json(params)
             self._logging_message_track += "\n→ params: " + ", ".join(
-                f"{key}={value}" for key, value in params.items()
+                f"{key}={value}" for key, value in run.params.items()
+            )
+        run.save()  # need to save now
+        if features is not None:
+            run.features.add_values(features)
+            self._logging_message_track += "\n→ features: " + ", ".join(
+                f"{key}={value}" for key, value in features.items()
             )
         self._run = run
         track_python_environment(run)
@@ -835,7 +859,9 @@ class Context:
                 reference_type=transform_ref_type,
                 type=transform_type,
             ).save()
-            self._logging_message_track +=
+            self._logging_message_track += (
+                f"created Transform('{transform.uid}', key='{transform.key}')"
+            )
         else:
             uid = transform.uid
             # transform was already saved via `finish()`
@@ -874,9 +900,7 @@ class Context:
                     if transform_hash != transform.hash:
                         bump_revision = True
                     else:
-                        self._logging_message_track += (
-                            f"loaded Transform('{transform.uid}')"
-                        )
+                        self._logging_message_track += f"loaded Transform('{transform.uid}', key='{transform.key}')"
                 if bump_revision:
                     change_type = (
                         "re-running notebook with already-saved source code"
@@ -890,7 +914,9 @@ class Context:
                         f'✗ {change_type}, please update the `uid` argument in `track()` to "{uid[:-4]}{increment_base62(uid[-4:])}"'
                     )
                 else:
-                    self._logging_message_track +=
+                    self._logging_message_track += (
+                        f"loaded Transform('{transform.uid}', key='{transform.key}')"
+                    )
         self._transform = transform
 
     def _finish(self, ignore_non_consecutive: None | bool = None) -> None:
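Note: `track()` now accepts both `params` (serialized onto the new `Run.params` JSON field) and `features` (linked via `run.features.add_values`, which validates against registered features). A minimal usage sketch with hypothetical keys and values:

    import lamindb as ln

    ln.track(
        params={"learning_rate": 0.01, "downsample": True},
        features={"experiment": "EXP0001"},  # requires a registered Feature 'experiment'
    )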
lamindb/curators/_legacy.py
CHANGED
lamindb/curators/core.py
CHANGED
@@ -289,13 +289,7 @@ class Curator:
 
         artifact_info = ""
         if self._artifact is not None:
-
-            short_uid = (
-                str(artifact_uid)[:8] + "..."
-                if len(str(artifact_uid)) > 8
-                else str(artifact_uid)
-            )
-            artifact_info = f", artifact: {colors.italic(short_uid)}"
+            artifact_info = f", artifact: {colors.italic(self._artifact.uid)}"
 
         return (
             f"{cls_name}{artifact_info}(Schema: {schema_str}{extra_info}{status_str})"
@@ -337,7 +331,7 @@ class SlotsCurator(Curator):
     def validate(self) -> None:
         """{}"""  # noqa: D415
         for slot, curator in self._slots.items():
-            logger.
+            logger.debug(f"validating slot {slot} ...")
             curator.validate()
         # set _is_validated to True as no slot raised an error
         self._is_validated = True
@@ -403,6 +397,16 @@ class SlotsCurator(Curator):
         )
 
 
+def convert_dict_to_dataframe_for_validation(d: dict, schema: Schema) -> pd.DataFrame:
+    """Convert a dictionary to a DataFrame for validation against a schema."""
+    df = pd.DataFrame([d])
+    for feature in schema.members:
+        if feature.dtype.startswith("cat"):
+            if feature.name in df.columns:
+                df[feature.name] = pd.Categorical(df[feature.name])
+    return df
+
+
 # This is also currently used as DictCurator by flattening dictionaries into wide DataFrames.
 # Such an approach was never intended and there is room for a DictCurator in the future.
 # For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and
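Note: the helper wraps a single dict into a one-row DataFrame and casts categorical features so validation sees the expected dtypes. A sketch assuming a schema with one categorical feature (names are hypothetical):

    import lamindb as ln

    feature = ln.Feature(name="perturbation", dtype="cat[ULabel]").save()
    schema = ln.Schema(features=[feature]).save()
    df = convert_dict_to_dataframe_for_validation({"perturbation": "DMSO"}, schema)
    # one-row DataFrame whose 'perturbation' column is categorical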
@@ -702,13 +706,11 @@ class DataFrameCurator(SlotsCurator):
     ) -> None:
         super().__init__(dataset=dataset, schema=schema)
 
-
-
-
-
-
-            slot=slot,
-        )
+        self._atomic_curator = ComponentCurator(
+            dataset=dataset,
+            schema=schema,
+            slot=slot,
+        )
 
         # Handle (nested) attrs
         if slot is None and schema.slots:
@@ -724,11 +726,11 @@ class DataFrameCurator(SlotsCurator):
                 data = _resolve_schema_slot_path(
                     attrs_dict, deeper_keys, slot_name, "attrs"
                 )
-                df =
+                df = convert_dict_to_dataframe_for_validation(data, slot_schema)
                 self._slots[slot_name] = ComponentCurator(
                     df, slot_schema, slot=slot_name
                 )
-
+            elif slot_name != "__external__":
                 raise ValueError(
                     f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'."
                 )
@@ -783,6 +785,26 @@ class DataFrameCurator(SlotsCurator):
         )
 
 
+class ExperimentalDictCurator(DataFrameCurator):
+    """Curator for `dict` based on `DataFrameCurator`."""
+
+    def __init__(
+        self,
+        dataset: dict | Artifact,
+        schema: Schema,
+        slot: str | None = None,
+    ) -> None:
+        if not isinstance(dataset, dict) and not isinstance(dataset, Artifact):
+            raise InvalidArgument("The dataset must be a dict or dict-like artifact.")
+        if isinstance(dataset, Artifact):
+            assert dataset.otype == "dict", "Artifact must be of otype 'dict'."  # noqa: S101
+            d = dataset.load(is_run_input=False)
+        else:
+            d = dataset
+        df = convert_dict_to_dataframe_for_validation(d, schema)
+        super().__init__(df, schema, slot=slot)
+
+
 def _resolve_schema_slot_path(
     target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str
 ) -> Any:
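Note: a minimal usage sketch for the experimental curator, reusing the hypothetical schema from above:

    from lamindb.curators.core import ExperimentalDictCurator

    curator = ExperimentalDictCurator({"perturbation": "DMSO"}, schema)
    curator.validate()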
@@ -803,13 +825,18 @@ def _resolve_schema_slot_path(
         base_path += f"['{key}']"
         try:
             current = current[key]
-        except
+        except (
+            KeyError,
+            TypeError,
+        ):  # if not a dict, raises TypeError; if a dict and key not found, raises KeyError
             available = (
-                list(current.keys())
+                list(current.keys())
+                if isinstance(current, dict)
+                else "none (not a dict)"
             )
             raise InvalidArgument(
                 f"Schema slot '{slot}' requires keys {base_path} but key '{key}' "
-                f"not found. Available keys at this level: {available}"
+                f"not found. Available keys at this level: {available}."
             ) from None
 
     return current
@@ -1478,7 +1505,10 @@ class CatVector:
         if type_record is not None:
             # if subtype_str is set, we need to set the type for new records
             init_kwargs["type"] = type_record
-
+        # here we create non-validated records skipping validation since we already ensured that they don't exist
+        non_validated_records.append(
+            registry(**init_kwargs, **create_kwargs, _skip_validation=True)
+        )
     if len(non_validated_records) > 0:
         ln_save(non_validated_records)
     model_field = colors.italic(registry.__get_name_with_module__())
lamindb/errors.py
CHANGED
@@ -52,6 +52,12 @@ class UnknownStorageLocation(Exception):
     pass
 
 
+class NoStorageLocationForSpace(Exception):
+    """No storage location found for space."""
+
+    pass
+
+
 # equivalent to Django's DoesNotExist
 # and SQLAlchemy's NoResultFound
 class DoesNotExist(Exception):
lamindb/examples/datasets/_core.py
CHANGED
@@ -267,7 +267,7 @@ def anndata_file_pbmc68k_test() -> Path:
 
     To reproduce::
 
-        pbmc68k = ln.
+        pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced()
         pbmc68k_test = pbmc68k[:30, :200].copy()
         pbmc68k_test.raw = pbmc68k_test[:, :100]
         pbmc68k_test.obsp["test"] = sparse.eye(pbmc68k_test.shape[0], format="csr")
lamindb/integrations/__init__.py
CHANGED
@@ -9,29 +9,11 @@
    lightning
 """
 
-from typing import Any
-
-
-def __getattr__(attr_name: str) -> Any:
-    # Defers import until accessed to avoid requiring PyTorch Lightning
-    if attr_name == "lightning":
-        from lamindb.integrations import lightning
-
-        return lightning
-    raise AttributeError(f"module has no attribute {attr_name!r}")
-
-
 from lamindb.core.storage import save_tiledbsoma_experiment
 
 from ._croissant import curate_from_croissant
 from ._vitessce import save_vitessce_config
 
-
-def __dir__():
-    # Makes lazy imports discoverable to dir() to enable autocomplete including lazy modules
-    return __all__
-
-
 __all__ = [
     "lightning",
     "save_tiledbsoma_experiment",
lamindb/integrations/lightning.py
CHANGED
@@ -1,4 +1,4 @@
-"""PyTorch Lightning
+"""PyTorch Lightning.
 
 .. autosummary::
    :toctree: .
@@ -20,21 +20,24 @@ class Callback(pl.Callback):
 
     Creates version families of artifacts for given `key` (relative file path).
 
+    See also: :doc:`docs:mlflow` & :doc:`docs:wandb`.
+
     Args:
-        path:
-        key:
-        features:
+        path: A local path to the checkpoint.
+        key: The `key` for the checkpoint artifact.
+        features: Features to annotate the checkpoint.
 
     Examples:
 
-        Create a callback
+        Create a callback that creates artifacts for checkpoints and annotates them by the MLflow run ID::
 
-
-
-
-
-
+            import lightning as pl
+            from lamindb.integrations import lightning as ll
+
+            lamindb_callback = ll.Callback(
+                path=checkpoint_filename, key=artifact_key, features={"mlflow_run_id": mlflow_run.info.run_id}
             )
+            trainer = pl.Trainer(callbacks=[lamindb_callback])
     """
 
     def __init__(
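Note: an end-to-end sketch of wiring the callback into a training loop with MLflow; model, datamodule, and paths are hypothetical:

    import lightning as pl
    import mlflow
    from lamindb.integrations import lightning as ll

    with mlflow.start_run() as mlflow_run:
        callback = ll.Callback(
            path="checkpoints/last.ckpt",  # local checkpoint path written by the trainer
            key="models/my_model.ckpt",  # artifact key in lamindb storage
            features={"mlflow_run_id": mlflow_run.info.run_id},
        )
        trainer = pl.Trainer(callbacks=[callback])
        trainer.fit(model, datamodule=datamodule)  # model & datamodule assumed defined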
lamindb/migrations/0134_run_params.py
ADDED
@@ -0,0 +1,17 @@
+# Generated by Django 5.2 on 2025-10-13 07:42
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("lamindb", "0133_artifactuser_artifact_users"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="run",
+            name="params",
+            field=models.JSONField(null=True),
+        ),
+    ]
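Note: with `params` stored as a plain `JSONField` on `Run`, Django-style JSON lookups should work through lamindb's query API; a sketch with a hypothetical param key:

    import lamindb as ln

    # assuming runs were tracked with params={"learning_rate": ...}
    runs = ln.Run.filter(params__learning_rate=0.01)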