lamindb 1.10.2__py3-none-any.whl → 1.11a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +14 -12
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +2 -2
  8. lamindb/core/storage/_anndata_accessor.py +29 -9
  9. lamindb/curators/_legacy.py +16 -3
  10. lamindb/curators/core.py +432 -186
  11. lamindb/examples/cellxgene/__init__.py +8 -3
  12. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  13. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  14. lamindb/examples/croissant/__init__.py +12 -2
  15. lamindb/examples/datasets/__init__.py +2 -2
  16. lamindb/examples/datasets/_core.py +1 -1
  17. lamindb/examples/datasets/_small.py +66 -22
  18. lamindb/examples/datasets/mini_immuno.py +1 -0
  19. lamindb/migrations/0119_squashed.py +5 -2
  20. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  21. lamindb/migrations/0121_recorduser.py +53 -0
  22. lamindb/models/__init__.py +3 -1
  23. lamindb/models/_describe.py +2 -2
  24. lamindb/models/_feature_manager.py +53 -53
  25. lamindb/models/_from_values.py +2 -2
  26. lamindb/models/_is_versioned.py +4 -4
  27. lamindb/models/_label_manager.py +4 -4
  28. lamindb/models/artifact.py +305 -116
  29. lamindb/models/artifact_set.py +36 -1
  30. lamindb/models/can_curate.py +1 -2
  31. lamindb/models/collection.py +3 -34
  32. lamindb/models/feature.py +111 -7
  33. lamindb/models/has_parents.py +11 -11
  34. lamindb/models/project.py +18 -0
  35. lamindb/models/query_manager.py +16 -7
  36. lamindb/models/query_set.py +59 -34
  37. lamindb/models/record.py +25 -4
  38. lamindb/models/run.py +8 -6
  39. lamindb/models/schema.py +54 -26
  40. lamindb/models/sqlrecord.py +123 -25
  41. lamindb/models/storage.py +59 -14
  42. lamindb/models/transform.py +17 -17
  43. lamindb/models/ulabel.py +6 -1
  44. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/METADATA +4 -5
  45. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/RECORD +47 -44
  46. {lamindb-1.10.2.dist-info → lamindb-1.11a1.dist-info}/WHEEL +1 -1
  47. {lamindb-1.10.2.dist-info/licenses → lamindb-1.11a1.dist-info}/LICENSE +0 -0
lamindb/__init__.py CHANGED
@@ -83,7 +83,7 @@ Curators and integrations.
83
83
  curators
84
84
  integrations
85
85
 
86
- Low-level functionality.
86
+ Examples, errors, and setup.
87
87
 
88
88
  .. autosummary::
89
89
  :toctree: .
@@ -91,6 +91,12 @@ Low-level functionality.
91
91
  examples
92
92
  errors
93
93
  setup
94
+
95
+ Low-level functionality.
96
+
97
+ .. autosummary::
98
+ :toctree: .
99
+
94
100
  base
95
101
  core
96
102
  models
@@ -108,63 +114,97 @@ Backwards compatibility.
108
114
 
109
115
  # ruff: noqa: I001
110
116
  # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
111
- __version__ = "1.10.2"
117
+ __version__ = "1.11a1"
112
118
 
113
- import warnings
119
+ import warnings as _warnings
114
120
 
115
121
  # through SpatialData
116
- warnings.filterwarnings(
122
+ _warnings.filterwarnings(
117
123
  "ignore", message="The legacy Dask DataFrame implementation is deprecated"
118
124
  )
119
125
 
120
- from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
121
126
  from lamindb_setup._check_setup import _check_instance_setup
122
127
  from lamindb_setup._connect_instance import connect
123
128
  from lamindb_setup.core.upath import UPath
124
129
 
125
130
  from . import base, errors, setup
126
131
 
127
-
128
- def __getattr__(name):
129
- raise _InstanceNotSetupError()
130
-
131
-
132
- if _check_instance_setup(from_module="lamindb"):
133
- del __getattr__ # so that imports work out
134
- from . import base
135
- from ._tracked import tracked
136
- from ._view import view
137
- from .core._context import context
138
- from .core._settings import settings
139
- from .curators._legacy import CatManager as Curator
140
- from .models import (
141
- Artifact,
142
- Collection,
143
- Feature,
144
- FeatureSet, # backward compat
145
- Person,
146
- Project,
147
- Reference,
148
- Run,
149
- Schema,
150
- Storage,
151
- Transform,
152
- ULabel,
153
- User,
154
- Space,
155
- Branch,
156
- Record,
157
- )
158
- from .models.save import save
159
- from . import core
160
- from . import integrations
161
- from . import curators
162
- from . import examples
163
-
164
- track = context._track
165
- finish = context._finish
166
- settings.__doc__ = """Global live settings (:class:`~lamindb.core.Settings`)."""
167
- context.__doc__ = """Global run context (:class:`~lamindb.core.Context`)."""
168
- from django.db.models import Q
169
-
170
- Param = Feature # backward compat
132
+ _check_instance_setup(from_module="lamindb")
133
+
134
+ from ._tracked import tracked
135
+ from ._view import view
136
+ from .core._context import context
137
+ from .core._settings import settings
138
+ from .curators._legacy import CatManager as Curator
139
+ from .models import (
140
+ Artifact,
141
+ Collection,
142
+ Feature,
143
+ FeatureSet, # backward compat
144
+ Person,
145
+ Project,
146
+ Reference,
147
+ Run,
148
+ Schema,
149
+ Storage,
150
+ Transform,
151
+ ULabel,
152
+ User,
153
+ Space,
154
+ Branch,
155
+ Record,
156
+ )
157
+ from .models.save import save
158
+ from . import core
159
+ from . import integrations
160
+ from . import curators
161
+ from . import examples
162
+
163
+ track = context._track
164
+ finish = context._finish
165
+ settings.__doc__ = """Global live settings (:class:`~lamindb.core.Settings`)."""
166
+ context.__doc__ = """Global run context (:class:`~lamindb.core.Context`)."""
167
+ from django.db.models import Q
168
+
169
+ Param = Feature # backward compat
170
+
171
+ __all__ = [
172
+ # data lineage
173
+ "track",
174
+ "finish",
175
+ "tracked",
176
+ # registries
177
+ "Artifact",
178
+ "Storage",
179
+ "Transform",
180
+ "Run",
181
+ "Feature",
182
+ "ULabel",
183
+ "Schema",
184
+ "Record",
185
+ "User",
186
+ "Collection",
187
+ "Project",
188
+ "Space",
189
+ "Branch",
190
+ "Reference",
191
+ "Person",
192
+ # other
193
+ "connect",
194
+ "view",
195
+ "save",
196
+ "UPath",
197
+ "settings",
198
+ "context",
199
+ # curators and integrations
200
+ "curators",
201
+ "integrations",
202
+ # examples, errors, setup
203
+ "examples",
204
+ "errors",
205
+ "setup",
206
+ # low-level functionality
207
+ "base",
208
+ "core",
209
+ "models",
210
+ ]
lamindb/_finish.py CHANGED
@@ -264,12 +264,14 @@ def save_context_core(
264
264
  if (
265
265
  is_run_from_ipython and notebook_runner != "nbconvert" and filepath.exists()
266
266
  ): # python notebooks in interactive session
267
- import nbproject
268
-
269
- # it might be that the user modifies the title just before ln.finish()
270
- if (nbproject_title := nbproject.meta.live.title) != transform.description:
271
- transform.description = nbproject_title
272
- transform.save()
267
+ if is_ipynb:
268
+ # ignore this for py:percent notebooks
269
+ import nbproject
270
+
271
+ # it might be that the user modifies the title just before ln.finish()
272
+ if (nbproject_title := nbproject.meta.live.title) != transform.description:
273
+ transform.description = nbproject_title
274
+ transform.save()
273
275
  if not ln_setup._TESTING:
274
276
  save_source_code_and_report = check_filepath_recently_saved(
275
277
  filepath, is_retry
@@ -349,7 +351,7 @@ def save_context_core(
349
351
  if transform_hash != transform.hash:
350
352
  response = input(
351
353
  f"You are about to overwrite existing source code (hash '{transform.hash}') for Transform('{transform.uid}')."
352
- f" Proceed? (y/n)"
354
+ f" Proceed? (y/n) "
353
355
  )
354
356
  if response == "y":
355
357
  transform.source_code = source_code_path.read_text()
@@ -365,11 +367,11 @@ def save_context_core(
365
367
 
366
368
  if run is not None:
367
369
  base_path = ln_setup.settings.cache_dir / "environments" / f"run_{run.uid}"
368
- paths = [base_path / "run_env_pip.txt", base_path / "r_pak_lockfile.json"]
370
+ paths = [base_path / "run_env_pip.txt", base_path / "r_environment.txt"]
369
371
  existing_paths = [path for path in paths if path.exists()]
370
372
  if len(existing_paths) == 2:
371
373
  # let's not store the python environment for an R session for now
372
- existing_paths = [base_path / "r_pak_lockfile.json"]
374
+ existing_paths = [base_path / "r_environment.txt"]
373
375
 
374
376
  if existing_paths:
375
377
  overwrite_env = True
@@ -387,8 +389,8 @@ def save_context_core(
387
389
  if len(existing_paths) == 1:
388
390
  if existing_paths[0].name == "run_env_pip.txt":
389
391
  description = "requirements.txt"
390
- elif existing_paths[0].name == "r_pak_lockfile.json":
391
- description = "r_pak_lockfile.json"
392
+ elif existing_paths[0].name == "r_environment.txt":
393
+ description = "r_environment.txt"
392
394
  env_hash, _ = hash_file(artifact_path)
393
395
  else:
394
396
  description = "environments"
@@ -432,7 +434,7 @@ def save_context_core(
432
434
  hash, _ = hash_file(report_path) # ignore hash_type for now
433
435
  if hash != run.report.hash:
434
436
  response = input(
435
- f"You are about to overwrite an existing report (hash '{run.report.hash}') for Run('{run.uid}'). Proceed? (y/n)"
437
+ f"You are about to overwrite an existing report (hash '{run.report.hash}') for Run('{run.uid}'). Proceed? (y/n) "
436
438
  )
437
439
  if response == "y":
438
440
  run.report.replace(report_path)
lamindb/_tracked.py CHANGED
@@ -52,7 +52,7 @@ def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]
52
52
  artifact = ln.Artifact.get(key=input_artifact_key)
53
53
  df = artifact.load() # auto-tracked as input
54
54
  new_df = df.iloc[:subset_rows, :subset_cols]
55
- ln.Artifact.from_df(new_df, key=output_artifact_key).save() # auto-tracked as output
55
+ ln.Artifact.from_dataframe(new_df, key=output_artifact_key).save() # auto-tracked as output
56
56
  """
57
57
 
58
58
  def decorator_tracked(func: Callable[P, R]) -> Callable[P, R]:
@@ -104,9 +104,7 @@ def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]
104
104
  # Deal with non-trivial parameter values
105
105
  filtered_params = {}
106
106
  for key, value in params.items():
107
- dtype, _, _ = infer_feature_type_convert_json(
108
- key, value, str_as_ulabel=False
109
- )
107
+ dtype, _, _ = infer_feature_type_convert_json(key, value)
110
108
  if (dtype == "?" or dtype.startswith("cat")) and dtype != "cat ? str":
111
109
  continue
112
110
  filtered_params[key] = value
lamindb/_view.py CHANGED
@@ -162,7 +162,7 @@ def view(
162
162
  logger.print(section)
163
163
  logger.print("*" * len(section_no_color))
164
164
  for registry in sorted(filtered_registries, key=lambda x: x.__name__):
165
- df = registry.df(limit=limit)
165
+ df = registry.to_dataframe(limit=limit)
166
166
  if df.shape[0] > 0:
167
167
  logger.print(colors.blue(colors.bold(registry.__name__)))
168
168
  show(df)
lamindb/base/__init__.py CHANGED
@@ -10,6 +10,7 @@ Modules:
10
10
  uids
11
11
  types
12
12
  fields
13
+ dtypes
13
14
 
14
15
  Utils:
15
16
 
@@ -23,4 +24,4 @@ Utils:
23
24
 
24
25
  from lamindb_setup.core import deprecated, doc_args
25
26
 
26
- from . import fields, types, uids
27
+ from . import dtypes, fields, types, uids
lamindb/base/dtypes.py ADDED
@@ -0,0 +1,76 @@
1
+ from datetime import datetime
2
+ from typing import Any, Callable, Iterable
3
+
4
+ import pandas as pd
5
+
6
+
7
+ def is_list_of_type(value: Any, expected_type: Any) -> bool:
8
+ """Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure."""
9
+ if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
10
+ # handle nested lists recursively
11
+ return all(is_list_of_type(item, expected_type) for item in value)
12
+ return isinstance(value, expected_type)
13
+
14
+
15
+ def check_dtype(expected_type: Any) -> Callable:
16
+ """Creates a check function for Pandera that validates a column's dtype.
17
+
18
+ Supports both standard dtype checking and mixed list/single values for the same type.
19
+ For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
20
+
21
+ Args:
22
+ expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
23
+
24
+ Returns:
25
+ A function that checks if a series has the expected dtype or contains mixed types
26
+ """
27
+
28
+ def check_function(series):
29
+ # first check if the series is entirely of the expected dtype (fast path)
30
+ if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
31
+ return True
32
+ elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
33
+ return True
34
+ elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
35
+ return True
36
+ elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
37
+ return True
38
+ elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
39
+ return True
40
+
41
+ # if we're here, it might be a mixed column with object dtype
42
+ # need to check each value individually
43
+ if series.dtype == "object" and expected_type.startswith("list"):
44
+ expected_type_member = expected_type.replace("list[", "").removesuffix("]")
45
+ if expected_type_member == "int":
46
+ return series.apply(lambda x: is_list_of_type(x, int)).all()
47
+ elif expected_type_member == "float":
48
+ return series.apply(lambda x: is_list_of_type(x, float)).all()
49
+ elif expected_type_member == "num":
50
+ # for numeric, accept either int or float
51
+ return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
52
+ elif (
53
+ expected_type_member == "str"
54
+ or expected_type_member == "path"
55
+ or expected_type_member.startswith("cat[")
56
+ ):
57
+ return series.apply(lambda x: is_list_of_type(x, str)).all()
58
+
59
+ # if we get here, the validation failed
60
+ return False
61
+
62
+ return check_function
63
+
64
+
65
+ def is_valid_datetime_str(date_string: str) -> bool | str:
66
+ try:
67
+ dt = datetime.fromisoformat(date_string)
68
+ return dt.isoformat()
69
+ except ValueError:
70
+ return False
71
+
72
+
73
+ def is_iterable_of_sqlrecord(value: Any):
74
+ from lamindb.models import SQLRecord
75
+
76
+ return isinstance(value, Iterable) and isinstance(next(iter(value)), SQLRecord)
lamindb/core/_settings.py CHANGED
@@ -206,7 +206,7 @@ class Settings:
206
206
  exists = ln.Storage.filter(root=ssettings.root_as_str).one_or_none()
207
207
  if exists is None:
208
208
  response = input(
209
- f"Storage location {ssettings.root_as_str} does not yet exist. Do you want to continue with creating it? (y/n)"
209
+ f"Storage location {ssettings.root_as_str} does not yet exist. Do you want to continue with creating it? (y/n) "
210
210
  )
211
211
  # logger.warning(f"deprecated call because storage location does **not yet** exist; going forward, please create through ln.Storage(root={path}).save() going forward")
212
212
  if response != "y":
@@ -256,7 +256,7 @@ class Settings:
256
256
  exists = ln.Storage.filter(root=ssettings.root_as_str).one_or_none()
257
257
  if exists is None:
258
258
  response = input(
259
- f"Storage location {ssettings.root_as_str} does not yet exist. Do you want to continue with creating it? (y/n)"
259
+ f"Storage location {ssettings.root_as_str} does not yet exist. Do you want to continue with creating it? (y/n) "
260
260
  )
261
261
  # logger.warning(f"deprecated call because storage location does **not yet** exist; going forward, please create through ln.Storage(root={path}).save() going forward")
262
262
  if response != "y":
@@ -295,7 +295,7 @@ except ImportError:
295
295
  if ZARR_INSTALLED:
296
296
  from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr
297
297
 
298
- from ._zarr import get_zarr_store
298
+ from ._zarr import IS_ZARR_V3, get_zarr_store
299
299
 
300
300
  ArrayTypes.append(zarr.Array)
301
301
  GroupTypes.append(zarr.Group)
@@ -306,11 +306,17 @@ if ZARR_INSTALLED:
306
306
  assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!" # noqa: S101
307
307
 
308
308
  store = get_zarr_store(filepath)
309
- storage = zarr.open(store, mode=mode)
309
+ kwargs = {}
310
+ if IS_ZARR_V3 and mode != "r":
311
+ # otherwise unable to write
312
+ kwargs["use_consolidated"] = False
313
+ storage = zarr.open(store, mode=mode, **kwargs)
310
314
  # zarr v2 re-initializes the mapper
311
315
  # we need to put back the correct one
312
316
  # S3FSMap is returned from get_zarr_store only for zarr v2
313
317
  if isinstance(store, S3FSMap):
318
+ assert not IS_ZARR_V3 # noqa: S101
319
+
314
320
  storage.store.map = store
315
321
  conn = None
316
322
  return conn, storage
@@ -363,10 +369,10 @@ if ZARR_INSTALLED:
363
369
  # this is needed because accessing zarr.Group.keys() directly is very slow
364
370
  @registry.register("zarr")
365
371
  def keys(storage: zarr.Group):
366
- if hasattr(storage, "_sync_iter"): # zarr v3
372
+ if IS_ZARR_V3:
367
373
  paths = storage._sync_iter(storage.store.list())
368
374
  else:
369
- paths = storage.store.keys() # zarr v2
375
+ paths = storage.store.keys()
370
376
 
371
377
  attrs_keys: dict[str, list] = {}
372
378
  obs_var_arrays = []
@@ -748,22 +754,36 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
748
754
 
749
755
  def close(self):
750
756
  """Closes the connection."""
751
- if hasattr(self, "storage") and hasattr(self.storage, "close"):
752
- self.storage.close()
753
- if hasattr(self, "_conn") and hasattr(self._conn, "close"):
754
- self._conn.close()
755
- self._closed = True
757
+ storage = self.storage
758
+ connection = self._conn
756
759
 
757
760
  if self._updated and (artifact := self._artifact) is not None:
758
761
  from lamindb.models.artifact import Artifact
759
762
  from lamindb.models.sqlrecord import init_self_from_db
760
763
 
764
+ # now self._updated can only be True for zarr
765
+ assert ZARR_INSTALLED # noqa: S101
766
+
767
+ store = storage.store
768
+ keys = storage._sync_iter(store.list()) if IS_ZARR_V3 else store.keys()
769
+ # this checks that there consolidated metadata was written before
770
+ # need to update it
771
+ # zmetadata is in spatialdata sometimes for some reason
772
+ if ".zmetadata" in keys or "zmetadata" in keys:
773
+ zarr.consolidate_metadata(store)
774
+
761
775
  new_version = Artifact(
762
776
  artifact.path, revises=artifact, _is_internal_call=True
763
777
  ).save()
764
778
  # note: sets _state.db = "default"
765
779
  init_self_from_db(artifact, new_version)
766
780
 
781
+ if hasattr(storage, "close"):
782
+ storage.close()
783
+ if hasattr(connection, "close"):
784
+ connection.close()
785
+ self._closed = True
786
+
767
787
  @property
768
788
  def closed(self):
769
789
  return self._closed
@@ -133,7 +133,7 @@ class CatManager:
133
133
 
134
134
  if self._artifact is None:
135
135
  if isinstance(self._dataset, pd.DataFrame):
136
- artifact = Artifact.from_df(
136
+ artifact = Artifact.from_dataframe(
137
137
  self._dataset,
138
138
  key=key,
139
139
  description=description,
@@ -1275,7 +1275,7 @@ class TiledbsomaCatManager(CatManager):
1275
1275
  empty_dict, schema=self._obs_pa_schema
1276
1276
  ).to_pandas()
1277
1277
  # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
1278
- feature_sets["obs"] = Schema.from_df(
1278
+ feature_sets["obs"] = Schema.from_dataframe(
1279
1279
  df=mock_df,
1280
1280
  field=self._columns_field,
1281
1281
  mute=True,
@@ -1367,7 +1367,7 @@ def legacy_annotate_artifact(
1367
1367
 
1368
1368
 
1369
1369
  @classmethod # type: ignore
1370
- def from_df(
1370
+ def from_dataframe(
1371
1371
  cls,
1372
1372
  df: pd.DataFrame,
1373
1373
  categoricals: dict[str, FieldAttr] | None = None,
@@ -1383,6 +1383,18 @@ def from_df(
1383
1383
  )
1384
1384
 
1385
1385
 
1386
+ @classmethod # type: ignore
1387
+ @deprecated("from_dataframe")
1388
+ def from_df(
1389
+ cls,
1390
+ df: pd.DataFrame,
1391
+ categoricals: dict[str, FieldAttr] | None = None,
1392
+ columns: FieldAttr = Feature.name,
1393
+ organism: str | None = None,
1394
+ ) -> DataFrameCatManager:
1395
+ return cls.from_dataframe(df, categoricals, columns, organism)
1396
+
1397
+
1386
1398
  @classmethod # type: ignore
1387
1399
  def from_anndata(
1388
1400
  cls,
@@ -1468,6 +1480,7 @@ def from_spatialdata(
1468
1480
  )
1469
1481
 
1470
1482
 
1483
+ CatManager.from_dataframe = from_dataframe # type: ignore
1471
1484
  CatManager.from_df = from_df # type: ignore
1472
1485
  CatManager.from_anndata = from_anndata # type: ignore
1473
1486
  CatManager.from_mudata = from_mudata # type: ignore