lamindb 1.0.5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. lamindb/__init__.py +14 -5
  2. lamindb/_artifact.py +150 -53
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +85 -51
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +12 -6
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +59 -17
  10. lamindb/_record.py +171 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +33 -10
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +106 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/validation.py +2 -6
  19. lamindb/core/__init__.py +13 -14
  20. lamindb/core/_context.py +7 -7
  21. lamindb/core/_data.py +29 -25
  22. lamindb/core/_describe.py +1 -1
  23. lamindb/core/_django.py +1 -1
  24. lamindb/core/_feature_manager.py +53 -43
  25. lamindb/core/_label_manager.py +4 -4
  26. lamindb/core/_mapped_collection.py +20 -7
  27. lamindb/core/datasets/__init__.py +6 -1
  28. lamindb/core/datasets/_core.py +12 -11
  29. lamindb/core/datasets/_small.py +66 -20
  30. lamindb/core/exceptions.py +1 -90
  31. lamindb/core/loaders.py +6 -12
  32. lamindb/core/relations.py +6 -4
  33. lamindb/core/storage/_anndata_accessor.py +41 -0
  34. lamindb/core/storage/_backed_access.py +2 -2
  35. lamindb/core/storage/_pyarrow_dataset.py +25 -15
  36. lamindb/core/storage/_tiledbsoma.py +56 -12
  37. lamindb/core/storage/paths.py +27 -21
  38. lamindb/core/subsettings/_creation_settings.py +4 -16
  39. lamindb/curators/__init__.py +2168 -833
  40. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  41. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  42. lamindb/errors.py +96 -0
  43. lamindb/integrations/_vitessce.py +3 -3
  44. lamindb/migrations/0069_squashed.py +76 -75
  45. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  46. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  47. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  48. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  49. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  50. lamindb/migrations/0086_various.py +95 -0
  51. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  52. lamindb/migrations/0088_schema_components.py +273 -0
  53. lamindb/migrations/0088_squashed.py +4372 -0
  54. lamindb/models.py +420 -153
  55. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/METADATA +9 -7
  56. lamindb-1.1.0.dist-info/RECORD +95 -0
  57. lamindb/curators/_spatial.py +0 -528
  58. lamindb/migrations/0052_squashed.py +0 -1261
  59. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  60. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  61. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  62. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  63. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  64. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  65. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  66. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  67. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  68. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  69. lamindb/migrations/0063_populate_latest_field.py +0 -45
  70. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  71. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  72. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  73. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  74. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  75. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  76. lamindb-1.0.5.dist-info/RECORD +0 -102
  77. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
  78. {lamindb-1.0.5.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/core/datasets/_small.py CHANGED
@@ -8,20 +8,25 @@ import pandas as pd
 
 
 def small_dataset1(
-    format: Literal["df", "anndata"],
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
     with_typo: bool = False,
-) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+) -> pd.DataFrame | ad.AnnData:
     # define the data in the dataset
     # it's a mix of numerical measurements and observation-level metadata
     ifng = "IFNJ" if with_typo else "IFNG"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
     dataset_dict = {
-        "CD8A": [1, 2, 3],
-        "CD4": [3, 4, 5],
-        "CD14": [5, 6, 7],
-        "cell_medium": ["DMSO", ifng, "DMSO"],
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "cell_medium": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": ["B cell", "T cell", "T cell"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     # define the dataset-level metadata
     metadata = {
@@ -32,8 +37,10 @@ def small_dataset1(
     }
     # the dataset as DataFrame
     dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-    if format == "df":
-        return dataset_df, metadata
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
     else:
         dataset_ad = ad.AnnData(
             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
@@ -42,14 +49,19 @@ def small_dataset1(
 
 
 def small_dataset2(
-    format: Literal["df", "anndata"],
-) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD38"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
     dataset_dict = {
-        "CD8A": [2, 3, 3],
-        "CD4": [3, 4, 5],
-        "CD38": [4, 2, 3],
-        "cell_medium": ["DMSO", "IFNG", "IFNG"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "cell_medium": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     metadata = {
         "temperature": 22.6,
@@ -61,11 +73,13 @@ def small_dataset2(
         index=["sample4", "sample5", "sample6"],
     )
     ad.AnnData(
-        dataset_df[["CD8A", "CD4", "CD38"]],
+        dataset_df[var_ids],
        obs=dataset_df[["cell_medium", "cell_type_by_model"]],
     )
-    if format == "df":
-        return dataset_df, metadata
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
     else:
         dataset_ad = ad.AnnData(
             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
@@ -73,6 +87,38 @@ def small_dataset2(
     return dataset_ad
 
 
+def small_dataset3_cellxgene(
+    otype: Literal["DataFrame", "AnnData"] = "AnnData",
+) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    # TODO: consider other ids for other organisms
+    # "ENSMUSG00002076988"
+    var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
+    dataset_dict = {
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "disease_ontology_term_id": ["MONDO:0004975", "MONDO:0004980", "MONDO:0004980"],
+        "organism": ["human", "human", "human"],
+        "sex": ["female", "male", "unknown"],
+        "tissue": ["lungg", "lungg", "heart"],
+        "donor": ["-1", "1", "2"],
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["barcode1", "barcode2", "barcode3"],
+    )
+    dataset_df["tissue"] = dataset_df["tissue"].astype("category")
+    ad.AnnData(
+        dataset_df[var_ids],
+        obs=dataset_df[[key for key in dataset_dict if key not in var_ids]],
+    )
+    if otype == "DataFrame":
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:])
+        return dataset_ad
+
+
 def anndata_with_obs() -> ad.AnnData:
     """Create a mini anndata with cell_type, disease and tissue."""
     import anndata as ad
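
For illustration, the new `otype` and `gene_symbols_in_index` signatures can be exercised as below. This is a minimal sketch assuming lamindb 1.1.0 with these helpers importable from `lamindb.core.datasets`; note that dataset-level metadata now travels in `DataFrame.attrs` rather than being returned as a separate dict.

    from lamindb.core.datasets import small_dataset1, small_dataset2

    # DataFrame flavor: metadata is attached to df.attrs instead of returned separately
    df = small_dataset1(otype="DataFrame")
    print(df.attrs)

    # AnnData flavor with gene symbols instead of Ensembl ids in the var index
    adata = small_dataset2(otype="AnnData", gene_symbols_in_index=True)
    print(adata.var_names.tolist())  # expected: ['CD8A', 'CD4', 'CD38']
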
lamindb/core/exceptions.py CHANGED
@@ -1,90 +1 @@
-"""Exceptions.
-
-.. autosummary::
-   :toctree: .
-
-   InvalidArgument
-   DoesNotExist
-   ValidationError
-   NotebookNotSaved
-   MissingContextUID
-   UpdateContext
-   IntegrityError
-   RecordNameChangeIntegrityError
-
-"""
-
-# inheriting from SystemExit has the sole purpose of suppressing
-# the traceback - this isn't optimal but the current best solution
-# https://laminlabs.slack.com/archives/C04A0RMA0SC/p1726856875597489
-
-
-class InvalidArgument(SystemExit):
-    """Invalid method or function argument."""
-
-    pass
-
-
-class TrackNotCalled(SystemExit):
-    """`ln.track()` wasn't called."""
-
-    pass
-
-
-class NotebookNotSaved(SystemExit):
-    """Notebook wasn't saved."""
-
-    pass
-
-
-class ValidationError(SystemExit):
-    """Validation error: not mapped in registry."""
-
-    pass
-
-
-# inspired by Django's DoesNotExist
-# equivalent to SQLAlchemy's NoResultFound
-class DoesNotExist(SystemExit):
-    """No record found."""
-
-    pass
-
-
-class InconsistentKey(Exception):
-    """Inconsistent transform or artifact `key`."""
-
-    pass
-
-
-class RecordNameChangeIntegrityError(SystemExit):
-    """Custom exception for name change errors."""
-
-    pass
-
-
-# -------------------------------------------------------------------------------------
-# run context
-# -------------------------------------------------------------------------------------
-
-
-class IntegrityError(Exception):
-    """Integrity error.
-
-    For instance, it's not allowed to delete artifacts outside managed storage
-    locations.
-    """
-
-    pass
-
-
-class MissingContextUID(SystemExit):
-    """User didn't define transform settings."""
-
-    pass
-
-
-class UpdateContext(SystemExit):
-    """Transform settings require update."""
-
-    pass
+from ..errors import *  # noqa: F403 backward compat
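
The entire module body is replaced by a star re-export, so old import paths keep resolving while new code can target the new `lamindb/errors.py` (+96 lines) directly. A sketch under the assumption that exception names such as `ValidationError` and `DoesNotExist` are re-exported unchanged:

    # new canonical location
    from lamindb.errors import DoesNotExist, ValidationError

    # legacy location still works via the star re-export shown above
    from lamindb.core.exceptions import ValidationError as LegacyValidationError

    assert ValidationError is LegacyValidationError
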
lamindb/core/loaders.py CHANGED
@@ -109,19 +109,13 @@ def load_json(path: UPathStr) -> dict:
     return data
 
 
-def load_yaml(path: UPathStr) -> dict | UPathStr:
+def load_yaml(path: UPathStr) -> dict:
     """Load `.yaml` to `dict`."""
-    try:
-        import yaml  # type: ignore
-
-        with open(path) as f:
-            data = yaml.safe_load(f)
-        return data
-    except ImportError:
-        logger.warning(
-            "Please install PyYAML (`pip install PyYAML`) to load `.yaml` files."
-        )
-        return path
+    import yaml  # type: ignore
+
+    with open(path) as f:
+        data = yaml.safe_load(f)
+    return data
 
 
 def load_image(path: UPathStr) -> None | UPathStr:
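
`load_yaml` thus no longer degrades gracefully: without PyYAML it raises `ImportError` instead of warning and returning the path. A hedged caller-side sketch (`params.yaml` is a hypothetical file):

    from lamindb.core.loaders import load_yaml

    try:
        params = load_yaml("params.yaml")  # returns a dict when PyYAML is installed
    except ImportError:
        params = {}  # PyYAML is now a hard requirement for loading .yaml files
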
lamindb/core/relations.py CHANGED
@@ -8,7 +8,7 @@ from lamindb_setup._connect_instance import (
 )
 from lamindb_setup.core._settings_store import instance_settings_file
 
-from lamindb.models import LinkORM, Record, Schema
+from lamindb.models import LinkORM, Record, Registry, Schema
 
 
 def get_schema_modules(instance: str | None) -> set[str]:
@@ -35,9 +35,11 @@ def get_schema_modules(instance: str | None) -> set[str]:
     return shared_schema_modules
 
 
+# this function here should likely be renamed
+# it maps the __get_name_with_module__() onto the actual model
 def dict_module_name_to_model_name(
-    registry: type[Record], instance: str | None = None
-) -> dict[str, Record]:
+    registry: Registry, instance: str | None = None
+) -> dict[str, Registry]:
     schema_modules = get_schema_modules(instance)
     d: dict = {
         i.related_model.__get_name_with_module__(): i.related_model
@@ -92,7 +94,7 @@ def get_related_name(features_type: type[Record]) -> str:
         f"Can't create feature sets from {features_type.__name__} because it's not"
         " related to it!\nYou need to create a link model between Schema and"
         " your Record in your custom module.\nTo do so, add a"
-        " line:\n_schemas_m2m = models.ManyToMany(Schema,"
+        " line:\n_feature_sets = models.ManyToMany(Schema,"
        " related_name='mythings')\n"
     )
     return candidates[0]
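
The updated error message reflects the `_schemas_m2m` → `_feature_sets` rename visible elsewhere in this diff (migration 0087). A schematic sketch of a custom registry following that advice; `MyThing` and `mythings` are hypothetical names, `ManyToManyField` is the Django field the message abbreviates, and app registration/migrations are omitted:

    from django.db import models
    from lamindb.models import Record, Schema

    class MyThing(Record):
        name = models.CharField(max_length=255)
        # renamed from `_schemas_m2m` in lamindb 1.0.x
        _feature_sets = models.ManyToManyField(Schema, related_name="mythings")
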
lamindb/core/storage/_anndata_accessor.py CHANGED
@@ -19,6 +19,7 @@ from fsspec.implementations.local import LocalFileSystem
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
+from upath import UPath
 
 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -741,3 +742,43 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
         return AnnDataRawAccessor(
             self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
         )
+
+
+# get the number of observations in an anndata object or file fast and safely
+def _anndata_n_observations(object: UPathStr | AnnData) -> int | None:
+    if isinstance(object, AnnData):
+        return object.n_obs
+
+    try:
+        objectpath = UPath(object)
+        suffix = objectpath.suffix
+        conn_module = {".h5ad": "h5py", ".zarr": "zarr"}.get(suffix, suffix[1:])
+        conn, storage = registry.open(conn_module, objectpath, mode="r")
+    except Exception as e:
+        logger.warning(f"Could not open {object} to read n_observations: {e}")
+        return None
+
+    n_observations: int | None = None
+    try:
+        obs = storage["obs"]
+        if isinstance(obs, GroupTypes):  # type: ignore
+            if "_index" in obs.attrs:
+                elem_key = _read_attr(obs.attrs, "_index")
+            else:
+                elem_key = next(iter(obs))
+            elem = obs[elem_key]
+            if isinstance(elem, ArrayTypes):  # type: ignore
+                n_observations = elem.shape[0]
+            else:
+                # assume standard obs group
+                n_observations = elem["codes"].shape[0]
+        else:
+            n_observations = obs.shape[0]
+    except Exception as e:
+        logger.warning(f"Could not read n_observations from anndata {object}: {e}")
+    finally:
+        if hasattr(storage, "close"):
+            storage.close()
+        if hasattr(conn, "close"):
+            conn.close()
+    return n_observations
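
A rough usage sketch of the new helper; it is private API, and the on-disk path below is hypothetical:

    import anndata as ad
    import numpy as np
    from lamindb.core.storage._anndata_accessor import _anndata_n_observations

    adata = ad.AnnData(np.zeros((3, 2)))
    assert _anndata_n_observations(adata) == 3        # in-memory object: uses .n_obs
    n = _anndata_n_observations("data/example.h5ad")  # on disk: reads only the obs group
    print(n)  # int, or None if the file could not be opened
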
lamindb/core/storage/_backed_access.py CHANGED
@@ -94,8 +94,8 @@ def backed_access(
         return _open_pyarrow_dataset(objectpath)
     else:
         raise ValueError(
-            "object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix, not"
-            f" {suffix}."
+            "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
+            f"or be compatible with pyarrow.dataset.dataset, instead of being {suffix} object."
         )
 
     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
lamindb/core/storage/_pyarrow_dataset.py CHANGED
@@ -6,26 +6,36 @@ import pyarrow.dataset
 from lamindb_setup.core.upath import LocalPathClasses
 
 if TYPE_CHECKING:
+    from pyarrow.dataset import Dataset as PyArrowDataset
     from upath import UPath
 
 
-PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather")
+PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc")
 
 
-def _is_pyarrow_dataset(path: UPath) -> bool:
-    # it is assumed here that path exists
-    if path.is_file():
-        return path.suffix in PYARROW_SUFFIXES
+def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    if isinstance(paths, list):
+        suffixes = {path.suffix for path in paths}
+    elif paths.is_file():
+        suffixes = {paths.suffix}
     else:
-        objects = path.rglob("*")
-        suffixes = {object.suffix for object in objects if object.suffix != ""}
-        return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
-
-
-def _open_pyarrow_dataset(path: UPath) -> pyarrow.dataset.Dataset:
-    if isinstance(path, LocalPathClasses):
-        path_str, filesystem = path.as_posix(), None
+        suffixes = {path.suffix for path in paths.rglob("*") if path.suffix != ""}
+    return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
+
+
+def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
+    if isinstance(paths, list):
+        path0 = paths[0]
+        if isinstance(path0, LocalPathClasses):
+            paths_str, filesystem = [path.as_posix() for path in paths], None
+        else:
+            paths_str, filesystem = [path.path for path in paths], path0.fs
+    elif isinstance(paths, LocalPathClasses):
+        paths_str, filesystem = paths.as_posix(), None
     else:
-        path_str, filesystem = path.path, path.fs
+        paths_str, filesystem = paths.path, paths.fs
 
-    return pyarrow.dataset.dataset(path_str, filesystem=filesystem)
+    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem)
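
With this change a list of same-suffix shards on a single filesystem can back one dataset. A sketch with hypothetical local parquet files (both helpers are private API):

    from upath import UPath
    from lamindb.core.storage._pyarrow_dataset import (
        _is_pyarrow_dataset,
        _open_pyarrow_dataset,
    )

    shards = [UPath("data/part-0.parquet"), UPath("data/part-1.parquet")]
    if _is_pyarrow_dataset(shards):
        dataset = _open_pyarrow_dataset(shards)
        print(dataset.schema)
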
lamindb/core/storage/_tiledbsoma.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Literal
+from urllib.parse import urlparse
 
 import pandas as pd
 import pyarrow as pa
@@ -17,6 +18,7 @@ if TYPE_CHECKING:
     from lamindb_setup.core.types import UPathStr
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement
     from upath import UPath
 
 
@@ -36,9 +38,21 @@ def _load_h5ad_zarr(objpath: UPath):
 
 
 def _tiledb_config_s3(storepath: UPath) -> dict:
-    region = get_storage_region(storepath)
-    tiledb_config = {"vfs.s3.region": region}
     storage_options = storepath.storage_options
+    tiledb_config = {}
+
+    endpoint_url = storage_options.get("endpoint_url", None)
+    if endpoint_url is not None:
+        tiledb_config["vfs.s3.region"] = ""
+        tiledb_config["vfs.s3.use_virtual_addressing"] = "false"
+        parsed = urlparse(endpoint_url)
+        tiledb_config["vfs.s3.scheme"] = parsed.scheme
+        tiledb_config["vfs.s3.endpoint_override"] = (
+            parsed._replace(scheme="").geturl().lstrip("/")
+        )
+    else:
+        tiledb_config["vfs.s3.region"] = get_storage_region(storepath)
+
     if "key" in storage_options:
         tiledb_config["vfs.s3.aws_access_key_id"] = storage_options["key"]
     if "secret" in storage_options:
@@ -51,7 +65,7 @@ def _tiledb_config_s3(storepath: UPath) -> dict:
 
 def _open_tiledbsoma(
     storepath: UPath, mode: Literal["r", "w"] = "r"
-) -> SOMACollection | SOMAExperiment:
+) -> SOMACollection | SOMAExperiment | SOMAMeasurement:
     try:
         import tiledbsoma as soma
     except ImportError as e:
@@ -71,6 +85,8 @@ def _open_tiledbsoma(
         soma_objects = [obj.name for obj in storepath.iterdir()]
         if "obs" in soma_objects and "ms" in soma_objects:
             SOMAType = soma.Experiment
+        elif "var" in soma_objects:
+            SOMAType = soma.Measurement
         else:
             SOMAType = soma.Collection
     return SOMAType.open(storepath_str, mode=mode, context=ctx)
@@ -134,17 +150,17 @@ def save_tiledbsoma_experiment(
     )
     storepath = setup_settings.storage.root / storage_key
 
-    if storepath.protocol == "s3":
+    if storepath.protocol == "s3":  # type: ignore
         ctx = soma.SOMATileDBContext(tiledb_config=_tiledb_config_s3(storepath))
     else:
         ctx = None
 
-    storepath = storepath.as_posix()
+    storepath_str = storepath.as_posix()
 
     add_run_uid = True
     run_uid_dtype = "category"
     if appending:
-        with soma.Experiment.open(storepath, mode="r", context=ctx) as store:
+        with soma.Experiment.open(storepath_str, mode="r", context=ctx) as store:
             obs_schema = store["obs"].schema
             add_run_uid = "lamin_run_uid" in obs_schema.names
             # this is needed to enable backwards compatibility with tiledbsoma stores
@@ -175,7 +191,7 @@ def save_tiledbsoma_experiment(
     registration_mapping = kwargs.get("registration_mapping", None)
     if registration_mapping is None and (appending or len(adata_objects) > 1):
         registration_mapping = soma_io.register_anndatas(
-            experiment_uri=storepath if appending else None,
+            experiment_uri=storepath_str if appending else None,
             adatas=adata_objects,
             measurement_name=measurement_name,
             obs_field_name=obs_id_name,
@@ -195,19 +211,19 @@ def save_tiledbsoma_experiment(
         assert len(adata_objects) == 1  # noqa: S101
         n_observations = adata_objects[0].n_obs
 
-    logger.important(f"Writing the tiledbsoma store to {storepath}")
+    logger.important(f"Writing the tiledbsoma store to {storepath_str}")
     for adata_obj in adata_objects:
-        if resize_experiment and soma.Experiment.exists(storepath, context=ctx):
+        if resize_experiment and soma.Experiment.exists(storepath_str, context=ctx):
             # can only happen if registration_mapping is not None
             soma_io.resize_experiment(
-                storepath,
+                storepath_str,
                 nobs=n_observations,
                 nvars=registration_mapping.get_var_shapes(),
                 context=ctx,
             )
             resize_experiment = False
         soma_io.from_anndata(
-            storepath,
+            storepath_str,
             adata_obj,
             measurement_name,
             context=ctx,
@@ -217,7 +233,7 @@ def save_tiledbsoma_experiment(
         **kwargs,
     )
 
-    artifact = Artifact(
+    artifact = Artifact(  # type: ignore
         storepath,
         key=key,
         description=description,
@@ -229,3 +245,31 @@ def save_tiledbsoma_experiment(
     artifact.otype = "tiledbsoma"
 
     return artifact.save()
+
+
+# this is less defensive than _anndata_n_observations
+# this doesn't really catches errors
+# assumes that the tiledbsoma object is well-formed
+def _soma_store_n_observations(obj) -> int:
+    if obj.soma_type in {"SOMADataFrame", "SOMASparseNDArray", "SOMADenseNDArray"}:
+        return obj.non_empty_domain()[0][1] + 1
+    elif obj.soma_type == "SOMAExperiment":
+        return _soma_store_n_observations(obj["obs"])
+    elif obj.soma_type == "SOMAMeasurement":
+        keys = obj.keys()
+        for slot in ("X", "obsm", "obsp"):
+            if slot in keys:
+                return _soma_store_n_observations(next(iter(obj[slot].values())))
+    elif obj.soma_type == "SOMACollection":
+        n_obs = 0
+        for value in obj.values():
+            n_obs += _soma_store_n_observations(value)
+        return n_obs
+    raise ValueError(
+        "Could not infer the number of observations from the tiledbsoma object."
+    )
+
+
+def _soma_n_observations(objectpath: UPath) -> int:
+    with _open_tiledbsoma(objectpath, mode="r") as store:
+        return _soma_store_n_observations(store)
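
The endpoint handling matters for S3-compatible object stores such as MinIO: when the `UPath` carries an `endpoint_url` storage option, region lookup is skipped and the endpoint is forwarded to TileDB. A sketch with a hypothetical bucket and endpoint:

    from upath import UPath
    from lamindb.core.storage._tiledbsoma import _tiledb_config_s3

    storepath = UPath(
        "s3://my-bucket/store.tiledbsoma",
        endpoint_url="http://localhost:9000",  # hypothetical S3-compatible endpoint
    )
    config = _tiledb_config_s3(storepath)
    # expected entries include vfs.s3.endpoint_override='localhost:9000',
    # vfs.s3.scheme='http', and vfs.s3.region=''
    print(config)
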
lamindb/core/storage/paths.py CHANGED
@@ -4,7 +4,6 @@ import shutil
 from typing import TYPE_CHECKING
 
 import fsspec
-from lamin_utils import logger
 from lamindb_setup.core import StorageSettings
 from lamindb_setup.core.upath import (
     LocalPathClasses,
@@ -42,25 +41,27 @@ def auto_storage_key_from_artifact_uid(uid: str, suffix: str, is_dir: bool) -> s
     return storage_key
 
 
-def check_path_is_child_of_root(path: UPathStr, root: UPathStr) -> bool:
-    # str is needed to eliminate UPath storage_options
-    # from the equality checks below
-    # and for fsspec.utils.get_protocol
-    path_str = str(path)
-    root_str = str(root)
-    root_protocol = fsspec.utils.get_protocol(root_str)
-    # check that the protocols are the same first
-    if fsspec.utils.get_protocol(path_str) != root_protocol:
-        return False
-    if root_protocol in {"http", "https"}:
-        # in this case it is a base url, not a file
-        # so formally does not exist
+def _safely_resolve(upath: UPath) -> UPath:
+    if upath.protocol in {"http", "https"}:
         resolve_kwargs = {"follow_redirects": False}
     else:
         resolve_kwargs = {}
-    return (
-        UPath(root_str).resolve(**resolve_kwargs) in UPath(path_str).resolve().parents
-    )
+    return upath.resolve(**resolve_kwargs)
+
+
+def check_path_is_child_of_root(path: UPathStr, root: UPathStr) -> bool:
+    if fsspec.utils.get_protocol(str(path)) != fsspec.utils.get_protocol(str(root)):
+        return False
+    path_upath = _safely_resolve(UPath(path))
+    root_upath = _safely_resolve(UPath(root))
+    if path_upath.protocol == "s3":
+        endpoint_path = path_upath.storage_options.get("endpoint_url", "")
+        endpoint_root = root_upath.storage_options.get("endpoint_url", "")
+        if endpoint_path != endpoint_root:
+            return False
+    # str is needed to eliminate UPath storage_options
+    # which affect equality checks
+    return UPath(str(root_upath)) in UPath(str(path_upath)).parents
 
 
 # returns filepath and root of the storage
@@ -169,10 +170,15 @@ def store_file_or_folder(
 
 
 def delete_storage_using_key(
-    artifact: Artifact, storage_key: str, using_key: str | None
-):
+    artifact: Artifact,
+    storage_key: str,
+    raise_file_not_found_error: bool = True,
+    using_key: str | None = None,
+) -> None | str:
     filepath, _ = attempt_accessing_path(artifact, storage_key, using_key=using_key)
-    delete_storage(filepath)
+    return delete_storage(
+        filepath, raise_file_not_found_error=raise_file_not_found_error
+    )
 
 
 def delete_storage(
@@ -191,5 +197,5 @@ def delete_storage(
     elif raise_file_not_found_error:
         raise FileNotFoundError(f"{storagepath} is not an existing path!")
     else:
-        logger.warning(f"{storagepath} is not an existing path!")
+        return "did-not-delete"
     return None
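
The practical effect of the new endpoint check is that two `s3://` paths that only differ in their endpoint no longer count as nested. A rough sketch with hypothetical buckets and MinIO-style endpoints:

    from upath import UPath
    from lamindb.core.storage.paths import check_path_is_child_of_root

    root = UPath("s3://my-bucket", endpoint_url="http://minio-a:9000")
    path = UPath("s3://my-bucket/folder/file.parquet", endpoint_url="http://minio-b:9000")
    assert not check_path_is_child_of_root(path, root)  # endpoints differ -> not a child

    same = UPath("s3://my-bucket/folder/file.parquet", endpoint_url="http://minio-a:9000")
    assert check_path_is_child_of_root(same, root)  # same endpoint -> child of root
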
lamindb/core/subsettings/_creation_settings.py CHANGED
@@ -1,13 +1,8 @@
-from typing import Literal
-
-
 class CreationSettings:
-    artifact_if_hash_exists: Literal[
-        "warn_return_existing", "error", "warn_create_new"
-    ] = "warn_return_existing"
-    """Behavior if file hash exists (default `"warn_return_existing"`).
+    search_names: bool = True
+    """Switch off to speed up creating records (default `True`).
 
-    One of `["warn_return_existing", "error", "warn_create_new"]`.
+    If `True`, search for alternative names and avoids duplicates.
 
     FAQ: :doc:`/faq/idempotency`
     """
@@ -18,15 +13,8 @@ class CreationSettings:
 
     It speeds up file creation by about a factor 100.
     """
-    search_names: bool = True
-    """To speed up creating records (default `True`).
-
-    If `True`, search for alternative names.
-
-    FAQ: :doc:`/faq/idempotency`
-    """
     artifact_silence_missing_run_warning: bool = False
-    """Silence warning about missing run & transform during artifact creation."""
+    """Silence warning about missing run & transform during artifact creation (default `False`)."""
     _artifact_use_virtual_keys: bool = True
     """Treat `key` parameter in :class:`~lamindb.Artifact` as virtual.
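
In user code these settings are reached through `ln.settings.creation`; a sketch of the remaining speed-up toggle, assuming that attribute path is unchanged in 1.1.0 (`records` is a hypothetical list of records to ingest):

    import lamindb as ln

    # skip the search for similarly named records during bulk ingestion
    ln.settings.creation.search_names = False
    try:
        ln.save(records)
    finally:
        ln.settings.creation.search_names = True
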