lamindb 1.0.5__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. lamindb/__init__.py +17 -6
  2. lamindb/_artifact.py +202 -87
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +86 -52
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +21 -7
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +78 -18
  10. lamindb/_record.py +170 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +42 -11
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +129 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/users.py +1 -4
  19. lamindb/base/validation.py +2 -6
  20. lamindb/core/__init__.py +13 -14
  21. lamindb/core/_context.py +14 -9
  22. lamindb/core/_data.py +29 -25
  23. lamindb/core/_describe.py +1 -1
  24. lamindb/core/_django.py +1 -1
  25. lamindb/core/_feature_manager.py +53 -43
  26. lamindb/core/_label_manager.py +4 -4
  27. lamindb/core/_mapped_collection.py +24 -9
  28. lamindb/core/_track_environment.py +2 -1
  29. lamindb/core/datasets/__init__.py +6 -1
  30. lamindb/core/datasets/_core.py +12 -11
  31. lamindb/core/datasets/_small.py +67 -21
  32. lamindb/core/exceptions.py +1 -90
  33. lamindb/core/loaders.py +21 -15
  34. lamindb/core/relations.py +6 -4
  35. lamindb/core/storage/_anndata_accessor.py +49 -3
  36. lamindb/core/storage/_backed_access.py +12 -7
  37. lamindb/core/storage/_pyarrow_dataset.py +40 -15
  38. lamindb/core/storage/_tiledbsoma.py +56 -12
  39. lamindb/core/storage/paths.py +30 -24
  40. lamindb/core/subsettings/_creation_settings.py +4 -16
  41. lamindb/curators/__init__.py +2193 -846
  42. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  43. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  44. lamindb/errors.py +96 -0
  45. lamindb/integrations/_vitessce.py +3 -3
  46. lamindb/migrations/0069_squashed.py +76 -75
  47. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  48. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  49. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  50. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  51. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  52. lamindb/migrations/0086_various.py +95 -0
  53. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  54. lamindb/migrations/0088_schema_components.py +273 -0
  55. lamindb/migrations/0088_squashed.py +4372 -0
  56. lamindb/models.py +475 -168
  57. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/METADATA +9 -7
  58. lamindb-1.1.1.dist-info/RECORD +95 -0
  59. lamindb/curators/_spatial.py +0 -528
  60. lamindb/migrations/0052_squashed.py +0 -1261
  61. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  62. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  63. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  64. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  65. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  66. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  67. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  68. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  69. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  70. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  71. lamindb/migrations/0063_populate_latest_field.py +0 -45
  72. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  73. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  74. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  75. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  76. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  77. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  78. lamindb-1.0.5.dist-info/RECORD +0 -102
  79. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
  80. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/__init__.py CHANGED
@@ -1,12 +1,13 @@
  """A data framework for biology.
 
- Tracking notebooks & scripts.
+ Tracking notebooks, scripts & functions.
 
  .. autosummary::
  :toctree: .
 
  track
  finish
+ tracked
 
  Registries.
 
@@ -20,7 +21,7 @@ Registries.
  User
  Storage
  Feature
- FeatureSet
+ Schema
  Param
  Collection
  Project
@@ -33,7 +34,6 @@ Key functionality.
  :toctree: .
 
  connect
- Curator
  view
  save
 
@@ -44,23 +44,33 @@ Modules and settings.
 
  integrations
  context
+ curators
  settings
+ errors
  setup
  UPath
  base
  core
 
+ Backward compatibility.
+
+ .. autosummary::
+ :toctree: .
+
+ FeatureSet
+ Curator
+
  """
 
  # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
- __version__ = "1.0.5"
+ __version__ = "1.1.1"
 
  from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
  from lamindb_setup._check_setup import _check_instance_setup
  from lamindb_setup._connect_instance import connect
  from lamindb_setup.core.upath import UPath
 
- from . import base, setup
+ from . import base, errors, setup
 
 
  def __getattr__(name):
@@ -86,10 +96,11 @@ if _check_instance_setup(from_module="lamindb"):
  integrations,
  )
  from ._save import save
+ from ._tracked import tracked
  from ._view import view
  from .core._context import context
  from .core._settings import settings
- from .curators import Curator
+ from .curators import CatManager as Curator
  from .models import (
  Artifact,
  Collection,
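
Taken together, the changes to lamindb/__init__.py rename FeatureSet to Schema, keep Curator only as a backward-compatibility alias of curators.CatManager, and newly export tracked and the errors module. A minimal usage sketch of the renamed namespace follows; only the imported names come from the hunks above, while the decorator call style and the example function are assumptions:

    import lamindb as ln
    from lamindb.errors import InvalidArgument  # errors module is new in 1.1

    # newly exported; assumed to be usable as a decorator that tracks
    # lineage for plain Python functions
    @ln.tracked()
    def my_transform(artifact_key: str) -> None:
        ...

    schema_cls = ln.Schema      # renamed from FeatureSet
    legacy_cls = ln.FeatureSet  # kept for backward compatibility
    curator_cls = ln.Curator    # now an alias for lamindb.curators.CatManager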
lamindb/_artifact.py CHANGED
@@ -23,6 +23,8 @@ from lamindb_setup.core.upath import (
  get_stat_file_cloud,
  )
 
+ from lamindb._record import _get_record_kwargs
+ from lamindb.errors import FieldValidationError
  from lamindb.models import Artifact, FeatureManager, ParamManager, Run, Storage
 
  from ._parents import view_lineage
@@ -32,10 +34,9 @@ from .core._data import (
  describe,
  get_run,
  save_schema_links,
- save_staged__schemas_m2m,
+ save_staged_feature_sets,
  )
  from .core._settings import settings
- from .core.exceptions import IntegrityError, InvalidArgument
  from .core.loaders import load_to_memory
  from .core.storage import (
  LocalPathClasses,
@@ -44,7 +45,9 @@ from .core.storage import (
  infer_suffix,
  write_to_disk,
  )
+ from .core.storage._anndata_accessor import _anndata_n_observations
  from .core.storage._pyarrow_dataset import PYARROW_SUFFIXES
+ from .core.storage._tiledbsoma import _soma_n_observations
  from .core.storage.objects import _mudata_is_installed
  from .core.storage.paths import (
  AUTO_KEY_PREFIX,
@@ -58,6 +61,7 @@ from .core.versioning import (
  create_uid,
  message_update_key_in_version_family,
  )
+ from .errors import IntegrityError, InvalidArgument
 
  try:
  from .core.storage._zarr import zarr_is_adata
@@ -73,6 +77,7 @@ if TYPE_CHECKING:
  from pyarrow.dataset import Dataset as PyArrowDataset
  from tiledbsoma import Collection as SOMACollection
  from tiledbsoma import Experiment as SOMAExperiment
+ from tiledbsoma import Measurement as SOMAMeasurement
 
  from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
 
@@ -83,6 +88,7 @@ def process_pathlike(
  using_key: str | None,
  skip_existence_check: bool = False,
  ) -> tuple[Storage, bool]:
+ """Determines the appropriate storage for a given path and whether to use an existing storage key."""
  if not skip_existence_check:
  try: # check if file exists
  if not filepath.exists():
@@ -112,6 +118,10 @@ def process_pathlike(
  hf_path.path_in_repo = ""
  new_root = "hf://" + hf_path.unresolve()
  else:
+ if filepath.protocol == "s3":
+ # check that endpoint_url didn't propagate here
+ # as a part of the path string
+ assert "?" not in filepath.path # noqa: S101
  new_root = list(filepath.parents)[-1]
  # do not register remote storage locations on hub if the current instance
  # is not managed on the hub
@@ -142,6 +152,7 @@ def process_data(
  default_storage: Storage,
  using_key: str | None,
  skip_existence_check: bool = False,
+ is_replace: bool = False,
  ) -> tuple[Any, Path | UPath, str, Storage, bool]:
  """Serialize a data object that's provided as file or in memory."""
  # if not overwritten, data gets stored in default storage
@@ -151,14 +162,24 @@ def process_data(
  data_types = (pd.DataFrame, AnnData, MuData)
  else:
  data_types = (pd.DataFrame, AnnData) # type:ignore
-
+ if key is not None:
+ key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
+ # use suffix as the (adata) format if the format is not provided
+ if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
+ format = key_suffix[1:]
+ else:
+ key_suffix = None
  if isinstance(data, (str, Path, UPath)): # UPathStr, spelled out
  access_token = (
  default_storage._access_token
  if hasattr(default_storage, "_access_token")
  else None
  )
- path = create_path(data, access_token=access_token).resolve()
+ path = create_path(data, access_token=access_token)
+ # we don't resolve http links because they can resolve into a different domain
+ # for example into a temporary url
+ if path.protocol not in {"http", "https"}:
+ path = path.resolve()
  storage, use_existing_storage_key = process_pathlike(
  path,
  default_storage=default_storage,
@@ -170,31 +191,23 @@ def process_data(
  elif isinstance(data, data_types):
  storage = default_storage
  memory_rep = data
- if key is not None:
- key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
- # use suffix as the (adata) format if the format is not provided
- if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
- format = key_suffix[1:]
- else:
- key_suffix = None
  suffix = infer_suffix(data, format)
- if key_suffix is not None and key_suffix != suffix:
- raise InvalidArgument(
- f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
- f" be '{suffix}'."
- )
- cache_name = f"{provisional_uid}{suffix}"
- path = settings.cache_dir / cache_name
- # Alex: I don't understand the line below
- if path.suffixes == []:
- path = path.with_suffix(suffix)
- write_to_disk(data, path)
- use_existing_storage_key = False
  else:
  raise NotImplementedError(
- f"Do not know how to create a artifact object from {data}, pass a path"
- " instead!"
+ f"Do not know how to create a artifact object from {data}, pass a path instead!"
  )
+ if key_suffix is not None and key_suffix != suffix and not is_replace:
+ # consciously omitting a trailing period
+ if isinstance(data, (str, Path, UPath)):
+ message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
+ else:
+ message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
+ raise InvalidArgument(message)
+ # in case we have an in-memory representation, we need to write it to disk
+ if isinstance(data, data_types):
+ path = settings.cache_dir / f"{provisional_uid}{suffix}"
+ write_to_disk(data, path)
+ use_existing_storage_key = False
  return memory_rep, path, suffix, storage, use_existing_storage_key
 
 
@@ -205,6 +218,7 @@ def get_stat_or_artifact(
  is_replace: bool = False,
  instance: str | None = None,
  ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
+ """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
  n_files = None
  if settings.creation.artifact_skip_size_hash:
  return None, None, None, n_files, None
@@ -248,29 +262,14 @@ def get_stat_or_artifact(
  )
  previous_artifact_version = result[0]
  if artifact_with_same_hash_exists:
- if settings.creation.artifact_if_hash_exists == "error":
- msg = f"artifact with same hash exists: {result[0]}"
- hint = (
- "💡 you can make this error a warning:\n"
- " ln.settings.creation.artifact_if_hash_exists"
- )
- raise FileExistsError(f"{msg}\n{hint}")
- elif settings.creation.artifact_if_hash_exists == "warn_create_new":
- logger.warning(
- "creating new Artifact object despite existing artifact with same hash:"
- f" {result[0]}"
- )
- return size, hash, hash_type, n_files, None
- else:
- if result[0]._branch_code == -1:
- raise FileExistsError(
- f"You're trying to re-create this artifact in trash: {result[0]}"
- "Either permanently delete it with `artifact.delete(permanent=True)` or restore it with `artifact.restore()`"
- )
- logger.important(
- f"returning existing artifact with same hash: {result[0]}; if you intended to query to track this artifact as an input, use: ln.Artifact.get()"
- )
- return result[0]
+ message = "found artifact with same hash"
+ if result[0]._branch_code == -1:
+ result[0].restore()
+ message = "restored artifact with same hash from trash"
+ logger.important(
+ f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
+ )
+ return result[0]
  else:
  return size, hash, hash_type, n_files, previous_artifact_version
 
@@ -326,6 +325,7 @@ def get_artifact_kwargs_from_data(
  default_storage,
  using_key,
  skip_check_exists,
+ is_replace=is_replace,
  )
  stat_or_artifact = get_stat_or_artifact(
  path=path,
@@ -441,7 +441,7 @@ def log_storage_hint(
  root_path = Path(storage.root) # type: ignore
  if check_path_is_child_of_root(root_path, Path.cwd()):
  # only display the relative path, not the fully resolved path
- display_root = root_path.relative_to(Path.cwd())
+ display_root = root_path.relative_to(Path.cwd()) # type: ignore
  hint += f"path in storage '{display_root}'" # type: ignore
  else:
  hint += "path content will be copied to default storage upon `save()`"
@@ -458,7 +458,7 @@ def data_is_anndata(data: AnnData | UPathStr) -> bool:
  return True
  if isinstance(data, (str, Path, UPath)):
  data_path = UPath(data)
- if data_path.suffix == ".h5ad":
+ if ".h5ad" in data_path.suffixes: # ".h5ad.gz" is a valid suffix
  return True
  elif data_path.suffix == ".zarr":
  # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
@@ -480,7 +480,7 @@ def data_is_mudata(data: MuData | UPathStr) -> bool:
  if isinstance(data, MuData):
  return True
  if isinstance(data, (str, Path)):
- return UPath(data).suffix in {".h5mu"}
+ return UPath(data).suffix == ".h5mu"
  return False
 
 
@@ -506,8 +506,8 @@ def _check_otype_artifact(data: Any, otype: str | None = None):
 
 
  def __init__(artifact: Artifact, *args, **kwargs):
- artifact.features = FeatureManager(artifact)
- artifact.params = ParamManager(artifact)
+ artifact.features = FeatureManager(artifact) # type: ignore
+ artifact.params = ParamManager(artifact) # type: ignore
  # Below checks for the Django-internal call in from_db()
  # it'd be better if we could avoid this, but not being able to create a Artifact
  # from data with the default constructor renders the central class of the API
@@ -559,9 +559,9 @@ def __init__(artifact: Artifact, *args, **kwargs):
  logger.warning("`type` will be removed soon, please use `kind`")
  kind = kwargs.pop("type")
  if not len(kwargs) == 0:
- raise ValueError(
- "Only data, key, run, description, version, revises"
- f" can be passed, you passed: {kwargs}"
+ valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
+ raise FieldValidationError(
+ f"Only {valid_keywords} can be passed, you passed: {kwargs}"
  )
  if revises is not None and key is not None and revises.key != key:
  note = message_update_key_in_version_family(
@@ -676,6 +676,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
  def from_df(
  cls,
  df: pd.DataFrame,
+ *,
  key: str | None = None,
  description: str | None = None,
  run: Run | None = None,
@@ -683,7 +684,7 @@ def from_df(
  **kwargs,
  ) -> Artifact:
  """{}""" # noqa: D415
- artifact = Artifact(
+ artifact = Artifact( # type: ignore
  data=df,
  key=key,
  run=run,
@@ -693,6 +694,7 @@ def from_df(
  kind="dataset",
  **kwargs,
  )
+ artifact.n_observations = len(df)
  return artifact
 
 
@@ -701,6 +703,7 @@ def from_df(
  def from_anndata(
  cls,
  adata: AnnData | UPathStr,
+ *,
  key: str | None = None,
  description: str | None = None,
  run: Run | None = None,
@@ -710,7 +713,8 @@ def from_anndata(
  """{}""" # noqa: D415
  if not data_is_anndata(adata):
  raise ValueError("data has to be an AnnData object or a path to AnnData-like")
- artifact = Artifact(
+ _anndata_n_observations(adata)
+ artifact = Artifact( # type: ignore
  data=adata,
  key=key,
  run=run,
@@ -720,6 +724,17 @@ def from_anndata(
  kind="dataset",
  **kwargs,
  )
+ # this is done instead of _anndata_n_observations(adata)
+ # because we need a proper path through create_path for cloud paths
+ # for additional upath options etc that create_path adds
+ obj_for_obs: AnnData | UPath
+ if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
+ obj_for_obs = artifact._memory_rep
+ else:
+ # returns ._local_filepath for local files
+ # and the proper path through create_path for cloud paths
+ obj_for_obs = artifact.path
+ artifact.n_observations = _anndata_n_observations(obj_for_obs)
  return artifact
 
 
@@ -728,6 +743,7 @@ def from_anndata(
  def from_mudata(
  cls,
  mdata: MuData,
+ *,
  key: str | None = None,
  description: str | None = None,
  run: Run | None = None,
@@ -735,7 +751,7 @@ def from_mudata(
  **kwargs,
  ) -> Artifact:
  """{}""" # noqa: D415
- artifact = Artifact(
+ artifact = Artifact( # type: ignore
  data=mdata,
  key=key,
  run=run,
@@ -745,6 +761,38 @@ def from_mudata(
  kind="dataset",
  **kwargs,
  )
+ artifact.n_observations = mdata.n_obs
+ return artifact
+
+
+ @classmethod # type: ignore
+ @doc_args(Artifact.from_tiledbsoma.__doc__)
+ def from_tiledbsoma(
+ cls,
+ path: UPathStr,
+ *,
+ key: str | None = None,
+ description: str | None = None,
+ run: Run | None = None,
+ revises: Artifact | None = None,
+ **kwargs,
+ ) -> Artifact:
+ """{}""" # noqa: D415
+ if UPath(path).suffix != ".tiledbsoma":
+ raise ValueError(
+ "A tiledbsoma store should have .tiledbsoma suffix to be registered."
+ )
+ artifact = Artifact( # type: ignore
+ data=path,
+ key=key,
+ run=run,
+ description=description,
+ revises=revises,
+ otype="tiledbsoma",
+ kind="dataset",
+ **kwargs,
+ )
+ artifact.n_observations = _soma_n_observations(artifact.path)
  return artifact
 
 
@@ -753,8 +801,8 @@ def from_mudata(
  def from_dir(
  cls,
  path: UPathStr,
- key: str | None = None,
  *,
+ key: str | None = None,
  run: Run | None = None,
  ) -> list[Artifact]:
  """{}""" # noqa: D415
@@ -931,22 +979,34 @@ inconsistent_state_msg = (
 
  # docstring handled through attach_func_to_class_method
  def open(
- self, mode: str = "r", is_run_input: bool | None = None
+ self, mode: str = "r", is_run_input: bool | None = None, **kwargs
  ) -> (
- AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
+ AnnDataAccessor
+ | BackedAccessor
+ | SOMACollection
+ | SOMAExperiment
+ | SOMAMeasurement
+ | PyArrowDataset
  ):
  if self._overwrite_versions and not self.is_latest:
  raise ValueError(inconsistent_state_msg)
+ # all hdf5 suffixes including gzipped
+ h5_suffixes = [".h5", ".hdf5", ".h5ad"]
+ h5_suffixes += [s + ".gz" for s in h5_suffixes]
  # ignore empty suffix for now
  suffixes = (
- "",
- ".h5",
- ".hdf5",
- ".h5ad",
- ".zarr",
- ".anndata.zarr",
- ".tiledbsoma",
- ) + PYARROW_SUFFIXES
+ (
+ "",
+ ".zarr",
+ ".anndata.zarr",
+ ".tiledbsoma",
+ )
+ + tuple(h5_suffixes)
+ + PYARROW_SUFFIXES
+ + tuple(
+ s + ".gz" for s in PYARROW_SUFFIXES
+ ) # this doesn't work for externally gzipped files, REMOVE LATER
+ )
  if self.suffix not in suffixes:
  raise ValueError(
  "Artifact should have a zarr, h5, tiledbsoma object"
@@ -964,16 +1024,36 @@ def open(
  using_key = settings._using_key
  filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
  is_tiledbsoma_w = (
- filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
+ filepath.name == "soma" or self.suffix == ".tiledbsoma"
  ) and mode == "w"
  # consider the case where an object is already locally cached
  localpath = setup_settings.paths.cloud_to_local_no_update(
  filepath, cache_key=cache_key
  )
- if not is_tiledbsoma_w and localpath.exists():
- access = backed_access(localpath, mode, using_key)
+ if is_tiledbsoma_w:
+ open_cache = False
  else:
- access = backed_access(filepath, mode, using_key)
+ open_cache = not isinstance(
+ filepath, LocalPathClasses
+ ) and not filepath.synchronize(localpath, just_check=True)
+ if open_cache:
+ try:
+ access = backed_access(localpath, mode, using_key, **kwargs)
+ except Exception as e:
+ if isinstance(filepath, LocalPathClasses):
+ raise e
+ logger.warning(
+ f"The cache might be corrupted: {e}. Trying to open directly."
+ )
+ access = backed_access(filepath, mode, using_key, **kwargs)
+ # happens only if backed_access has been successful
+ # delete the corrupted cache
+ if localpath.is_dir():
+ shutil.rmtree(localpath)
+ else:
+ localpath.unlink(missing_ok=True)
+ else:
+ access = backed_access(filepath, mode, using_key, **kwargs)
  if is_tiledbsoma_w:
 
  def finalize():
@@ -1013,10 +1093,10 @@ def _synchronize_cleanup_on_error(
  cache_path = setup_settings.paths.cloud_to_local_no_update(
  filepath, cache_key=cache_key
  )
- if cache_path.is_file():
- cache_path.unlink(missing_ok=True)
- elif cache_path.is_dir():
+ if cache_path.is_dir():
  shutil.rmtree(cache_path)
+ else:
+ cache_path.unlink(missing_ok=True)
  raise e
  return cache_path
 
@@ -1033,8 +1113,24 @@ def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
  self, using_key=settings._using_key
  )
  cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
- # cache_path is local so doesn't trigger any sync in load_to_memory
- access_memory = load_to_memory(cache_path, **kwargs)
+ try:
+ # cache_path is local so doesn't trigger any sync in load_to_memory
+ access_memory = load_to_memory(cache_path, **kwargs)
+ except Exception as e:
+ # just raise the exception if the original path is local
+ if isinstance(filepath, LocalPathClasses):
+ raise e
+ logger.warning(
+ f"The cache might be corrupted: {e}. Retrying to synchronize."
+ )
+ # delete the existing cache
+ if cache_path.is_dir():
+ shutil.rmtree(cache_path)
+ else:
+ cache_path.unlink(missing_ok=True)
+ # download again and try to load into memory
+ cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
+ access_memory = load_to_memory(cache_path, **kwargs)
  # only call if load is successfull
  _track_run_input(self, is_run_input)
  return access_memory
@@ -1154,6 +1250,7 @@ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
  def save(self, upload: bool | None = None, **kwargs) -> Artifact:
  state_was_adding = self._state.adding
  print_progress = kwargs.pop("print_progress", True)
+ store_kwargs = kwargs.pop("store_kwargs", {}) # kwargs for .upload_from in the end
  access_token = kwargs.pop("access_token", None)
  local_path = None
  if upload and setup_settings.instance.keep_artifacts_local:
@@ -1174,15 +1271,31 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
  using_key = None
  if "using" in kwargs:
  using_key = kwargs["using"]
- exception = check_and_attempt_upload(
- self, using_key, access_token=access_token, print_progress=print_progress
+ exception_upload = check_and_attempt_upload(
+ self,
+ using_key,
+ access_token=access_token,
+ print_progress=print_progress,
+ **store_kwargs,
  )
- if exception is not None:
+ if exception_upload is not None:
+ # we do not want to raise file not found on cleanup if upload of a file failed
+ # often it is ACID in the filesystem itself
+ # for example, s3 won't have the failed file, so just skip the delete in this case
+ raise_file_not_found_error = False
  self._delete_skip_storage()
- raise RuntimeError(exception)
- exception = check_and_attempt_clearing(self, using_key)
- if exception is not None:
- raise RuntimeError(exception)
+ else:
+ # this is the case when it is cleaned on .replace
+ raise_file_not_found_error = True
+ # this is triggered by an exception in check_and_attempt_upload or by replace.
+ exception_clear = check_and_attempt_clearing(
+ self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key
+ )
+ if exception_upload is not None:
+ raise RuntimeError(exception_upload)
+ if exception_clear is not None:
+ raise RuntimeError(exception_clear)
+ # this is only for keep_artifacts_local
  if local_path is not None and not state_was_adding:
  # only move the local artifact to cache if it was not newly created
  local_path_cache = ln_setup.settings.cache_dir / local_path.name
@@ -1197,7 +1310,7 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
 
 
  def _save_skip_storage(file, **kwargs) -> None:
- save_staged__schemas_m2m(file)
+ save_staged_feature_sets(file)
  super(Artifact, file).save(**kwargs)
  save_schema_links(file)
 
@@ -1233,6 +1346,7 @@ METHOD_NAMES = [
  "from_anndata",
  "from_df",
  "from_mudata",
+ "from_tiledbsoma",
  "open",
  "cache",
  "load",
@@ -1256,6 +1370,7 @@ for name in METHOD_NAMES:
  attach_func_to_class_method(name, Artifact, globals())
 
  # privates currently dealt with separately
+ # mypy: ignore-errors
  Artifact._delete_skip_storage = _delete_skip_storage
  Artifact._save_skip_storage = _save_skip_storage
  Artifact._cache_path = _cache_path
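
The _artifact.py hunks above change the constructor surface: key, description, run, and revises become keyword-only in the from_* constructors, n_observations is now populated on creation, a from_tiledbsoma constructor is added that requires a ".tiledbsoma" suffix, open() forwards **kwargs to backed_access, and save() accepts store_kwargs. A hedged sketch of that surface follows; the DataFrame, keys, and the S3 path are made up for illustration and only the method names and keyword-only signatures come from the hunks:

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"cell_id": ["c1", "c2", "c3"], "score": [0.1, 0.7, 0.3]})

    # key/description/run/revises are keyword-only now (note the added `*,`)
    table = ln.Artifact.from_df(df, key="examples/scores.parquet", description="toy table")
    print(table.n_observations)  # set to len(df) by the new code path

    # new constructor: registers an existing tiledbsoma store;
    # a path without the ".tiledbsoma" suffix raises a ValueError
    soma = ln.Artifact.from_tiledbsoma(
        "s3://my-bucket/pbmc.tiledbsoma",  # hypothetical location
        key="soma/pbmc.tiledbsoma",
    )

    # open() now passes **kwargs through to backed_access and also accepts
    # gzipped h5 suffixes such as ".h5ad.gz"
    experiment = soma.open(mode="r")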