lamindb 1.6.2__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. lamindb/__init__.py +1 -3
  2. lamindb/_finish.py +32 -16
  3. lamindb/base/types.py +6 -4
  4. lamindb/core/_context.py +127 -57
  5. lamindb/core/_mapped_collection.py +1 -1
  6. lamindb/core/_settings.py +44 -4
  7. lamindb/core/_track_environment.py +5 -2
  8. lamindb/core/loaders.py +1 -1
  9. lamindb/core/storage/_anndata_accessor.py +1 -1
  10. lamindb/core/storage/_tiledbsoma.py +14 -8
  11. lamindb/core/storage/_valid_suffixes.py +0 -1
  12. lamindb/core/storage/_zarr.py +1 -1
  13. lamindb/core/storage/objects.py +13 -8
  14. lamindb/core/storage/paths.py +9 -6
  15. lamindb/core/types.py +1 -1
  16. lamindb/curators/_legacy.py +2 -1
  17. lamindb/curators/core.py +106 -105
  18. lamindb/errors.py +9 -0
  19. lamindb/examples/fixtures/__init__.py +0 -0
  20. lamindb/examples/fixtures/sheets.py +224 -0
  21. lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +1 -1
  22. lamindb/migrations/0105_record_unique_name.py +20 -0
  23. lamindb/migrations/0106_transfer_data_migration.py +25 -0
  24. lamindb/migrations/0107_add_schema_to_record.py +68 -0
  25. lamindb/migrations/0108_remove_record_sheet_remove_sheetproject_sheet_and_more.py +30 -0
  26. lamindb/migrations/0109_record_input_of_runs_alter_record_run_and_more.py +123 -0
  27. lamindb/migrations/0110_rename_values_artifacts_record_linked_artifacts.py +17 -0
  28. lamindb/migrations/0111_remove_record__sort_order.py +148 -0
  29. lamindb/migrations/0112_alter_recordartifact_feature_and_more.py +105 -0
  30. lamindb/migrations/0113_lower_case_branch_and_space_names.py +62 -0
  31. lamindb/migrations/0114_alter_run__status_code.py +24 -0
  32. lamindb/migrations/0115_alter_space_uid.py +52 -0
  33. lamindb/migrations/{0104_squashed.py → 0115_squashed.py} +261 -257
  34. lamindb/models/__init__.py +4 -3
  35. lamindb/models/_describe.py +88 -31
  36. lamindb/models/_feature_manager.py +627 -658
  37. lamindb/models/_label_manager.py +1 -3
  38. lamindb/models/artifact.py +214 -99
  39. lamindb/models/collection.py +7 -1
  40. lamindb/models/feature.py +288 -60
  41. lamindb/models/has_parents.py +3 -3
  42. lamindb/models/project.py +32 -15
  43. lamindb/models/query_manager.py +7 -1
  44. lamindb/models/query_set.py +118 -41
  45. lamindb/models/record.py +140 -94
  46. lamindb/models/run.py +42 -42
  47. lamindb/models/save.py +102 -16
  48. lamindb/models/schema.py +41 -8
  49. lamindb/models/sqlrecord.py +105 -40
  50. lamindb/models/storage.py +278 -0
  51. lamindb/models/transform.py +10 -2
  52. lamindb/models/ulabel.py +9 -1
  53. lamindb/py.typed +0 -0
  54. lamindb/setup/__init__.py +2 -1
  55. lamindb/setup/_switch.py +16 -0
  56. lamindb/setup/errors/__init__.py +4 -0
  57. lamindb/setup/types/__init__.py +4 -0
  58. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/METADATA +5 -5
  59. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/RECORD +61 -44
  60. lamindb/models/core.py +0 -135
  61. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/LICENSE +0 -0
  62. {lamindb-1.6.2.dist-info → lamindb-1.7.0.dist-info}/WHEEL +0 -0
@@ -33,9 +33,7 @@ if TYPE_CHECKING:
     from lamindb.models import Artifact, Collection, SQLRecord
     from lamindb.models.query_set import QuerySet

-# we do not want to show records because this is a breaking change until all instances are migrated
-# TODO: remove records from below once all instances are migrated
-EXCLUDE_LABELS = {"feature_sets", "records"}
+EXCLUDE_LABELS = {"feature_sets"}


 def _get_labels(
@@ -16,17 +16,15 @@ from django.db import connections, models
 from django.db.models import CASCADE, PROTECT, Q
 from lamin_utils import colors, logger
 from lamindb_setup import settings as setup_settings
-from lamindb_setup._init_instance import register_storage_in_instance
 from lamindb_setup.core._hub_core import select_storage_or_parent
-from lamindb_setup.core._settings_storage import init_storage
 from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file
-from lamindb_setup.core.types import UPathStr
 from lamindb_setup.core.upath import (
     create_path,
     extract_suffix_from_path,
     get_stat_dir_cloud,
     get_stat_file_cloud,
 )
+from lamindb_setup.types import UPathStr

 from lamindb.base import deprecated
 from lamindb.base.fields import (
@@ -35,7 +33,7 @@ from lamindb.base.fields import (
     CharField,
     ForeignKey,
 )
-from lamindb.errors import FieldValidationError
+from lamindb.errors import FieldValidationError, UnknownStorageLocation
 from lamindb.models.query_set import QuerySet

 from ..base.users import current_user_id
@@ -70,8 +68,6 @@ from ..models._is_versioned import (
 from ._django import get_artifact_with_related
 from ._feature_manager import (
     FeatureManager,
-    FeatureManagerArtifact,
-    add_label_feature_links,
     filter_base,
     get_label_links,
 )
@@ -80,10 +76,10 @@ from ._relations import (
     dict_module_name_to_model_name,
     dict_related_model_to_related_name,
 )
-from .core import Storage
 from .feature import Feature, FeatureValue
 from .has_parents import view_lineage
 from .run import Run, TracksRun, TracksUpdates, User
+from .save import check_and_attempt_clearing, check_and_attempt_upload
 from .schema import Schema
 from .sqlrecord import (
     BaseSQLRecord,
@@ -92,6 +88,7 @@ from .sqlrecord import (
     _get_record_kwargs,
     record_repr,
 )
+from .storage import Storage
 from .ulabel import ULabel

 WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run"
@@ -127,6 +124,7 @@ if TYPE_CHECKING:
     from ._label_manager import LabelManager
     from .collection import Collection
     from .project import Project, Reference
+    from .record import Record
     from .transform import Transform

@@ -173,19 +171,32 @@ def process_pathlike(
         if filepath.protocol == "hf":
             hf_path = filepath.fs.resolve_path(filepath.as_posix())
             hf_path.path_in_repo = ""
-            new_root = "hf://" + hf_path.unresolve()
+            new_root = "hf://" + hf_path.unresolve().rstrip("/")
         else:
             if filepath.protocol == "s3":
                 # check that endpoint_url didn't propagate here
                 # as a part of the path string
                 assert "?" not in filepath.path  # noqa: S101
-            new_root = list(filepath.parents)[-1]
-        # do not register remote storage locations on hub if the current instance
-        # is not managed on the hub
-        storage_settings, _ = init_storage(
-            new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
-        )
-        storage_record = register_storage_in_instance(storage_settings)
+            new_root = list(filepath.parents)[-1].as_posix().rstrip("/")
+        # Regarding parallel execution of the logic below:
+        # one of the threads (or processes) starts to write the hub record and then the test file,
+        # while the others retrieve the hub record and the test file.
+        # All of them come out of the exercise with storage_record.instance_uid == setup_settings.instance.uid
+        # and all of them raise UnknownStorageLocation.
+        # One of these threads then triggers storage_record.delete(), but this is idempotent;
+        # they all throw the same error, and deleting the then-nonexistent objects (hub record, marker file)
+        # just silently fails.
+        # Edge case: a user legitimately creates a storage location while another user runs this at the exact same time.
+        # There is no way to decide which creation is the legitimate one.
+        storage_record = Storage(root=new_root).save()
+        if storage_record.instance_uid == setup_settings.instance.uid:
+            # we don't want to inadvertently create managed storage locations
+            # hence, we revert the creation and throw an error
+            storage_record.delete()
+            raise UnknownStorageLocation(
+                f"Path {filepath} is not contained in any known storage location:\n{Storage.df()[['uid', 'root', 'type']]}\n\n"
+                f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()"
+            )
         use_existing_storage_key = True
         return storage_record, use_existing_storage_key
     # if the filepath is local
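
In practice, this hunk changes artifact creation from paths outside any known storage location: instead of silently registering a managed location (the removed init_storage / register_storage_in_instance path), lamindb 1.7.0 raises UnknownStorageLocation and asks the user to register the location explicitly. A minimal sketch of the new flow (the bucket name is hypothetical):

    import lamindb as ln
    from lamindb.errors import UnknownStorageLocation

    try:
        # this bucket is not yet a known storage location
        af = ln.Artifact("s3://unregistered-bucket/data.parquet", description="my data")
    except UnknownStorageLocation:
        # as of 1.7.0, register the location explicitly, then retry
        ln.Storage(root="s3://unregistered-bucket").save()
        af = ln.Artifact("s3://unregistered-bucket/data.parquet", description="my data")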
@@ -271,7 +282,11 @@ def process_data(
         from lamindb import settings

         path = settings.cache_dir / f"{provisional_uid}{suffix}"
-        write_to_disk(data, path)
+        if isinstance(format, dict):
+            format.pop("suffix", None)
+        else:
+            format = {}
+        write_to_disk(data, path, **format)
         use_existing_storage_key = False

     return memory_rep, path, suffix, storage, use_existing_storage_key
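
With this change, format may be a dict whose entries (minus the reserved "suffix" key) are forwarded to write_to_disk(). A hedged sketch of passing writer options through the constructor's format kwarg; whether a given writer accepts a particular option (here, compression for .h5ad) is an assumption that depends on the object type:

    import anndata as ad
    import lamindb as ln

    adata = ad.AnnData()
    # "suffix" selects the on-disk representation; remaining keys are
    # forwarded to the writer (the accepted kwargs are writer-specific)
    af = ln.Artifact.from_anndata(
        adata, key="my.h5ad", format={"suffix": ".h5ad", "compression": "gzip"}
    )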
@@ -355,7 +370,7 @@ def check_path_in_existing_storage(
     if check_hub_register_storage and getattr(path, "protocol", None) in {"s3", "gs"}:
         result = select_storage_or_parent(path.as_posix())
         if result is not None:
-            return Storage(**result).save()
+            return Storage(**result, _skip_preparation=True).save()
     return None

@@ -390,6 +405,7 @@ def get_artifact_kwargs_from_data(
     using_key: str | None = None,
     is_replace: bool = False,
     skip_check_exists: bool = False,
+    overwrite_versions: bool | None = None,
 ):
     from lamindb import settings

@@ -458,7 +474,8 @@
         # we use an actual storage key
         if check_path_in_storage:
             key_is_virtual = False
-
+    if overwrite_versions is None:
+        overwrite_versions = n_files is not None
     kwargs = {
         "uid": provisional_uid,
         "suffix": suffix,
@@ -471,7 +488,7 @@
         # to make them both available immediately
         # after object creation
         "n_files": n_files,
-        "_overwrite_versions": n_files is not None,  # True for folder, False for file
+        "_overwrite_versions": overwrite_versions,  # True for folder, False for file
         "n_observations": None,  # to implement
         "run_id": run.id if run is not None else None,
         "run": run,
@@ -525,25 +542,25 @@

 def data_is_scversedatastructure(
     data: ScverseDataStructures | UPathStr,
-    expected_ds: Literal["AnnData", "MuData", "SpatialData"] | None = None,
+    structure_type: Literal["AnnData", "MuData", "SpatialData"] | None = None,
 ) -> bool:
     """Determine whether a specific in-memory object or a `UPathStr` is any or a specific scverse data structure."""
     file_suffix = None
-    if expected_ds == "AnnData":
+    if structure_type == "AnnData":
         file_suffix = ".h5ad"
-    elif expected_ds == "MuData":
+    elif structure_type == "MuData":
         file_suffix = ".h5mu"
     # SpatialData does not have a unique suffix but `.zarr`

-    if expected_ds is None:
+    if structure_type is None:
         return any(
             hasattr(data, "__class__") and data.__class__.__name__ == cl_name
             for cl_name in ["AnnData", "MuData", "SpatialData"]
         )
-    elif hasattr(data, "__class__") and data.__class__.__name__ == expected_ds:
+    elif hasattr(data, "__class__") and data.__class__.__name__ == structure_type:
         return True

-    data_type = expected_ds.lower()
+    data_type = structure_type.lower()
     if isinstance(data, (str, Path, UPath)):
         data_path = UPath(data)
@@ -559,13 +576,15 @@
         if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
             return (
                 identify_zarr_type(
-                    data_path if expected_ds == "AnnData" else data,
-                    check=True if expected_ds == "AnnData" else False,
+                    data_path if structure_type == "AnnData" else data,
+                    check=True if structure_type == "AnnData" else False,
                 )
                 == data_type
             )
         else:
-            logger.warning(f"We do not check if cloud zarr is {expected_ds} or not")
+            logger.warning(
+                f"we do not check whether cloud zarr is {structure_type}"
+            )
             return False
     return False

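Calls against the renamed keyword look as follows; a minimal sketch, assuming the internal helper stays importable from this module:

    import anndata as ad
    from lamindb.models.artifact import data_is_scversedatastructure

    adata = ad.AnnData()
    data_is_scversedatastructure(adata)                            # True: any scverse structure
    data_is_scversedatastructure(adata, structure_type="AnnData")  # True: specific check
    data_is_scversedatastructure(adata, structure_type="MuData")   # False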
@@ -957,8 +976,7 @@ def add_labels(
     features_labels = {
         registry_name: [(feature, label_record) for label_record in records]
     }
-    add_label_feature_links(
-        self.features,
+    self.features._add_label_feature_links(
         features_labels,
         feature_ref_is_name=feature_ref_is_name,
         label_ref_is_name=label_ref_is_name,
@@ -980,7 +998,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
         description: `str | None = None` A description.
         revises: `Artifact | None = None` Previous version of the artifact. An alternative to passing `key` to trigger a new version.
-        run: `Run | None = None` The run that creates the artifact.
+        overwrite_versions: `bool | None = None` Whether to overwrite versions. Defaults to `True` for folders and `False` for files.
+        run: `Run | bool | None = None` The run that creates the artifact. If `False`, suppress tracking the run.
+            If `None`, infer the run from the global run context.

     Examples:

@@ -1090,47 +1110,54 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         ),
     ]

+    _aux_fields: dict[str, tuple[str, type]] = {
+        "0": ("_is_saved_to_storage_location", bool),
+    }
     _len_full_uid: int = 20
     _len_stem_uid: int = 16

-    features: FeatureManager = FeatureManagerArtifact  # type: ignore
-    """Feature manager.
+    @property
+    def features(self) -> FeatureManager:
+        """Feature manager.
+
+        Typically, you annotate a dataset with features by defining a `Schema` and passing it to the `Artifact` constructor.

-    Typically, you annotate a dataset with features by defining a `Schema` and passing it to the `Artifact` constructor.
+        Here is how to annotate an artifact ad hoc::

-    Here is how to do annotate an artifact ad hoc::
+            artifact.features.add_values({
+                "species": organism,  # here, organism is an Organism record
+                "scientist": ['Barbara McClintock', 'Edgar Anderson'],
+                "temperature": 27.6,
+                "experiment": "Experiment 1"
+            })

-        artifact.features.add_values({
-            "species": organism,  # here, organism is an Organism record
-            "scientist": ['Barbara McClintock', 'Edgar Anderson'],
-            "temperature": 27.6,
-            "experiment": "Experiment 1"
-        })
+        Query artifacts by features::

-    Query artifacts by features::
+            ln.Artifact.filter(scientist="Barbara McClintock")

-        ln.Artifact.filter(scientist="Barbara McClintock")
+        Features may or may not be part of the dataset, i.e., the artifact content in storage. For
+        instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
+        `DataFrame`-like artifact and annotates it with features corresponding to
+        these columns. `artifact.features.add_values`, by contrast, does not
+        validate the content of the artifact.

-    Features may or may not be part of the dataset, i.e., the artifact content in storage. For
-    instance, the :class:`~lamindb.curators.DataFrameCurator` flow validates the columns of a
-    `DataFrame`-like artifact and annotates it with features corresponding to
-    these columns. `artifact.features.add_values`, by contrast, does not
-    validate the content of the artifact.
+        .. dropdown:: An example for a model-like artifact

-    .. dropdown:: An example for a model-like artifact
+            ::

-        ::
+                artifact.features.add_values({
+                    "hidden_size": 32,
+                    "bottleneck_size": 16,
+                    "batch_size": 32,
+                    "preprocess_params": {
+                        "normalization_type": "cool",
+                        "subset_highlyvariable": True,
+                    },
+                })
+        """
+        from ._feature_manager import FeatureManager

-            artifact.features.add_values({
-                "hidden_size": 32,
-                "bottleneck_size": 16,
-                "batch_size": 32,
-                "preprocess_params": {
-                    "normalization_type": "cool",
-                    "subset_highlyvariable": True,
-                },
-            })
-    """
+        return FeatureManager(self)

     @property
     def labels(self) -> LabelManager:
@@ -1219,9 +1246,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     """Number of files for folder-like artifacts, `None` for file-like artifacts.

     Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
-
-    .. versionchanged:: 1.0
-        Renamed from `n_objects` to `n_files`.
     """
     n_observations: int | None = BigIntegerField(
         null=True, db_index=True, default=None, editable=False
@@ -1289,14 +1313,15 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     )
     """Creator of record."""
     _overwrite_versions: bool = BooleanField(default=None)
-    """Indicates whether to store or overwrite versions.
-
-    It defaults to False for file-like artifacts and to True for folder-like artifacts.
-    """
+    # see corresponding property `overwrite_versions`
     projects: Project
-    """Linked projects."""
+    """Annotating projects."""
     references: Reference
-    """Linked references."""
+    """Annotating references."""
+    records: Record
+    """Annotating records."""
+    linked_in_records: Record
+    """Linked in records."""

     @overload
     def __init__(
@@ -1313,7 +1338,8 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         key: str | None = None,
         description: str | None = None,
         revises: Artifact | None = None,
-        run: Run | None = None,
+        overwrite_versions: bool | None = None,
+        run: Run | False | None = None,
     ): ...

     @overload
@@ -1327,28 +1353,23 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         *args,
         **kwargs,
     ):
-        self.features = FeatureManager(self)  # type: ignore
-        # Below checks for the Django-internal call in from_db()
-        # it'd be better if we could avoid this, but not being able to create a Artifact
-        # from data with the default constructor renders the central class of the API
-        # essentially useless
-        # The danger below is not that a user might pass as many args (12 of it), but rather
-        # that at some point the Django API might change; on the other hand, this
-        # condition of for calling the constructor based on kwargs should always
-        # stay robust
+        # check whether we are called with db args
         if len(args) == len(self._meta.concrete_fields):
             super().__init__(*args, **kwargs)
             return None
-        # now we proceed with the user-facing constructor
+        # now proceed with the user-facing constructor
         if len(args) > 1:
             raise ValueError("Only one non-keyword arg allowed: data")
         data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
         kind: str = kwargs.pop("kind", None)
         key: str | None = kwargs.pop("key", None)
+        run_id: int | None = kwargs.pop("run_id", None)  # for REST API
         run: Run | None = kwargs.pop("run", None)
         description: str | None = kwargs.pop("description", None)
         revises: Artifact | None = kwargs.pop("revises", None)
+        overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
         version: str | None = kwargs.pop("version", None)
+        branch_id: int | None = None
         if "visibility" in kwargs:  # backward compat
             branch_id = kwargs.pop("visibility")
         if "_branch_code" in kwargs:  # backward compat
@@ -1357,6 +1378,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             branch_id = kwargs.pop("branch_id")
         else:
             branch_id = 1
+        branch = kwargs.pop("branch", None)
+        space = kwargs.pop("space", None)
+        space_id = kwargs.pop("space_id", 1)
         format = kwargs.pop("format", None)
         _is_internal_call = kwargs.pop("_is_internal_call", False)
         skip_check_exists = kwargs.pop("skip_check_exists", False)
@@ -1369,6 +1393,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         default_storage = setup_settings.instance.storage.record
         using_key = kwargs.pop("using_key", None)
         otype = kwargs.pop("otype") if "otype" in kwargs else None
+        if isinstance(data, str) and data.startswith("s3:///"):
+            # issue in Groovy / nf-lamin producing malformed S3 paths
+            # https://laminlabs.slack.com/archives/C08J590666Q/p1751315027830849?thread_ts=1751039961.479259&cid=C08J590666Q
+            data = data.replace("s3:///", "s3://")
         otype = _check_otype_artifact(data=data, otype=otype)
         if "type" in kwargs:
             logger.warning("`type` will be removed soon, please use `kind`")
@@ -1413,6 +1441,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             default_storage=default_storage,
             using_key=using_key,
             skip_check_exists=skip_check_exists,
+            overwrite_versions=overwrite_versions,
         )

         # an object with the same hash already exists
@@ -1462,10 +1491,15 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         if revises is not None and revises.key is not None and kwargs["key"] is None:
             kwargs["key"] = revises.key

+        if run_id is not None:
+            kwargs["run_id"] = run_id
         kwargs["kind"] = kind
         kwargs["version"] = version
         kwargs["description"] = description
+        kwargs["branch"] = branch
         kwargs["branch_id"] = branch_id
+        kwargs["space"] = space
+        kwargs["space_id"] = space_id
         kwargs["otype"] = otype
         kwargs["revises"] = revises
         # this check needs to come down here because key might be populated from an
@@ -1504,6 +1538,18 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     def n_objects(self) -> int:
         return self.n_files

+    @property
+    def overwrite_versions(self) -> bool:
+        """Indicates whether to keep or overwrite versions.
+
+        It defaults to `False` for file-like artifacts and to `True` for folder-like artifacts.
+
+        Note that this requires significant storage space for large folders with
+        many duplicated files. Currently, `lamindb` does *not* de-duplicate files across
+        versions as in git, but keeps all files for all versions of the folder in storage.
+        """
+        return self._overwrite_versions
+
     @property
     def path(self) -> Path:
         """Path.
@@ -1605,13 +1651,17 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         keys_normalized = [key.split("__")[0] for key in expressions]
         field_or_feature_or_param = keys_normalized[0].split("__")[0]
         if field_or_feature_or_param in Artifact.__get_available_fields__():
-            return QuerySet(model=cls).filter(*queries, **expressions)
+            qs = QuerySet(model=cls).filter(*queries, **expressions)
+            if not any(e.startswith("kind") for e in expressions):
+                return qs.exclude(kind="__lamindb_run__")
+            else:
+                return qs
         elif all(
             features_validated := Feature.validate(
                 keys_normalized, field="name", mute=True
             )
         ):
-            return filter_base(FeatureManagerArtifact, **expressions)
+            return filter_base(Artifact, **expressions)
         else:
             features = ", ".join(
                 sorted(np.array(keys_normalized)[~features_validated])
@@ -1626,7 +1676,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 f"Or fix invalid {message}"
             )
         else:
-            return QuerySet(model=cls).filter(*queries, **expressions)
+            return (
+                QuerySet(model=cls)
+                .filter(*queries, **expressions)
+                .exclude(kind="__lamindb_run__")
+            )

     @classmethod
     def from_df(
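
Taken together, the two filter hunks above hide run-internal artifacts from default queries, while an explicit kind filter opts back in. A short sketch:

    import lamindb as ln

    ln.Artifact.filter(suffix=".parquet")       # now excludes kind="__lamindb_run__"
    ln.Artifact.filter(kind="__lamindb_run__")  # an explicit kind filter opts back in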
@@ -1696,6 +1750,13 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         if schema is not None:
             from ..curators import DataFrameCurator

+            if not artifact._state.adding and artifact.suffix != ".parquet":
+                logger.warning(
+                    f"not re-validating existing artifact as it was stored as {artifact.suffix}, "
+                    "which does not maintain categorical dtype information"
+                )
+                return artifact
+
             curator = DataFrameCurator(artifact, schema)
             curator.validate()
             artifact.schema = schema
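
The guard only skips re-validation for an existing artifact stored with a non-parquet suffix; fresh artifacts are still validated. A hedged sketch (the flexible ln.Schema(itype=ln.Feature) schema is an assumption):

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"cell_type": pd.Categorical(["B cell", "T cell"])})
    schema = ln.Schema(itype=ln.Feature)  # assumption: an "any feature" schema
    af = ln.Artifact.from_df(df, key="cells.parquet", schema=schema).save()
    # re-running with identical content resolves to the existing record;
    # a parquet artifact is re-validated, whereas one stored with a
    # non-parquet suffix is now returned with a warning instead
    af2 = ln.Artifact.from_df(df, key="cells.parquet", schema=schema)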
@@ -1726,7 +1787,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             schema: A schema that defines how to validate & annotate.

         See Also:
-
             :meth:`~lamindb.Collection`
                 Track collections.
             :class:`~lamindb.Feature`
@@ -1933,12 +1993,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         revises: Artifact | None = None,
         **kwargs,
     ) -> Artifact:
-        """Create from a tiledbsoma store.
+        """Create from a `tiledbsoma.Experiment` store.

         Args:
-            path: A tiledbsoma store with .tiledbsoma suffix.
-            key: A relative path within default storage,
-                e.g., `"myfolder/mystore.tiledbsoma"`.
+            exp: TileDB-SOMA Experiment object or path to Experiment store.
+            key: A relative path within default storage, e.g., `"myfolder/mystore.tiledbsoma"`.
             description: A description.
             revises: An old version of the artifact.
             run: The run that creates the artifact.
@@ -1953,7 +2012,11 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             raise ValueError(
                 "data has to be a SOMA Experiment object or a path to SOMA Experiment store."
             )
+
+        # SOMAExperiment.uri may have a file:// prefix for local paths, which needs stripping for filesystem access.
+        # Other URI schemes (s3://, etc.) are preserved and supported.
         exp = exp.uri.removeprefix("file://") if not isinstance(exp, UPathStr) else exp
+
         artifact = Artifact(  # type: ignore
             data=exp,
             key=key,
@@ -1975,7 +2038,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         key: str | None = None,
         run: Run | None = None,
     ) -> list[Artifact]:
-        """Create a list of artifact objects from a directory.
+        """Create a list of :class:`~lamindb.Artifact` objects from a directory.

         Hint:
             If you have a high number of files (several 100k) and don't want to
@@ -2476,8 +2539,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         will not delete the underlying storage by default (if `storage=True` is not specified).
         Deleting the latest version will delete all the versions for folder artifacts.

-        FAQ: :doc:`docs:faq/storage`
-
         Args:
             permanent: Permanently delete the artifact (skip trash).
             storage: Indicate whether you want to delete the artifact in storage.
@@ -2585,18 +2646,56 @@
         if delete_msg != "did-not-delete":
             logger.success(f"deleted {colors.yellow(f'{path}')}")

-    def save(self, upload: bool | None = None, **kwargs) -> Artifact:
+    @property
+    def _is_saved_to_storage_location(self) -> bool | None:
+        """Indicates whether this artifact was correctly written to its storage location.
+
+        This is meaningful only after calling `.save()`.
+
+        `None` means no write was necessary, `True` that it was written correctly,
+        and `False` that there was a problem with writing.
+        """
+        if self._aux is not None:
+            return self._aux.get("af", {}).get("0", None)
+        else:
+            return None
+
+    @_is_saved_to_storage_location.setter
+    def _is_saved_to_storage_location(self, value: bool) -> None:
+        self._aux = self._aux or {}
+        self._aux.setdefault("af", {})["0"] = value
+
+    def save(
+        self,
+        upload: bool | None = None,
+        transfer: Literal["record", "annotations"] = "record",
+        **kwargs,
+    ) -> Artifact:
         """Save to database & storage.

         Args:
             upload: Trigger upload to cloud storage in instances with hybrid storage mode.
+            transfer: In case the artifact was queried on a different instance, dictates the transfer behavior.
+                If "record", only the artifact record is transferred to the current instance.
+                If "annotations", the annotations linked in the source instance are transferred as well.

-        Example::
+        See Also:
+            :doc:`transfer`

-            import lamindb as ln
+        Example:
+
+            ::
+
+                import lamindb as ln

-            artifact = ln.Artifact("./myfile.csv", key="myfile.parquet").save()
+                artifact = ln.Artifact("./myfile.csv", key="myfile.csv").save()
         """
+        if transfer not in {"record", "annotations"}:
+            raise ValueError(
+                f"transfer should be either 'record' or 'annotations', not {transfer}"
+            )
+        else:
+            kwargs["transfer"] = transfer
         state_was_adding = self._state.adding
         print_progress = kwargs.pop("print_progress", True)
         store_kwargs = kwargs.pop(
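
The new transfer argument is validated up front and then forwarded through kwargs. A hedged sketch of cross-instance usage (the instance slug and key are hypothetical):

    import lamindb as ln

    # query an artifact on another instance, then save it into the current one
    af = ln.Artifact.using("account/other-instance").get(key="data/myfile.parquet")
    af.save(transfer="annotations")  # the default, "record", transfers the record only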
@@ -2615,9 +2714,16 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             # ensure that the artifact is uploaded
             self._to_store = True

-        self._save_skip_storage(**kwargs)
+        # _is_saved_to_storage_location indicates whether the saving / upload process succeeded
+        flag_complete = hasattr(self, "_local_filepath") and getattr(
+            self, "_to_store", False
+        )
+        if flag_complete:
+            self._is_saved_to_storage_location = (
+                False  # will be updated to True at the end
+            )

-        from .save import check_and_attempt_clearing, check_and_attempt_upload
+        self._save_skip_storage(**kwargs)

         using_key = None
         if "using" in kwargs:
@@ -2645,9 +2751,17 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             using_key=using_key,
         )
         if exception_upload is not None:
-            raise RuntimeError(exception_upload)
+            raise exception_upload
         if exception_clear is not None:
-            raise RuntimeError(exception_clear)
+            raise exception_clear
+        # the saving / upload process has been successful, so mark it as such
+        # maybe some error handling here?
+        if flag_complete:
+            self._is_saved_to_storage_location = True
+            # pass kwargs here because they can contain `using` or other things
+            # affecting the connection
+            super().save(**kwargs)
+
         # this is only for keep_artifacts_local
         if local_path is not None and not state_was_adding:
             # only move the local artifact to cache if it was not newly created
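
After a successful upload, the flag set before _save_skip_storage() is flipped to True and persisted via the extra super().save(**kwargs) call. A sketch of reading the (private) flag after saving:

    import lamindb as ln

    af = ln.Artifact("./myfile.csv", key="myfile.csv").save()
    # None: no write was necessary; True: written correctly; False: write failed
    assert af._is_saved_to_storage_location in {None, True}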
@@ -2662,6 +2776,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         if hasattr(self, "_curator"):
             curator = self._curator
             delattr(self, "_curator")
+            # just annotates this artifact
             curator.save_artifact()
         return self