lamindb 1.12.1__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,7 +62,7 @@ from ..core.storage.paths import (
     filepath_cache_key_from_artifact,
     filepath_from_artifact,
 )
-from ..errors import InvalidArgument, ValidationError
+from ..errors import InvalidArgument, NoStorageLocationForSpace, ValidationError
 from ..models._is_versioned import (
     create_uid,
 )
@@ -301,6 +301,7 @@ def get_stat_or_artifact(
     check_hash: bool = True,
     is_replace: bool = False,
     instance: str | None = None,
+    skip_hash_lookup: bool = False,
 ) -> Union[tuple[int, str | None, str | None, int | None, Artifact | None], Artifact]:
     """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
     n_files = None
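
For illustration, a minimal sketch of what the new `skip_hash_lookup` flag changes at the user-facing level (file path and keys are hypothetical; per the docstring added further below, a new artifact is created even if an identical artifact already exists)::

    import lamindb as ln

    # default: ingesting identical content returns the existing artifact
    artifact1 = ln.Artifact("./data.parquet", key="examples/data.parquet").save()
    artifact2 = ln.Artifact("./data.parquet", key="examples/data.parquet").save()
    assert artifact2 == artifact1  # deduplicated via the hash lookup

    # new in 1.13.0: skip the lookup to force creation of a new artifact
    artifact3 = ln.Artifact(
        "./data.parquet", key="examples/data-copy.parquet", skip_hash_lookup=True
    ).save()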
@@ -328,31 +329,39 @@ def get_stat_or_artifact(
     if not check_hash:
         return size, hash, hash_type, n_files, None
     previous_artifact_version = None
-    if key is None or is_replace:
-        result = Artifact.objects.using(instance).filter(hash=hash).all()
-        artifact_with_same_hash_exists = len(result) > 0
+    if skip_hash_lookup:
+        artifact_with_same_hash_exists = False
+        hash_lookup_result = []
     else:
-        result = (
-            Artifact.objects.using(instance)
-            .filter(Q(hash=hash) | Q(key=key, storage=storage))
-            .order_by("-created_at")
-            .all()
-        )
-        artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
-        if not artifact_with_same_hash_exists and len(result) > 0:
+        if key is None or is_replace:
+            hash_lookup_result = Artifact.objects.using(instance).filter(
+                ~Q(branch_id=-1), hash=hash
+            )
+            artifact_with_same_hash_exists = len(hash_lookup_result) > 0
+        else:
+            hash_lookup_result = (
+                Artifact.objects.using(instance)
+                .filter(
+                    ~Q(branch_id=-1),
+                    Q(hash=hash) | Q(key=key, storage=storage),
+                )
+                .order_by("-created_at")
+            )
+            artifact_with_same_hash_exists = (
+                hash_lookup_result.filter(hash=hash).count() > 0
+            )
+    if key is not None and not is_replace:
+        if not artifact_with_same_hash_exists and len(hash_lookup_result) > 0:
             logger.important(
                 f"creating new artifact version for key='{key}' (storage: '{storage.root}')"
             )
-            previous_artifact_version = result[0]
+            previous_artifact_version = hash_lookup_result[0]
     if artifact_with_same_hash_exists:
         message = "returning existing artifact with same hash"
-        if result[0].branch_id == -1:
-            result[0].restore()
-            message = "restored artifact with same hash from trash"
         logger.important(
-            f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
+            f"{message}: {hash_lookup_result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
         )
-        return result[0]
+        return hash_lookup_result[0]
     else:
         return size, hash, hash_type, n_files, previous_artifact_version
 
@@ -407,8 +416,8 @@ def get_artifact_kwargs_from_data(
     is_replace: bool = False,
     skip_check_exists: bool = False,
     overwrite_versions: bool | None = None,
+    skip_hash_lookup: bool = False,
 ):
-    run = get_run(run)
     memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
         provisional_uid,
         data,
@@ -440,11 +449,10 @@ def get_artifact_kwargs_from_data(
         key=key,
         instance=using_key,
         is_replace=is_replace,
+        skip_hash_lookup=skip_hash_lookup,
     )
     if isinstance(stat_or_artifact, Artifact):
         existing_artifact = stat_or_artifact
-        if run is not None:
-            existing_artifact._populate_subsequent_runs(run)
         return existing_artifact, None
     else:
         size, hash, hash_type, n_files, revises = stat_or_artifact
@@ -634,11 +642,12 @@ def _check_otype_artifact(
     return otype
 
 
-def _populate_subsequent_runs_(record: Union[Artifact, Collection], run: Run):
+def populate_subsequent_run(record: Union[Artifact, Collection], run: Run):
     if record.run is None:
         record.run = run
     elif record.run != run:
         record._subsequent_runs.add(run)
+        record._subsequent_run_id = run.id
 
 
 # also see current_run() in core._data
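
The renamed helper now also stamps the in-memory record with `_subsequent_run_id`, which the reworked `track_run_input` further below uses to avoid treating a record that was just re-created by a run as an input of that same run. A sketch of the intended interplay, assuming a tracked script and previously ingested content::

    import lamindb as ln

    ln.track()  # sets the current run

    # the content hash matches an existing artifact, so it is returned;
    # populate_subsequent_run() links the current run as a subsequent run
    # and records run.id on the object as _subsequent_run_id
    artifact = ln.Artifact("./data.parquet", key="examples/data.parquet").save()

    # not tracked as an input of this run: run.id == artifact._subsequent_run_id
    df = artifact.load()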
@@ -698,29 +707,6 @@ def save_schema_links(self: Artifact) -> None:
     bulk_create(links, ignore_conflicts=True)
 
 
-# can restore later if needed
-# def format_provenance(self, fk_data, print_types):
-#     type_str = lambda attr: (
-#         f": {get_related_model(self.__class__, attr).__name__}" if print_types else ""
-#     )
-
-#     return "".join(
-#         [
-#             f"  .{field_name}{type_str(field_name)} = {format_field_value(value.get('name'))}\n"
-#             for field_name, value in fk_data.items()
-#             if value.get("name")
-#         ]
-#     )
-
-# can restore later if needed
-# def format_input_of_runs(self, print_types):
-#     if self.id is not None and self.input_of_runs.exists():
-#         values = [format_field_value(i.started_at) for i in self.input_of_runs.all()]
-#         type_str = ": Run" if print_types else ""  # type: ignore
-#         return f"  .input_of_runs{type_str} = {', '.join(values)}\n"
-#     return ""
-
-
 def _describe_postgres(self):  # for Artifact & Collection
     from ._describe import (
         describe_artifact_general,
@@ -963,7 +949,7 @@ def add_labels(
     else:
         validate_feature(feature, records)  # type:ignore
         records_by_registry = defaultdict(list)
-        feature_sets = self.feature_sets.filter(itype="Feature").all()
+        feature_sets = self.feature_sets.filter(itype="Feature")
         internal_features = set()  # type: ignore
         if len(feature_sets) > 0:
             for schema in feature_sets:
@@ -1136,6 +1122,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         space: `Space | None = None` The space of the artifact. If `None`, uses the current space.
         storage: `Storage | None = None` The storage location for the artifact. If `None`, uses the default storage location.
             You can see and set the default storage location in :attr:`~lamindb.core.Settings.storage`.
+        schema: A schema that defines how to validate & annotate.
+        features: Additional external features to link.
+        skip_hash_lookup: Skip the hash lookup so that a new artifact is created even if an identical artifact already exists.
 
     Examples:
 
@@ -1326,7 +1315,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
     Similarly, you query based on these accessors::
 
-        ln.Artifact.filter(ulabels__name="Experiment 1").all()
+        ln.Artifact.filter(ulabels__name="Experiment 1")
 
     Unlike the registry-specific accessors, the `.labels` accessor provides
     a way of associating labels with features::
@@ -1495,12 +1484,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
     def __init__(
         self,
         # we're not choosing the name "path" for this arg because
-        # it'd be confusing with `artifact.path`, which is not the same
-        # so "data" conveys better that this is input data that's ingested
+        # it could be confused with `artifact.path`
+        # "data" conveys better that this is input data that's ingested
         # and will be moved to a target path at `artifact.path`
-        # also internally, we sometimes pass "data objects" like a DataFrame
-        # here; and we might refactor this but we might also keep that internal
-        # usage
         data: UPathStr,
         kind: ArtifactKind | str | None = None,
         key: str | None = None,
@@ -1511,6 +1497,9 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         storage: Storage | None = None,
         branch: Branch | None = None,
         space: Space | None = None,
+        schema: Schema | None = None,
+        features: dict[str, Any] | None = None,
+        skip_hash_lookup: bool = False,
     ): ...
 
     @overload
@@ -1542,48 +1531,23 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         revises: Artifact | None = kwargs.pop("revises", None)
         overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None)
         version: str | None = kwargs.pop("version", None)
-
-        features: dict[str, Any] = kwargs.pop("features", None)
         schema: Schema | None = kwargs.pop("schema", None)
-        if features is not None and schema is not None:
-            from lamindb.curators import DataFrameCurator
-
-            temp_df = pd.DataFrame([features])
-            validation_schema = schema
-            if schema.itype == "Composite" and schema.slots:
-                if len(schema.slots) > 1:
-                    raise ValueError(
-                        f"Composite schema has {len(schema.slots)} slots. "
-                        "External feature validation only supports schemas with a single slot."
-                    )
-                try:
-                    validation_schema = next(
-                        k for k in schema.slots.keys() if k.startswith("__external")
-                    )
-                except StopIteration:
-                    raise ValueError(
-                        "External feature validation requires a slot that starts with __external."
-                    ) from None
-
-            external_curator = DataFrameCurator(temp_df, validation_schema)
-            external_curator.validate()
-            external_curator._artifact = self
-
-            self._external_features = features
-
-        branch_id: int | None = None
-        if "visibility" in kwargs:  # backward compat
-            branch_id = kwargs.pop("visibility")
-        if "_branch_code" in kwargs:  # backward compat
-            branch_id = kwargs.pop("_branch_code")
-        elif "branch_id" in kwargs:
-            branch_id = kwargs.pop("branch_id")
-        else:
-            branch_id = 1
-        branch = kwargs.pop("branch", None)
+        features: dict[str, Any] | None = kwargs.pop("features", None)
+        skip_hash_lookup: bool = kwargs.pop("skip_hash_lookup", False)
+
+        # validate external features if passed with a schema
+        if features is not None:
+            self._external_features = features
+            if schema is not None:
+                from lamindb.curators.core import ExperimentalDictCurator
 
+                validation_schema = schema
+                ExperimentalDictCurator(features, validation_schema).validate()
+
+        branch = kwargs.pop("branch", None)
+        assert "branch_id" not in kwargs, "Please pass branch instead of branch_id."  # noqa: S101
         space = kwargs.pop("space", None)
-        assert "space_id" not in kwargs, "please pass space instead"  # noqa: S101
+        assert "space_id" not in kwargs, "Please pass space instead of space_id."  # noqa: S101
         format = kwargs.pop("format", None)
         _is_internal_call = kwargs.pop("_is_internal_call", False)
         skip_check_exists = kwargs.pop("skip_check_exists", False)
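
With this change, `Artifact.__init__` accepts `features` (linked on `save()`) and validates them eagerly against `schema` via the new `ExperimentalDictCurator`. A rough sketch, assuming the feature is registered beforehand and that a flexible feature-based schema suffices (names and schema setup are illustrative)::

    import lamindb as ln

    ln.Feature(name="species", dtype=str).save()
    schema = ln.Schema(itype=ln.Feature, flexible=True).save()  # assumed schema setup

    artifact = ln.Artifact(
        "./raw/image_0001.tiff",
        key="examples/image_0001.tiff",
        schema=schema,                  # validates the dict below at init time
        features={"species": "human"},  # linked to the artifact on save()
    ).save()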
@@ -1611,11 +1575,19 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                     "storage argument ignored as storage information from space takes precedence"
                 )
             storage_locs_for_space = Storage.filter(space=space)
-            storage = storage_locs_for_space.first()
-            if len(storage_locs_for_space) > 1:
-                logger.warning(
-                    f"more than one storage location for space {space}, choosing {storage}"
+            n_storage_locs_for_space = len(storage_locs_for_space)
+            if n_storage_locs_for_space == 0:
+                raise NoStorageLocationForSpace(
+                    "No storage location found for space.\n"
+                    "Either create one via ln.Storage(root='create-s3', space=space).save()\n"
+                    "Or start managing access to an existing storage location via the space: storage_loc.space = space; storage.save()"
                 )
+            else:
+                storage = storage_locs_for_space.first()
+                if n_storage_locs_for_space > 1:
+                    logger.warning(
+                        f"more than one storage location for space {space}, choosing {storage}"
+                    )
         otype = kwargs.pop("otype") if "otype" in kwargs else None
         if isinstance(data, str) and data.startswith("s3:///"):
             # issue in Groovy / nf-lamin producing malformed S3 paths
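
A `space` without any attached storage location now raises the new `NoStorageLocationForSpace` error instead of proceeding with `storage = None`. Per the error message, either of the following resolves it (roots and names are illustrative)::

    import lamindb as ln

    space = ln.Space.get(name="our-team")  # hypothetical existing space

    # option 1: create a storage location that is managed via the space
    ln.Storage(root="s3://our-bucket/our-team", space=space).save()

    # option 2: let the space manage an existing storage location
    storage_loc = ln.Storage.get(root="s3://existing-bucket")
    storage_loc.space = space
    storage_loc.save()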
@@ -1658,6 +1630,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             is_automanaged_path = False
 
         provisional_uid, revises = create_uid(revises=revises, version=version)
+        run = get_run(run)
         kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
             data=data,
             key=key,
@@ -1669,6 +1642,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             using_key=using_key,
             skip_check_exists=skip_check_exists,
             overwrite_versions=overwrite_versions,
+            skip_hash_lookup=skip_hash_lookup,
         )
 
         # an object with the same hash already exists
@@ -1685,6 +1659,8 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                     f"key {self.key} on existing artifact differs from passed key {key}"
                 )
             update_attributes(self, attr_to_update)
+            if run is not None:
+                populate_subsequent_run(self, run)
             return None
         else:
             kwargs = kwargs_or_artifact
@@ -1724,7 +1700,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         kwargs["version"] = version
         kwargs["description"] = description
         kwargs["branch"] = branch
-        kwargs["branch_id"] = branch_id
         kwargs["space"] = space
         kwargs["otype"] = otype
         kwargs["revises"] = revises
@@ -1739,43 +1714,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
         super().__init__(**kwargs)
 
-    @classmethod
-    def from_lazy(
-        cls,
-        suffix: str,
-        overwrite_versions: bool,
-        key: str | None = None,
-        description: str | None = None,
-        run: Run | None = None,
-        **kwargs,
-    ) -> LazyArtifact:
-        """Create a lazy artifact for streaming to auto-generated internal paths.
-
-        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
-        and register the path as an artifact.
-
-        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
-        on `.save()` with the provided arguments.
-
-        Args:
-            suffix: The suffix for the auto-generated internal path
-            overwrite_versions: Whether to overwrite versions.
-            key: An optional key to reference the artifact.
-            description: A description.
-            run: The run that creates the artifact.
-            **kwargs: Other keyword arguments for the artifact to be created.
-
-        Examples:
-
-            Create a lazy artifact, write to the path and save to get a real artifact::
-
-                lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
-                zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
-                artifact = lazy.save()
-        """
-        args = {"key": key, "description": description, "run": run, **kwargs}
-        return LazyArtifact(suffix, overwrite_versions, **args)
-
     @property
     @deprecated("kind")
     def type(self) -> str:
@@ -1876,8 +1814,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             artifact = ln.Arfifact.get(key="examples/my_file.parquet")
             artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad")
         """
-        from .query_set import QuerySet
-
         return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions)
 
     @classmethod
@@ -1909,6 +1845,43 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         # from Registry metaclass
         return type(cls).filter(cls, *queries, **expressions)
 
+    @classmethod
+    def from_lazy(
+        cls,
+        suffix: str,
+        overwrite_versions: bool,
+        key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        **kwargs,
+    ) -> LazyArtifact:
+        """Create a lazy artifact for streaming to auto-generated internal paths.
+
+        This is needed when it is desirable to stream to a `lamindb` auto-generated internal path
+        and register the path as an artifact.
+
+        The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact
+        on `.save()` with the provided arguments.
+
+        Args:
+            suffix: The suffix for the auto-generated internal path
+            overwrite_versions: Whether to overwrite versions.
+            key: An optional key to reference the artifact.
+            description: A description.
+            run: The run that creates the artifact.
+            **kwargs: Other keyword arguments for the artifact to be created.
+
+        Examples:
+
+            Create a lazy artifact, write to the path and save to get a real artifact::
+
+                lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr")
+                zarr.open(lazy.path, mode="w")["test"] = np.array(["test"])  # stream to the path
+                artifact = lazy.save()
+        """
+        args = {"key": key, "description": description, "run": run, **kwargs}
+        return LazyArtifact(suffix, overwrite_versions, **args)
+
     @classmethod
     def from_dataframe(
         cls,
@@ -1932,21 +1905,15 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             revises: An old version of the artifact.
             run: The run that creates the artifact.
             schema: A schema that defines how to validate & annotate.
-            features: External features dict for additional annotation.
+            features: Additional external features to link.
 
-        See Also:
-            :meth:`~lamindb.Collection`
-                Track collections.
-            :class:`~lamindb.Feature`
-                Track features.
-
-        Example:
+        Examples:
 
             No validation and annotation::
 
                 import lamindb as ln
 
-                df = ln.core.datasets.mini_immuno.get_dataset1()
+                df = ln.examples.datasets.mini_immuno.get_dataset1()
                 artifact = ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save()
 
             With validation and annotation.
@@ -1980,9 +1947,10 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             **kwargs,
         )
         artifact.n_observations = len(df)
-
+        if features is not None:
+            artifact._external_features = features
         if schema is not None:
-            from lamindb.curators.core import ComponentCurator
+            from lamindb.curators.core import DataFrameCurator, ExperimentalDictCurator
 
             if not artifact._state.adding and artifact.suffix != ".parquet":
                 logger.warning(
@@ -1991,31 +1959,14 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
                 )
                 return artifact
 
-            # Handle external features validation for Composite schemas
-            if schema.itype == "Composite" and features is not None:
-                try:
-                    external_slot = next(
-                        k for k in schema.slots.keys() if "__external__" in k
-                    )
-                    validation_schema = schema.slots[external_slot]
-                except StopIteration:
-                    raise ValueError(
-                        "External feature validation requires a slot __external__."
-                    ) from None
-
-                external_curator = ComponentCurator(
-                    pd.DataFrame([features]), validation_schema
-                )
-                external_curator.validate()
-                artifact._external_features = features
-
-            # Validate main DataFrame if not Composite or if Composite has attrs
-            if schema.itype != "Composite" or "attrs" in schema.slots:
-                curator = ComponentCurator(artifact, schema)
-                curator.validate()
-                artifact.schema = schema
-                artifact._curator = curator
+            if features is not None and "__external__" in schema.slots:
+                validation_schema = schema.slots["__external__"]
+                ExperimentalDictCurator(features, validation_schema).validate()
 
+            curator = DataFrameCurator(artifact, schema)
+            curator.validate()
+            artifact.schema = schema
+            artifact._curator = curator
         return artifact
 
     @classmethod
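
`from_dataframe` now looks for external features under a literal `__external__` slot and always validates the main dataframe with `DataFrameCurator`, dropping the Composite-specific branching. A rough sketch under the assumption that a schema with such a slot already exists (the schema lookup and feature name are illustrative, not the exact setup API)::

    import lamindb as ln

    schema = ln.Schema.get(name="dataset1-with-external")  # assumed schema with an __external__ slot
    assert "__external__" in schema.slots

    df = ln.examples.datasets.mini_immuno.get_dataset1()
    artifact = ln.Artifact.from_dataframe(
        df,
        key="examples/dataset1.parquet",
        schema=schema,
        features={"experiment_id": "EXP-0001"},  # validated against the __external__ slot
    ).save()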
@@ -2076,7 +2027,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             import lamindb as ln
 
-            adata = ln.core.datasets.anndata_with_obs()
+            adata = ln.examples.datasets.anndata_with_obs()
             artifact = ln.Artifact.from_anndata(adata, key="mini_anndata_with_obs.h5ad").save()
 
         With validation and annotation.
@@ -2164,7 +2115,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             import lamindb as ln
 
-            mdata = ln.core.datasets.mudata_papalexi21_subset()
+            mdata = ln.examples.datasets.mudata_papalexi21_subset()
             artifact = ln.Artifact.from_mudata(mdata, key="mudata_papalexi21_subset.h5mu").save()
         """
         if not data_is_scversedatastructure(mdata, "MuData"):
@@ -2335,7 +2286,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
 
             import lamindb as ln
 
-            dir_path = ln.core.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
+            dir_path = ln.examples.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
             artifacts = ln.Artifact.from_dir(dir_path)
             ln.save(artifacts)
         """
@@ -2447,6 +2398,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         However, it will update the suffix if it changes.
         """
         storage = settings.storage.record
+        run = get_run(run)
         kwargs, privates = get_artifact_kwargs_from_data(
             provisional_uid=self.uid,
             data=data,
2690
2642
 
2691
2643
  access = _track_writes_factory(access, finalize)
2692
2644
  # only call if open is successfull
2693
- _track_run_input(self, is_run_input)
2645
+ track_run_input(self, is_run_input)
2694
2646
  return access
2695
2647
 
2696
2648
  def load(
@@ -2769,7 +2721,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         )
         access_memory = load_to_memory(cache_path, **kwargs)
         # only call if load is successfull
-        _track_run_input(self, is_run_input)
+        track_run_input(self, is_run_input)
 
         return access_memory
 
@@ -2804,7 +2756,7 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             filepath, cache_key=cache_key, **kwargs
         )
         # only call if sync is successfull
-        _track_run_input(self, is_run_input)
+        track_run_input(self, is_run_input)
         return cache_path
 
     def delete(
@@ -2969,13 +2921,13 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
             )
             logger.important(f"moved local artifact to cache: {local_path_cache}")
 
-        # Handle external features
-        if hasattr(self, "_external_features") and self._external_features is not None:
+        # annotate with external features
+        if hasattr(self, "_external_features"):
             external_features = self._external_features
             delattr(self, "_external_features")
             self.features.add_values(external_features)
 
-        # annotate Artifact
+        # annotate with internal features based on curator
         if hasattr(self, "_curator"):
             curator = self._curator
             delattr(self, "_curator")
@@ -3002,9 +2954,6 @@ class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
         """
         return describe_artifact_collection(self, return_str=return_str)
 
-    def _populate_subsequent_runs(self, run: Run) -> None:
-        _populate_subsequent_runs_(self, run)
-
 
     # can't really just call .cache in .load because of double tracking
     def _synchronize_cleanup_on_error(
@@ -3068,15 +3017,20 @@ class ArtifactUser(BaseSQLRecord, IsLink, TracksRun):
         unique_together = ("artifact", "user", "feature")
 
 
-def _track_run_input(
-    data: (
+def track_run_input(
+    record: (
         Artifact | Iterable[Artifact]
     ),  # can also be Collection | Iterable[Collection]
     is_run_input: bool | Run | None = None,
    run: Run | None = None,
-):
+) -> None:
+    """Links a record as an input to a run.
+
+    This function contains all validation logic to make decisions on whether a
+    record qualifies as an input or not.
+    """
     if is_run_input is False:
-        return
+        return None
 
     from .._tracked import get_current_tracked_run
     from ..core._context import context
@@ -3089,133 +3043,138 @@ def _track_run_input(
         run = get_current_tracked_run()
         if run is None:
             run = context.run
-    # consider that data is an iterable of Data
-    data_iter: Iterable[Artifact] | Iterable[Collection] = (
-        [data] if isinstance(data, (Artifact, Collection)) else data
+    # consider that record is an iterable of Data
+    record_iter: Iterable[Artifact] | Iterable[Collection] = (
+        [record] if isinstance(record, (Artifact, Collection)) else record
     )
-    track_run_input = False
-    input_data = []
+    input_records = []
     if run is not None:
-        # avoid cycles: data can't be both input and output
-        def is_valid_input(data: Artifact | Collection):
+        assert not run._state.adding, "Save the run before tracking its inputs."  # noqa: S101
+
+        def is_valid_input(record: Artifact | Collection):
             is_valid = False
-            if data._state.db == "default":
+            # if a record is not yet saved it has record._state.db = None
+            # then it can't be an input
+            # we silently ignore because what will happen is that
+            # the record either gets saved and then is tracked as an output
+            # or it won't get saved at all
+            if record._state.db == "default":
                 # things are OK if the record is on the default db
                 is_valid = True
-            elif data._state.db is None:
-                # if a record is not yet saved, it can't be an input
-                # we silently ignore because what likely happens is that
-                # the user works with an object that's about to be saved
-                # in the current Python session
-                is_valid = False
             else:
                 # record is on another db
                 # we have to save the record into the current db with
                 # the run being attached to a transfer transform
                 logger.info(
-                    f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}...') as input"
+                    f"completing transfer to track {record.__class__.__name__}('{record.uid}') as input"
                 )
-                data.save()
+                record.save()
                 is_valid = True
-            data_run_id, run_id = data.run_id, run.id
-            different_runs = (data_run_id != run_id) or (
-                data_run_id is None and run_id is None
-            )
-            return (
-                different_runs
-                and not data._state.adding  # this seems duplicated with data._state.db is None
-                and is_valid
-            )
+            # avoid cycles: record can't be both input and output
+            if record.run_id == run.id:
+                logger.debug(
+                    f"not tracking {record} as input to run {run} because created by same run"
+                )
+                is_valid = False
+            if run.id == getattr(record, "_subsequent_run_id", None):
+                logger.debug(
+                    f"not tracking {record} as input to run {run} because re-created in same run"
+                )
+                is_valid = False
+            return is_valid
 
-        input_data = [data for data in data_iter if is_valid_input(data)]
-        input_data_ids = [data.id for data in input_data]
-    if input_data:
-        data_class_name = input_data[0].__class__.__name__.lower()
+        input_records = [record for record in record_iter if is_valid_input(record)]
+        input_records_ids = [record.id for record in input_records]
+    if input_records:
+        record_class_name = input_records[0].__class__.__name__.lower()
     # let us first look at the case in which the user does not
     # provide a boolean value for `is_run_input`
     # hence, we need to determine whether we actually want to
     # track a run or not
-    if is_run_input is None:
-        # we don't have a run record
+    track = False
+    is_run_input = settings.track_run_inputs if is_run_input is None else is_run_input
+    if is_run_input:
         if run is None:
-            if settings.track_run_inputs:
-                if not is_read_only_connection():
-                    logger.warning(WARNING_NO_INPUT)
-        # assume we have a run record
-        else:
-            # assume there is non-cyclic candidate input data
-            if input_data:
-                if settings.track_run_inputs:
-                    transform_note = ""
-                    if len(input_data) == 1:
-                        if input_data[0].transform is not None:
-                            transform_note = (
-                                ", adding parent transform"
-                                f" {input_data[0].transform.id}"
-                            )
-                    logger.info(
-                        f"adding {data_class_name} ids {input_data_ids} as inputs for run"
-                        f" {run.id}{transform_note}"
-                    )
-                    track_run_input = True
-                else:
-                    logger.hint(
-                        "track these data as a run input by passing `is_run_input=True`"
-                    )
+            if not is_read_only_connection():
+                logger.warning(WARNING_NO_INPUT)
+        elif input_records:
+            logger.debug(
+                f"adding {record_class_name} ids {input_records_ids} as inputs for run {run.id}"
+            )
+            track = True
     else:
-        track_run_input = is_run_input
-    if track_run_input:
-        if run is None:
-            raise ValueError("No run context set. Call `ln.track()`.")
-        if run._state.adding:
-            # avoid adding the same run twice
-            run.save()
-        if data_class_name == "artifact":
-            IsLink = run.input_artifacts.through
-            links = [
-                IsLink(run_id=run.id, artifact_id=data_id) for data_id in input_data_ids
-            ]
-        else:
-            IsLink = run.input_collections.through
-            links = [
-                IsLink(run_id=run.id, collection_id=data_id)
-                for data_id in input_data_ids
-            ]
-        try:
-            IsLink.objects.bulk_create(links, ignore_conflicts=True)
-        except ProgrammingError as e:
-            if "new row violates row-level security policy" in str(e):
-                instance = setup_settings.instance
-                available_spaces = instance.available_spaces
-                if available_spaces is None:
-                    raise NoWriteAccess(
-                        f"You’re not allowed to write to the instance {instance.slug}.\n"
-                        "Please contact administrators of the instance if you need write access."
-                    ) from None
-                write_access_spaces = (
-                    available_spaces["admin"] + available_spaces["write"]
-                )
-                no_write_access_spaces = {
-                    data_space
-                    for data in input_data
-                    if (data_space := data.space) not in write_access_spaces
-                }
-                if (run_space := run.space) not in write_access_spaces:
-                    no_write_access_spaces.add(run_space)
-                if len(no_write_access_spaces) > 1:
-                    name_msg = ", ".join(
-                        f"'{space.name}'" for space in no_write_access_spaces
-                    )
-                    space_msg = "spaces"
-                else:
-                    name_msg = f"'{no_write_access_spaces.pop().name}'"
-                    space_msg = "space"
+        track = is_run_input
+    if not track or not input_records:
+        return None
+    if run is None:
+        raise ValueError("No run context set. Call `ln.track()`.")
+    if record_class_name == "artifact":
+        IsLink = run.input_artifacts.through
+        links = [
+            IsLink(run_id=run.id, artifact_id=record_id)
+            for record_id in input_records_ids
+        ]
+    else:
+        IsLink = run.input_collections.through
+        links = [
+            IsLink(run_id=run.id, collection_id=record_id)
+            for record_id in input_records_ids
+        ]
+    try:
+        IsLink.objects.bulk_create(links, ignore_conflicts=True)
+    except ProgrammingError as e:
+        if "new row violates row-level security policy" in str(e):
+            instance = setup_settings.instance
+            available_spaces = instance.available_spaces
+            if available_spaces is None:
                 raise NoWriteAccess(
-                    f"You’re not allowed to write to the {space_msg} {name_msg}.\n"
-                    f"Please contact administrators of the {space_msg} if you need write access."
+                    f"You’re not allowed to write to the instance {instance.slug}.\n"
+                    "Please contact administrators of the instance if you need write access."
                 ) from None
+            write_access_spaces = available_spaces["admin"] + available_spaces["write"]
+            no_write_access_spaces = {
+                record_space
+                for record in input_records
+                if (record_space := record.space) not in write_access_spaces
+            }
+            if (run_space := run.space) not in write_access_spaces:
+                no_write_access_spaces.add(run_space)
+
+            if not no_write_access_spaces:
+                # if there are no unavailable spaces, then this should be due to locking
+                locked_records = [
+                    record
+                    for record in input_records
+                    if getattr(record, "is_locked", False)
+                ]
+                if run.is_locked:
+                    locked_records.append(run)
+                # if no unavailable spaces and no locked records, just raise the original error
+                if not locked_records:
+                    raise e
+                no_write_msg = (
+                    "It is not allowed to modify locked records: "
+                    + ", ".join(
+                        r.__class__.__name__ + f"(uid={r.uid})" for r in locked_records
+                    )
+                    + "."
+                )
+                raise NoWriteAccess(no_write_msg) from None
+
+            if len(no_write_access_spaces) > 1:
+                name_msg = ", ".join(
+                    f"'{space.name}'" for space in no_write_access_spaces
+                )
+                space_msg = "spaces"
             else:
-                raise e
+                name_msg = f"'{no_write_access_spaces.pop().name}'"
+                space_msg = "space"
+            raise NoWriteAccess(
+                f"You’re not allowed to write to the {space_msg} {name_msg}.\n"
+                f"Please contact administrators of the {space_msg} if you need write access."
+            ) from None
+        else:
+            raise e
 
 
 # privates currently dealt with separately
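
The decision logic in the rewritten `track_run_input` reduces to: an explicit `is_run_input` wins, otherwise `settings.track_run_inputs` decides, and candidate records created or re-created by the same run are always skipped. A usage sketch, assuming previously saved artifacts and a configured instance::

    import lamindb as ln

    ln.track()  # sets the run context

    artifact = ln.Artifact.get(key="examples/dataset1.parquet")
    df = artifact.load()  # input tracking decided by settings.track_run_inputs

    # override the default decision per call
    path = artifact.cache(is_run_input=True)   # force input tracking
    path = artifact.cache(is_run_input=False)  # never tracked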