lsst-daf-butler 29.2025.4100__py3-none-any.whl → 29.2025.4300__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. lsst/daf/butler/_dataset_association.py +28 -3
  2. lsst/daf/butler/_registry_shim.py +76 -4
  3. lsst/daf/butler/cli/cmd/_remove_collections.py +15 -0
  4. lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
  5. lsst/daf/butler/configs/storageClasses.yaml +5 -0
  6. lsst/daf/butler/datastores/fileDatastore.py +2 -2
  7. lsst/daf/butler/direct_butler/_direct_butler.py +3 -3
  8. lsst/daf/butler/json.py +2 -2
  9. lsst/daf/butler/queries/_expression_strings.py +1 -1
  10. lsst/daf/butler/registry/_registry.py +39 -20
  11. lsst/daf/butler/registry/_registry_base.py +13 -5
  12. lsst/daf/butler/registry/datasets/byDimensions/_manager.py +65 -16
  13. lsst/daf/butler/registry/interfaces/_datasets.py +21 -1
  14. lsst/daf/butler/registry/queries/_query_common.py +3 -0
  15. lsst/daf/butler/registry/queries/_query_datasets.py +7 -1
  16. lsst/daf/butler/registry/sql_registry.py +41 -187
  17. lsst/daf/butler/registry/tests/_registry.py +120 -6
  18. lsst/daf/butler/remote_butler/_factory.py +2 -2
  19. lsst/daf/butler/remote_butler/_registry.py +4 -0
  20. lsst/daf/butler/remote_butler/_remote_butler.py +3 -1
  21. lsst/daf/butler/remote_butler/authentication/cadc.py +63 -11
  22. lsst/daf/butler/script/removeCollections.py +46 -13
  23. lsst/daf/butler/tests/butler_queries.py +40 -1
  24. lsst/daf/butler/tests/hybrid_butler_registry.py +5 -4
  25. lsst/daf/butler/tests/server_available.py +53 -0
  26. lsst/daf/butler/timespan_database_representation.py +8 -0
  27. lsst/daf/butler/transfers/_context.py +5 -16
  28. lsst/daf/butler/version.py +1 -1
  29. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/METADATA +1 -1
  30. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/RECORD +38 -37
  31. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/WHEEL +0 -0
  32. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/entry_points.txt +0 -0
  33. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/licenses/COPYRIGHT +0 -0
  34. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/licenses/LICENSE +0 -0
  35. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/licenses/bsd_license.txt +0 -0
  36. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/licenses/gpl-v3.0.txt +0 -0
  37. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/top_level.txt +0 -0
  38. {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/zip-safe +0 -0
lsst/daf/butler/_dataset_association.py CHANGED
@@ -29,15 +29,17 @@ from __future__ import annotations
 
 __all__ = ("DatasetAssociation",)
 
-from collections.abc import Iterator
+from collections.abc import Iterator, Mapping
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
+from ._collection_type import CollectionType
 from ._dataset_ref import DatasetRef
 from ._dataset_type import DatasetType
 from ._timespan import Timespan
 
 if TYPE_CHECKING:
+    from ._butler_collections import CollectionInfo
     from .queries._general_query_results import GeneralQueryResults
 
 
@@ -66,7 +68,10 @@ class DatasetAssociation:
 
     @classmethod
     def from_query_result(
-        cls, result: GeneralQueryResults, dataset_type: DatasetType
+        cls,
+        result: GeneralQueryResults,
+        dataset_type: DatasetType,
+        collection_info: Mapping[str, CollectionInfo] | None = None,
     ) -> Iterator[DatasetAssociation]:
         """Construct dataset associations from the result of general query.
 
@@ -79,11 +84,31 @@
            "timespan" dataset fields for ``dataset_type``.
        dataset_type : `DatasetType`
            Dataset type, query has to include this dataset type.
+        collection_info : `~collections.abc.Mapping` \
+                [`str`, `CollectionInfo`], optional
+            Mapping from collection name to information about it for all
+            collections that may appear in the query results. If not provided,
+            timespans for `~CollectionType.RUN` and `~CollectionType.TAGGED`
+            collections will be unbounded, instead of `None`; this is actually
+            more consistent with how those timespans are used elsewhere in the
+            query system, but is a change from how `DatasetAssociation` has
+            historically worked.
        """
        timespan_key = f"{dataset_type.name}.timespan"
        collection_key = f"{dataset_type.name}.collection"
        for _, refs, row_dict in result.iter_tuples(dataset_type):
-            yield DatasetAssociation(refs[0], row_dict[collection_key], row_dict[timespan_key])
+            collection = row_dict[collection_key]
+            timespan = row_dict[timespan_key]
+            if (
+                collection_info is not None
+                and collection_info[collection].type is not CollectionType.CALIBRATION
+            ):
+                # This behavior is for backwards compatibility only; in most
+                # contexts it makes sense to consider the timespan of a RUN
+                # or TAGGED collection to be unbounded, not None, and that's
+                # what the query results we're iterating over do.
+                timespan = None
+            yield DatasetAssociation(refs[0], collection, timespan)
 
     def __lt__(self, other: Any) -> bool:
         # Allow sorting of associations
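
For reference, a minimal sketch of how the new collection_info argument might be supplied, mirroring the call pattern that lsst/daf/butler/registry/_registry_base.py adopts further down in this diff; the repository path, dataset type name, and collection pattern are placeholders:

# Hypothetical repo and names; the query calls follow the pattern shown elsewhere in this diff.
from lsst.daf.butler import Butler, DatasetAssociation

butler = Butler("/repo")
dataset_type = butler.get_dataset_type("bias")
collections = butler.collections.query("calib/*")
collection_info = {info.name: info for info in butler.collections.query_info(collections)}
with butler.query() as query:
    query = query.join_dataset_search(dataset_type, collections)
    result = query.general(
        dataset_type.dimensions,
        dataset_fields={dataset_type.name: {"dataset_id", "run", "collection", "timespan"}},
        find_first=False,
    )
    # With collection_info supplied, RUN/TAGGED rows keep the historical timespan=None.
    associations = list(DatasetAssociation.from_query_result(result, dataset_type, collection_info))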
lsst/daf/butler/_registry_shim.py CHANGED
@@ -36,6 +36,7 @@ from typing import TYPE_CHECKING, Any
 from ._collection_type import CollectionType
 from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
 from ._dataset_type import DatasetType
+from ._exceptions import CalibrationLookupError
 from ._storage_class import StorageClassFactory
 from ._timespan import Timespan
 from .dimensions import (
@@ -48,7 +49,9 @@ from .dimensions import (
 )
 from .registry._collection_summary import CollectionSummary
 from .registry._defaults import RegistryDefaults
+from .registry._exceptions import NoDefaultCollectionError
 from .registry._registry_base import RegistryBase
+from .registry.queries._query_common import resolve_collections
 
 if TYPE_CHECKING:
     from .direct_butler import DirectButler
@@ -182,13 +185,76 @@ class RegistryShim(RegistryBase):
         *,
         collections: CollectionArgType | None = None,
         timespan: Timespan | None = None,
+        datastore_records: bool = False,
         **kwargs: Any,
     ) -> DatasetRef | None:
         # Docstring inherited from a base class.
-        return self._registry.findDataset(
-            datasetType, dataId, collections=collections, timespan=timespan, **kwargs
+        if not isinstance(datasetType, DatasetType):
+            datasetType = self.getDatasetType(datasetType)
+
+        dataId = DataCoordinate.standardize(
+            dataId,
+            dimensions=datasetType.dimensions,
+            universe=self.dimensions,
+            defaults=self.defaults.dataId,
+            **kwargs,
         )
 
+        with self._butler.query() as query:
+            resolved_collections = resolve_collections(self._butler, collections)
+            if not resolved_collections:
+                if collections is None:
+                    raise NoDefaultCollectionError("No collections provided, and no default collections set")
+                else:
+                    return None
+
+            if datasetType.isCalibration() and timespan is None:
+                # Filter out calibration collections, because with no timespan
+                # we have no way of selecting a dataset from them.
+                collection_info = self._butler.collections.query_info(
+                    resolved_collections, flatten_chains=True
+                )
+                resolved_collections = [
+                    info.name for info in collection_info if info.type != CollectionType.CALIBRATION
+                ]
+                if not resolved_collections:
+                    return None
+
+            result = query.datasets(datasetType, resolved_collections, find_first=True).limit(2)
+            dataset_type_name = result.dataset_type.name
+            # Search only on the 'required' dimensions for the dataset type.
+            # Any extra values provided by the user are ignored.
+            minimal_data_id = DataCoordinate.standardize(
+                dataId.subset(datasetType.dimensions.required).required, universe=self.dimensions
+            )
+            result = result.where(minimal_data_id)
+            if (
+                datasetType.isCalibration()
+                and timespan is not None
+                and (timespan.begin is not None or timespan.end is not None)
+            ):
+                timespan_column = query.expression_factory[dataset_type_name].timespan
+                result = result.where(timespan_column.overlaps(timespan))
+
+            datasets = list(result)
+            if len(datasets) == 1:
+                ref = datasets[0]
+                if dataId.hasRecords():
+                    ref = ref.expanded(dataId)
+                # Propagate storage class from user-provided DatasetType, which
+                # may not match the definition in the database.
+                ref = ref.overrideStorageClass(datasetType.storageClass_name)
+                if datastore_records:
+                    ref = self._registry.get_datastore_records(ref)
+                return ref
+            elif len(datasets) == 0:
+                return None
+            else:
+                raise CalibrationLookupError(
+                    f"Ambiguous calibration lookup for {datasetType} with timespan {timespan}"
+                    f" in collections {resolved_collections}."
+                )
+
    def insertDatasets(
        self,
        datasetType: DatasetType | str,
@@ -200,14 +266,20 @@
         # Docstring inherited from a base class.
         return self._registry.insertDatasets(datasetType, dataIds, run, expand, idGenerationMode)
 
-    def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True) -> list[DatasetRef]:
+    def _importDatasets(
+        self, datasets: Iterable[DatasetRef], expand: bool = True, assume_new: bool = False
+    ) -> list[DatasetRef]:
         # Docstring inherited from a base class.
-        return self._registry._importDatasets(datasets, expand)
+        return self._registry._importDatasets(datasets, expand, assume_new)
 
     def getDataset(self, id: DatasetId) -> DatasetRef | None:
         # Docstring inherited from a base class.
         return self._registry.getDataset(id)
 
+    def _fetch_run_dataset_ids(self, run: str) -> list[DatasetId]:
+        # Docstring inherited.
+        return self._registry._fetch_run_dataset_ids(run)
+
     def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
         # Docstring inherited from a base class.
         self._registry.removeDatasets(refs)
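
A hedged sketch of the user-facing call this reimplementation serves; the repository path, dataset type, data ID values, and collection name are placeholders. Per the hunk above, calibration lookups additionally accept timespan= and raise CalibrationLookupError if the lookup stays ambiguous:

from lsst.daf.butler import Butler

butler = Butler("/repo")  # hypothetical repository
ref = butler.registry.findDataset(
    "calexp",
    instrument="HSC",
    detector=42,
    visit=903334,
    collections=["HSC/runs/example"],
)
print(ref)  # a DatasetRef, or None if nothing matched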
lsst/daf/butler/cli/cmd/_remove_collections.py CHANGED
@@ -41,6 +41,8 @@ from ..utils import ButlerCommand
 noNonRunCollectionsMsg = "No non-RUN collections were found."
 willRemoveCollectionMsg = "The following collections will be removed:"
 removedCollectionsMsg = "Removed collections"
+willRemoveCollectionChainsMsg = "Collections to be removed from their parent collection chains:"
+removedCollectionChainsMsg = "Removed collections from their parent collection chains:"
 canNotRemoveFoundRuns = "The following RUN collections were found but can NOT be removed by this command:"
 didNotRemoveFoundRuns = "Found RUN collections but they can NOT be removed by this command:"
 abortedMsg = "Aborted."
@@ -53,6 +55,11 @@ abortedMsg = "Aborted."
 )
 @confirm_option()
 @options_file_option()
+@click.option(
+    "--remove-from-parents",
+    is_flag=True,
+    help="Forcibly remove the collection even if it is still referenced from collection chains.",
+)
 def remove_collections(**kwargs: Any) -> None:  # numpydoc ignore=PR01
     """Remove one or more non-RUN collections.
 
@@ -73,6 +80,10 @@ def remove_collections(**kwargs: Any) -> None: # numpydoc ignore=PR01
             result.removeCollectionsTable.pprint_all(align="<")
         else:
             print("\n" + noNonRunCollectionsMsg)
+        if len(result.removeChainsTable):
+            print("\n" + willRemoveCollectionChainsMsg)
+            result.removeChainsTable.pprint_all(align="<")
+            print()
         if len(result.runsTable):
             print("\n" + canNotRemoveFoundRuns)
             result.runsTable.pprint_all(align="<")
@@ -86,6 +97,10 @@ def remove_collections(**kwargs: Any) -> None: # numpydoc ignore=PR01
         else:
             print("\n" + removedCollectionsMsg + ":\n")
             result.removeCollectionsTable.pprint_all(align="<")
+        if len(result.removeChainsTable):
+            print("\n" + removedCollectionChainsMsg)
+            result.removeChainsTable.pprint_all(align="<")
+            print()
         if len(result.runsTable):
             print("\n" + didNotRemoveFoundRuns)
             result.runsTable.pprint_all(align="<")
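
Assuming the usual butler remove-collections invocation (command name and positional arguments as in existing releases; the repo path and collection name are placeholders), the new flag would be passed like this:

butler remove-collections /path/to/repo u/example/chain-member --remove-from-parents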
lsst/daf/butler/configs/datastores/formatters.yaml CHANGED
@@ -87,6 +87,7 @@ SpectractorSpectrum: lsst.atmospec.formatters.SpectractorSpectrumFormatter
 SpectractorImage: lsst.atmospec.formatters.SpectractorImageFormatter
 SpectractorFitParameters: lsst.atmospec.formatters.SpectractorFitParametersFormatter
 ScarletModelData: lsst.meas.extensions.scarlet.io.ScarletModelFormatter
+LsstScarletModelData: lsst.meas.extensions.scarlet.io.ScarletModelFormatter
 MetricMeasurementBundle: lsst.daf.butler.formatters.json.JsonFormatter
 MultipleCellCoadd: lsst.cell_coadds.CellCoaddFitsFormatter
 NNModelPackagePayload: lsst.meas.transiNet.modelPackages.NNModelPackageFormatter
lsst/daf/butler/configs/storageClasses.yaml CHANGED
@@ -412,6 +412,11 @@ storageClasses:
     parameters:
     - blend_id
     delegate: lsst.meas.extensions.scarlet.io.ScarletModelDelegate
+  LsstScarletModelData:
+    pytype: lsst.meas.extensions.scarlet.io.LsstScarletModelData
+    parameters:
+    - blend_id
+    delegate: lsst.meas.extensions.scarlet.io.ScarletModelDelegate
   MetricMeasurementBundle:
     pytype: lsst.analysis.tools.interfaces.MetricMeasurementBundle
   MultipleCellCoadd:
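
As a quick check that the new entry is wired up, the storage class should resolve by name through the factory once this configuration is loaded (a sketch; it only needs the YAML above, though instantiating the pytype would also require lsst.meas.extensions.scarlet to be installed):

from lsst.daf.butler import StorageClassFactory

factory = StorageClassFactory()
sc = factory.getStorageClass("LsstScarletModelData")
print(sc.name, sorted(sc.parameters))  # expect the "blend_id" parameter declared above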
lsst/daf/butler/datastores/fileDatastore.py CHANGED
@@ -1874,7 +1874,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
         # Have to handle trustGetRequest mode by checking for the existence
         # of the missing refs on disk.
-        if missing_refs:
+        if missing_refs and not predict:
             dataset_existence = self._mexists_check_expected(missing_refs, None)
             really_missing = set()
             not_missing = set()
@@ -3218,7 +3218,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
             return ref
         dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
         if dataset_type is not None:
-            ref = ref.overrideStorageClass(dataset_type.storageClass)
+            ref = ref.overrideStorageClass(dataset_type.storageClass_name)
         return ref
 
     def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
lsst/daf/butler/direct_butler/_direct_butler.py CHANGED
@@ -1302,7 +1302,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
 
         data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
 
-        ref = self._registry.findDataset(
+        ref = self.registry.findDataset(
             parent_type,
             data_id,
             collections=collections,
@@ -2107,7 +2107,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
         dry_run: bool = False,
     ) -> _ImportDatasetsInfo:
         # Docstring inherited.
-        if not self.isWriteable():
+        if not self.isWriteable() and not dry_run:
             raise TypeError("Butler is read-only.")
 
         # Will iterate through the refs multiple times so need to convert
@@ -2312,7 +2312,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
     ) -> collections.abc.Collection[DatasetRef]:
         # Docstring inherited.
         source_refs = list(source_refs)
-        if not self.isWriteable():
+        if not self.isWriteable() and not dry_run:
             raise TypeError("Butler is read-only.")
 
         progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
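
A sketch of what the relaxed writeability checks enable: a dry-run transfer against a read-only destination butler. Paths, collection, and dataset type name are placeholders; only the dry_run behavior is taken from this diff:

from lsst.daf.butler import Butler

source = Butler("/repo/source")
dest = Butler("/repo/dest", writeable=False)
refs = source.query_datasets("calexp", collections="HSC/runs/example", limit=10)
# With dry_run=True the read-only check above no longer raises; nothing is written.
would_transfer = dest.transfer_from(source, refs, dry_run=True)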
lsst/daf/butler/json.py CHANGED
@@ -68,7 +68,7 @@ def to_json_pydantic(self: SupportsSimple, minimal: bool = False) -> str:
 
 def from_json_pydantic(
     cls_: type[SupportsSimple],
-    json_str: str,
+    json_str: str | bytes | bytearray,
     universe: DimensionUniverse | None = None,
     registry: Registry | None = None,
 ) -> SupportsSimple:
@@ -78,7 +78,7 @@ def from_json_pydantic(
     ----------
     cls_ : `type` of `SupportsSimple`
         The Python type being created.
-    json_str : `str`
+    json_str : `str` or `bytes` or `bytearray`
         The JSON string representing this object.
     universe : `DimensionUniverse` or `None`, optional
         The universe required to instantiate some models. Required if
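
A small round-trip sketch of the widened signature: bytes produced by encoding to_json output (or received raw from an HTTP client) can now be handed to from_json directly; the repo path and dataset type name are placeholders:

from lsst.daf.butler import Butler, DatasetType

butler = Butler("/repo")
dt = butler.get_dataset_type("bias")
payload: bytes = dt.to_json().encode()  # stand-in for raw bytes from a web response
same = DatasetType.from_json(payload, universe=butler.dimensions)
assert same.name == dt.name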
lsst/daf/butler/queries/_expression_strings.py CHANGED
@@ -241,7 +241,7 @@ class _ConversionVisitor(TreeVisitor[_VisitorResult]):
 
     def visitBind(self, name: str, node: Node) -> _VisitorResult:
         if name not in self.context.bind:
-            raise InvalidQueryError("Name {name!r} is not in the bind map.")
+            raise InvalidQueryError(f"Name {name!r} is not in the bind map.")
         # Logic in visitIdentifier handles binds.
         return self.visitIdentifier(name, node)
 
lsst/daf/butler/registry/_registry.py CHANGED
@@ -631,57 +631,55 @@ class Registry(ABC):
         self,
         datasets: Iterable[DatasetRef],
         expand: bool = True,
+        assume_new: bool = False,
     ) -> list[DatasetRef]:
         """Import one or more datasets into the `Registry`.
 
-        Difference from `insertDatasets` method is that this method accepts
-        `DatasetRef` instances which should already be resolved and have a
-        dataset ID. If registry supports globally-unique dataset IDs (e.g.
-        `uuid.UUID`) then datasets which already exist in the registry will be
-        ignored if imported again.
+        This differs from `insertDatasets` method in that this method accepts
+        `DatasetRef` instances, which already have a dataset ID.
 
         Parameters
         ----------
         datasets : `~collections.abc.Iterable` of `DatasetRef`
             Datasets to be inserted. All `DatasetRef` instances must have
-            identical ``datasetType`` and ``run`` attributes. ``run``
+            identical ``run`` attributes. ``run``
             attribute can be `None` and defaults to ``self.defaults.run``.
             Datasets can specify ``id`` attribute which will be used for
-            inserted datasets. All dataset IDs must have the same type
-            (`int` or `uuid.UUID`), if type of dataset IDs does not match
-            configured backend then IDs will be ignored and new IDs will be
-            generated by backend.
+            inserted datasets.
+            Datasets can be of multiple dataset types, but all the dataset
+            types must have the same set of dimensions.
         expand : `bool`, optional
             If `True` (default), expand data IDs as they are inserted. This is
             necessary in general, but it may be disabled if the caller can
             guarantee this is unnecessary.
+        assume_new : `bool`, optional
+            If `True`, assume datasets are new. If `False`, datasets that are
+            identical to an existing one are ignored.
 
         Returns
         -------
         refs : `list` of `DatasetRef`
-            Resolved `DatasetRef` instances for all given data IDs (in the same
-            order). If any of ``datasets`` has an ID which already exists in
-            the database then it will not be inserted or updated, but a
-            resolved `DatasetRef` will be returned for it in any case.
+            `DatasetRef` instances for all given data IDs (in the same order).
+            If any of ``datasets`` has an ID which already exists in the
+            database then it will not be inserted or updated, but a
+            `DatasetRef` will be returned for it in any case.
 
         Raises
         ------
         lsst.daf.butler.registry.NoDefaultCollectionError
             Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
         lsst.daf.butler.registry.DatasetTypeError
-            Raised if datasets correspond to more than one dataset type or
-            dataset type is not known to registry.
+            Raised if a dataset type is not known to registry.
         lsst.daf.butler.registry.ConflictingDefinitionError
             If a dataset with the same dataset type and data ID as one of those
-            given already exists in ``run``.
+            given already exists in ``run``, or if ``assume_new=True`` and at
+            least one dataset is not new.
         lsst.daf.butler.registry.MissingCollectionError
             Raised if ``run`` does not exist in the registry.
 
         Notes
         -----
-        This method is considered package-private and internal to Butler
-        implementation. Clients outside daf_butler package should not use this
-        method.
+        This method is considered middleware-internal.
         """
         raise NotImplementedError()
 
@@ -702,6 +700,27 @@
         """
         raise NotImplementedError()
 
+    @abstractmethod
+    def _fetch_run_dataset_ids(self, run: str) -> list[DatasetId]:
+        """Return the IDs of all datasets in the given ``RUN``
+        collection.
+
+        Parameters
+        ----------
+        run : `str`
+            Name of the collection.
+
+        Returns
+        -------
+        dataset_ids : `list` [`uuid.UUID`]
+            List of dataset IDs.
+
+        Notes
+        -----
+        This is a middleware-internal interface.
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
         """Remove datasets from the Registry.
lsst/daf/butler/registry/_registry_base.py CHANGED
@@ -231,20 +231,28 @@ class RegistryBase(Registry):
         collectionTypes: Iterable[CollectionType] = CollectionType.all(),
         flattenChains: bool = False,
     ) -> Iterator[DatasetAssociation]:
-        # queryCollections only accepts DatasetType.
         if isinstance(datasetType, str):
             datasetType = self.getDatasetType(datasetType)
-        resolved_collections = self.queryCollections(
-            collections, datasetType=datasetType, collectionTypes=collectionTypes, flattenChains=flattenChains
-        )
         with self._butler.query() as query:
+            resolved_collections = self.queryCollections(
+                collections,
+                datasetType=datasetType,
+                collectionTypes=collectionTypes,
+                flattenChains=flattenChains,
+            )
+            # It's annoyingly difficult to just do the collection query once,
+            # since query_info doesn't accept all the expression types that
+            # queryCollections does. But it's all cached anyway.
+            collection_info = {
+                info.name: info for info in self._butler.collections.query_info(resolved_collections)
+            }
             query = query.join_dataset_search(datasetType, resolved_collections)
             result = query.general(
                 datasetType.dimensions,
                 dataset_fields={datasetType.name: {"dataset_id", "run", "collection", "timespan"}},
                 find_first=False,
             )
-            yield from DatasetAssociation.from_query_result(result, datasetType)
+            yield from DatasetAssociation.from_query_result(result, datasetType, collection_info)
 
     def _resolve_dataset_types(self, dataset_types: object | None) -> list[str]:
         if dataset_types is None:
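
For orientation, the public call whose timespan behavior the collection_info plumbing above preserves; the dataset type name and collection pattern are placeholders:

from lsst.daf.butler import Butler, CollectionType

butler = Butler("/repo")
for assoc in butler.registry.queryDatasetAssociations(
    "bias",
    collections="HSC/calib*",
    collectionTypes={CollectionType.CALIBRATION, CollectionType.RUN},
    flattenChains=True,
):
    # assoc.timespan stays a Timespan only for CALIBRATION collections; None otherwise.
    print(assoc.collection, assoc.ref.id, assoc.timespan)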
lsst/daf/butler/registry/datasets/byDimensions/_manager.py CHANGED
@@ -617,6 +617,14 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
         dataset_type_names = set(get_dataset_type_name(dt) for dt in dataset_types)
         return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)
 
+    def fetch_run_dataset_ids(self, run: RunRecord) -> list[DatasetId]:
+        # Docstring inherited.
+        sql = sqlalchemy.select(self._static.dataset.c.id).where(
+            self._static.dataset.c[self._run_key_column] == run.key
+        )
+        with self._db.query(sql) as result:
+            return list(result.scalars())
+
     def ingest_date_dtype(self) -> type:
         """Return type of the ``ingest_date`` column."""
         schema_version = self.newSchemaVersion()
@@ -698,7 +706,7 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
             for dataId, row in zip(data_id_list, rows, strict=True)
         ]
 
-    def import_(self, run: RunRecord, refs: list[DatasetRef]) -> None:
+    def import_(self, run: RunRecord, refs: list[DatasetRef], assume_new: bool = False) -> None:
         # Docstring inherited from DatasetRecordStorageManager.
         if not refs:
             # Just in case an empty mapping is provided we want to avoid
@@ -721,7 +729,6 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
                 "Table cache should have been populated when looking up dataset types"
             )
         tags_table = self._get_tags_table(dynamic_tables)
-
         # Current timestamp, type depends on schema version.
         if self._use_astropy_ingest_date:
             # Astropy `now()` precision should be the same as `now()` which
@@ -729,11 +736,8 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
             timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai)
         else:
             timestamp = sqlalchemy.sql.literal(datetime.datetime.now(datetime.UTC))
-
-        # We'll insert all new rows into a temporary table
-        table_spec = makeTagTableSpec(dimensions, type(self._collections), constraints=False)
         collection_fkey_name = self._collections.getCollectionForeignKeyName()
-        tmpRows = [
+        tags_rows = [
             {
                 "dataset_type_id": dataset_type_storage[ref.datasetType.name].dataset_type_id,
                 collection_fkey_name: run.key,
@@ -742,9 +746,29 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
             }
             for ref in refs
         ]
+        if assume_new:
+            self._import_new(run, refs, dataset_type_storage, tags_table, tags_rows, timestamp)
+        else:
+            self._import_guarded(
+                run, refs, dimensions, dataset_type_storage, tags_table, tags_rows, timestamp
+            )
+
+    def _import_guarded(
+        self,
+        run: RunRecord,
+        refs: list[DatasetRef],
+        dimensions: DimensionGroup,
+        dataset_type_storage: dict[str, _DatasetRecordStorage],
+        tags_table: sqlalchemy.Table,
+        tags_rows: list[dict[str, object]],
+        timestamp: sqlalchemy.BindParameter[astropy.time.Time | datetime.datetime],
+    ) -> None:
+        # We'll insert all new rows into a temporary table
+        table_spec = makeTagTableSpec(dimensions, type(self._collections), constraints=False)
+        collection_fkey_name = self._collections.getCollectionForeignKeyName()
         with self._db.transaction(for_temp_tables=True), self._db.temporary_table(table_spec) as tmp_tags:
             # store all incoming data in a temporary table
-            self._db.insert(tmp_tags, *tmpRows)
+            self._db.insert(tmp_tags, *tags_rows)
             # There are some checks that we want to make for consistency
             # of the new datasets with existing ones.
             self._validate_import(dimensions, tags_table, tmp_tags, run)
@@ -764,17 +788,19 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
                     timestamp.label("ingest_date"),
                 ),
             )
-            # Update the summary tables for this collection in case this
-            # is the first time this dataset type or these governor values
-            # will be inserted there.
-            summary = CollectionSummary()
-            summary.add_datasets(refs)
-            self._summaries.update(
-                run, [storage.dataset_type_id for storage in dataset_type_storage.values()], summary
-            )
+            self._update_summaries(run, refs, dataset_type_storage)
             # Copy from temp table into tags table.
             self._db.insert(tags_table, select=tmp_tags.select())
 
+    def _update_summaries(
+        self, run: RunRecord, refs: list[DatasetRef], dataset_type_storage: dict[str, _DatasetRecordStorage]
+    ) -> None:
+        summary = CollectionSummary()
+        summary.add_datasets(refs)
+        self._summaries.update(
+            run, [storage.dataset_type_id for storage in dataset_type_storage.values()], summary
+        )
+
     def _validate_import(
         self,
         dimensions: DimensionGroup,
@@ -899,6 +925,29 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
                 f"but ID {row.new_dataset_id} in new collection {new_collection!r}."
             )
 
+    def _import_new(
+        self,
+        run: RunRecord,
+        refs: list[DatasetRef],
+        dataset_type_storage: dict[str, _DatasetRecordStorage],
+        tags_table: sqlalchemy.Table,
+        tags_rows: list[dict[str, object]],
+        timestamp: sqlalchemy.BindParameter[astropy.time.Time | datetime.datetime],
+    ) -> None:
+        static_rows = [
+            {
+                "id": ref.id,
+                "dataset_type_id": dataset_type_storage[ref.datasetType.name].dataset_type_id,
+                self._run_key_column: run.key,
+                "ingest_date": timestamp.value,
+            }
+            for ref in refs
+        ]
+        with self._db.transaction():
+            self._db.insert(self._static.dataset, *static_rows)
+            self._update_summaries(run, refs, dataset_type_storage)
+            self._db.insert(tags_table, *tags_rows)
+
     def delete(self, datasets: Iterable[DatasetId | DatasetRef]) -> None:
         # Docstring inherited from DatasetRecordStorageManager.
         # Only delete from common dataset table; ON DELETE foreign key clauses
@@ -1425,7 +1474,7 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
             )
             if "timespan" in fields:
                 tags_builder.joins.timespans[fields_key] = self._db.getTimespanRepresentation().fromLiteral(
-                    None
+                    Timespan(None, None)
                 )
         calibs_builder: SqlSelectBuilder | None = None
         if CollectionType.CALIBRATION in collection_types:
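
The distinction behind the changed literal, sketched with the public Timespan class: Timespan(None, None) is the fully unbounded interval (it overlaps everything), whereas a bare None meant no timespan value at all:

from astropy.time import Time, TimeDelta

from lsst.daf.butler import Timespan

begin = Time("2025-01-01T00:00:00", scale="tai")
end = begin + TimeDelta(3600, format="sec")
unbounded = Timespan(None, None)
assert unbounded.overlaps(Timespan(begin, end))  # unbounded overlaps any finite interval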
lsst/daf/butler/registry/interfaces/_datasets.py CHANGED
@@ -384,6 +384,23 @@ class DatasetRecordStorageManager(VersionedExtension):
         """
         raise NotImplementedError()
 
+    @abstractmethod
+    def fetch_run_dataset_ids(self, run: RunRecord) -> list[DatasetId]:
+        """Return the IDs of all datasets in the given ``RUN``
+        collection.
+
+        Parameters
+        ----------
+        run : `RunRecord`
+            Record describing the collection.
+
+        Returns
+        -------
+        dataset_ids : `list` [`uuid.UUID`]
+            List of dataset IDs.
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def ingest_date_dtype(self) -> type:
         """Return type of the ``ingest_date`` column."""
@@ -424,7 +441,7 @@ class DatasetRecordStorageManager(VersionedExtension):
         raise NotImplementedError()
 
     @abstractmethod
-    def import_(self, run: RunRecord, refs: list[DatasetRef]) -> None:
+    def import_(self, run: RunRecord, refs: list[DatasetRef], assume_new: bool = False) -> None:
         """Insert one or more dataset entries into the database.
 
         Parameters
@@ -435,6 +452,9 @@
         refs : `list` [ `DatasetRef` ]
             List of datasets to be inserted. All of the ``DatasetRef``
             ``run`` attributes must match the ``run`` parameter.
+        assume_new : `bool`, optional
+            If `True`, assume all datasets are new and skip conflict resolution
+            logic.
         """
         raise NotImplementedError()
 
lsst/daf/butler/registry/queries/_query_common.py CHANGED
@@ -58,6 +58,9 @@ class CommonQueryArguments:
     def replaceCollections(self, collections: list[str]) -> CommonQueryArguments:
         return dataclasses.replace(self, collections=collections)
 
+    def replaceDatasetTypes(self, dataset_types: list[str]) -> CommonQueryArguments:
+        return dataclasses.replace(self, dataset_types=dataset_types)
+
 
 _T = TypeVar("_T", bound=QueryResultsBase)
 _U = TypeVar("_U", bound=QueryResultsBase)