lsst-daf-butler 29.2025.4100__py3-none-any.whl → 29.2025.4300__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/daf/butler/_dataset_association.py +28 -3
- lsst/daf/butler/_registry_shim.py +76 -4
- lsst/daf/butler/cli/cmd/_remove_collections.py +15 -0
- lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
- lsst/daf/butler/configs/storageClasses.yaml +5 -0
- lsst/daf/butler/datastores/fileDatastore.py +2 -2
- lsst/daf/butler/direct_butler/_direct_butler.py +3 -3
- lsst/daf/butler/json.py +2 -2
- lsst/daf/butler/queries/_expression_strings.py +1 -1
- lsst/daf/butler/registry/_registry.py +39 -20
- lsst/daf/butler/registry/_registry_base.py +13 -5
- lsst/daf/butler/registry/datasets/byDimensions/_manager.py +65 -16
- lsst/daf/butler/registry/interfaces/_datasets.py +21 -1
- lsst/daf/butler/registry/queries/_query_common.py +3 -0
- lsst/daf/butler/registry/queries/_query_datasets.py +7 -1
- lsst/daf/butler/registry/sql_registry.py +41 -187
- lsst/daf/butler/registry/tests/_registry.py +120 -6
- lsst/daf/butler/remote_butler/_factory.py +2 -2
- lsst/daf/butler/remote_butler/_registry.py +4 -0
- lsst/daf/butler/remote_butler/_remote_butler.py +3 -1
- lsst/daf/butler/remote_butler/authentication/cadc.py +63 -11
- lsst/daf/butler/script/removeCollections.py +46 -13
- lsst/daf/butler/tests/butler_queries.py +40 -1
- lsst/daf/butler/tests/hybrid_butler_registry.py +5 -4
- lsst/daf/butler/tests/server_available.py +53 -0
- lsst/daf/butler/timespan_database_representation.py +8 -0
- lsst/daf/butler/transfers/_context.py +5 -16
- lsst/daf/butler/version.py +1 -1
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/METADATA +1 -1
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/RECORD +38 -37
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/WHEEL +0 -0
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/entry_points.txt +0 -0
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/licenses/LICENSE +0 -0
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/top_level.txt +0 -0
- {lsst_daf_butler-29.2025.4100.dist-info → lsst_daf_butler-29.2025.4300.dist-info}/zip-safe +0 -0
lsst/daf/butler/_dataset_association.py
CHANGED
@@ -29,15 +29,17 @@ from __future__ import annotations
 
 __all__ = ("DatasetAssociation",)
 
-from collections.abc import Iterator
+from collections.abc import Iterator, Mapping
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
+from ._collection_type import CollectionType
 from ._dataset_ref import DatasetRef
 from ._dataset_type import DatasetType
 from ._timespan import Timespan
 
 if TYPE_CHECKING:
+    from ._butler_collections import CollectionInfo
     from .queries._general_query_results import GeneralQueryResults
 
 
@@ -66,7 +68,10 @@ class DatasetAssociation:
 
     @classmethod
     def from_query_result(
-        cls,
+        cls,
+        result: GeneralQueryResults,
+        dataset_type: DatasetType,
+        collection_info: Mapping[str, CollectionInfo] | None = None,
     ) -> Iterator[DatasetAssociation]:
         """Construct dataset associations from the result of general query.
 
@@ -79,11 +84,31 @@
             "timespan" dataset fields for ``dataset_type``.
         dataset_type : `DatasetType`
             Dataset type, query has to include this dataset type.
+        collection_info : `~collections.abc.Mapping` \
+                [`str`, `CollectionInfo`], optional
+            Mapping from collection name to information about it for all
+            collections that may appear in the query results. If not provided,
+            timespans for `~CollectionType.RUN` and `~CollectionType.TAGGED`
+            collections will be unbounded, instead of `None`; this is actually
+            more consistent with how those timespans are used elsewhere in the
+            query system, but is a change from how `DatasetAssociation` has
+            historically worked.
         """
         timespan_key = f"{dataset_type.name}.timespan"
         collection_key = f"{dataset_type.name}.collection"
         for _, refs, row_dict in result.iter_tuples(dataset_type):
-
+            collection = row_dict[collection_key]
+            timespan = row_dict[timespan_key]
+            if (
+                collection_info is not None
+                and collection_info[collection].type is not CollectionType.CALIBRATION
+            ):
+                # This behavior is for backwards compatibility only; in most
+                # contexts it makes sense to consider the timespan of a RUN
+                # or TAGGED collection to be unbounded, not None, and that's
+                # what the query results we're iterating over do.
+                timespan = None
+            yield DatasetAssociation(refs[0], collection, timespan)
 
     def __lt__(self, other: Any) -> bool:
         # Allow sorting of associations
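
Illustrative usage (not part of the diff): a minimal sketch of how a caller might pass the new ``collection_info`` argument, mirroring the updated ``queryDatasetAssociations`` implementation in ``_registry_base.py`` further down. It assumes ``butler`` is an existing ``lsst.daf.butler.Butler`` and ``dataset_type`` is a registered ``DatasetType``; the collection name is hypothetical.

    from lsst.daf.butler import DatasetAssociation

    collections = ["u/someone/tagged"]  # hypothetical TAGGED collection
    # Per-collection info lets from_query_result keep the historical behavior
    # of a None timespan for RUN/TAGGED collections.
    collection_info = {info.name: info for info in butler.collections.query_info(collections)}
    with butler.query() as query:
        query = query.join_dataset_search(dataset_type, collections)
        result = query.general(
            dataset_type.dimensions,
            dataset_fields={dataset_type.name: {"dataset_id", "run", "collection", "timespan"}},
            find_first=False,
        )
        for assoc in DatasetAssociation.from_query_result(result, dataset_type, collection_info):
            print(assoc.collection, assoc.ref.id, assoc.timespan)

Omitting ``collection_info`` leaves RUN and TAGGED timespans unbounded rather than `None`, as the new docstring explains.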
lsst/daf/butler/_registry_shim.py
CHANGED
@@ -36,6 +36,7 @@ from typing import TYPE_CHECKING, Any
 from ._collection_type import CollectionType
 from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
 from ._dataset_type import DatasetType
+from ._exceptions import CalibrationLookupError
 from ._storage_class import StorageClassFactory
 from ._timespan import Timespan
 from .dimensions import (
@@ -48,7 +49,9 @@ from .dimensions import (
 )
 from .registry._collection_summary import CollectionSummary
 from .registry._defaults import RegistryDefaults
+from .registry._exceptions import NoDefaultCollectionError
 from .registry._registry_base import RegistryBase
+from .registry.queries._query_common import resolve_collections
 
 if TYPE_CHECKING:
     from .direct_butler import DirectButler
@@ -182,13 +185,76 @@ class RegistryShim(RegistryBase):
         *,
         collections: CollectionArgType | None = None,
         timespan: Timespan | None = None,
+        datastore_records: bool = False,
         **kwargs: Any,
     ) -> DatasetRef | None:
         # Docstring inherited from a base class.
-
-            datasetType
+        if not isinstance(datasetType, DatasetType):
+            datasetType = self.getDatasetType(datasetType)
+
+        dataId = DataCoordinate.standardize(
+            dataId,
+            dimensions=datasetType.dimensions,
+            universe=self.dimensions,
+            defaults=self.defaults.dataId,
+            **kwargs,
         )
 
+        with self._butler.query() as query:
+            resolved_collections = resolve_collections(self._butler, collections)
+            if not resolved_collections:
+                if collections is None:
+                    raise NoDefaultCollectionError("No collections provided, and no default collections set")
+                else:
+                    return None
+
+            if datasetType.isCalibration() and timespan is None:
+                # Filter out calibration collections, because with no timespan
+                # we have no way of selecting a dataset from them.
+                collection_info = self._butler.collections.query_info(
+                    resolved_collections, flatten_chains=True
+                )
+                resolved_collections = [
+                    info.name for info in collection_info if info.type != CollectionType.CALIBRATION
+                ]
+                if not resolved_collections:
+                    return None
+
+            result = query.datasets(datasetType, resolved_collections, find_first=True).limit(2)
+            dataset_type_name = result.dataset_type.name
+            # Search only on the 'required' dimensions for the dataset type.
+            # Any extra values provided by the user are ignored.
+            minimal_data_id = DataCoordinate.standardize(
+                dataId.subset(datasetType.dimensions.required).required, universe=self.dimensions
+            )
+            result = result.where(minimal_data_id)
+            if (
+                datasetType.isCalibration()
+                and timespan is not None
+                and (timespan.begin is not None or timespan.end is not None)
+            ):
+                timespan_column = query.expression_factory[dataset_type_name].timespan
+                result = result.where(timespan_column.overlaps(timespan))
+
+            datasets = list(result)
+            if len(datasets) == 1:
+                ref = datasets[0]
+                if dataId.hasRecords():
+                    ref = ref.expanded(dataId)
+                # Propagate storage class from user-provided DatasetType, which
+                # may not match the definition in the database.
+                ref = ref.overrideStorageClass(datasetType.storageClass_name)
+                if datastore_records:
+                    ref = self._registry.get_datastore_records(ref)
+                return ref
+            elif len(datasets) == 0:
+                return None
+            else:
+                raise CalibrationLookupError(
+                    f"Ambiguous calibration lookup for {datasetType} with timespan {timespan}"
+                    f" in collections {resolved_collections}."
+                )
+
     def insertDatasets(
         self,
         datasetType: DatasetType | str,
@@ -200,14 +266,20 @@
         # Docstring inherited from a base class.
         return self._registry.insertDatasets(datasetType, dataIds, run, expand, idGenerationMode)
 
-    def _importDatasets(
+    def _importDatasets(
+        self, datasets: Iterable[DatasetRef], expand: bool = True, assume_new: bool = False
+    ) -> list[DatasetRef]:
         # Docstring inherited from a base class.
-        return self._registry._importDatasets(datasets, expand)
+        return self._registry._importDatasets(datasets, expand, assume_new)
 
     def getDataset(self, id: DatasetId) -> DatasetRef | None:
         # Docstring inherited from a base class.
         return self._registry.getDataset(id)
 
+    def _fetch_run_dataset_ids(self, run: str) -> list[DatasetId]:
+        # Docstring inherited.
+        return self._registry._fetch_run_dataset_ids(run)
+
     def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
         # Docstring inherited from a base class.
         self._registry.removeDatasets(refs)
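
A usage sketch, not part of the changeset: a calibration lookup through the rewritten shim method, which now resolves collections and runs the search through the new query system. ``butler`` is assumed to be an existing ``Butler``; the dataset type, instrument, detector, and collection names are hypothetical.

    from astropy.time import Time
    from lsst.daf.butler import Timespan

    timespan = Timespan(Time("2025-01-01T00:00:00", scale="tai"), Time("2025-01-02T00:00:00", scale="tai"))
    ref = butler.registry.findDataset(
        "bias",
        instrument="LSSTCam",
        detector=42,
        collections="LSSTCam/calib",
        timespan=timespan,
    )
    if ref is None:
        print("no matching calibration found")

Because the new implementation limits the find-first search to two results, an ambiguous calibration match raises ``CalibrationLookupError``, as the code above shows.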
lsst/daf/butler/cli/cmd/_remove_collections.py
CHANGED
@@ -41,6 +41,8 @@ from ..utils import ButlerCommand
 noNonRunCollectionsMsg = "No non-RUN collections were found."
 willRemoveCollectionMsg = "The following collections will be removed:"
 removedCollectionsMsg = "Removed collections"
+willRemoveCollectionChainsMsg = "Collections to be removed from their parent collection chains:"
+removedCollectionChainsMsg = "Removed collections from their parent collection chains:"
 canNotRemoveFoundRuns = "The following RUN collections were found but can NOT be removed by this command:"
 didNotRemoveFoundRuns = "Found RUN collections but they can NOT be removed by this command:"
 abortedMsg = "Aborted."
@@ -53,6 +55,11 @@ abortedMsg = "Aborted."
 )
 @confirm_option()
 @options_file_option()
+@click.option(
+    "--remove-from-parents",
+    is_flag=True,
+    help="Forcibly remove the collection even if it is still referenced from collection chains.",
+)
 def remove_collections(**kwargs: Any) -> None:  # numpydoc ignore=PR01
     """Remove one or more non-RUN collections.
 
@@ -73,6 +80,10 @@ def remove_collections(**kwargs: Any) -> None:  # numpydoc ignore=PR01
             result.removeCollectionsTable.pprint_all(align="<")
         else:
             print("\n" + noNonRunCollectionsMsg)
+        if len(result.removeChainsTable):
+            print("\n" + willRemoveCollectionChainsMsg)
+            result.removeChainsTable.pprint_all(align="<")
+            print()
         if len(result.runsTable):
             print("\n" + canNotRemoveFoundRuns)
             result.runsTable.pprint_all(align="<")
@@ -86,6 +97,10 @@ def remove_collections(**kwargs: Any) -> None:  # numpydoc ignore=PR01
         else:
             print("\n" + removedCollectionsMsg + ":\n")
             result.removeCollectionsTable.pprint_all(align="<")
+        if len(result.removeChainsTable):
+            print("\n" + removedCollectionChainsMsg)
+            result.removeChainsTable.pprint_all(align="<")
+            print()
         if len(result.runsTable):
             print("\n" + didNotRemoveFoundRuns)
             result.runsTable.pprint_all(align="<")
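
A note on usage (not from the diff): with the new flag, an invocation such as ``butler remove-collections /repo/main u/someone/old-tagged --remove-from-parents`` (repository path and collection name are hypothetical) also detaches the collection from any CHAINED parents that still reference it, and the new output tables above report which parent chains were affected.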
lsst/daf/butler/configs/datastores/formatters.yaml
CHANGED
@@ -87,6 +87,7 @@ SpectractorSpectrum: lsst.atmospec.formatters.SpectractorSpectrumFormatter
 SpectractorImage: lsst.atmospec.formatters.SpectractorImageFormatter
 SpectractorFitParameters: lsst.atmospec.formatters.SpectractorFitParametersFormatter
 ScarletModelData: lsst.meas.extensions.scarlet.io.ScarletModelFormatter
+LsstScarletModelData: lsst.meas.extensions.scarlet.io.ScarletModelFormatter
 MetricMeasurementBundle: lsst.daf.butler.formatters.json.JsonFormatter
 MultipleCellCoadd: lsst.cell_coadds.CellCoaddFitsFormatter
 NNModelPackagePayload: lsst.meas.transiNet.modelPackages.NNModelPackageFormatter
lsst/daf/butler/configs/storageClasses.yaml
CHANGED
@@ -412,6 +412,11 @@ storageClasses:
     parameters:
       - blend_id
     delegate: lsst.meas.extensions.scarlet.io.ScarletModelDelegate
+  LsstScarletModelData:
+    pytype: lsst.meas.extensions.scarlet.io.LsstScarletModelData
+    parameters:
+      - blend_id
+    delegate: lsst.meas.extensions.scarlet.io.ScarletModelDelegate
   MetricMeasurementBundle:
     pytype: lsst.analysis.tools.interfaces.MetricMeasurementBundle
   MultipleCellCoadd:
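
For context (not part of the diff): a hedged sketch of registering a dataset type that uses the new ``LsstScarletModelData`` storage class; the dataset type name, dimensions, and repository path are hypothetical.

    from lsst.daf.butler import Butler, DatasetType

    butler = Butler("/repo/main", writeable=True)  # hypothetical repository
    scarlet_models = DatasetType(
        "deepCoadd_scarletModelData",
        dimensions=("skymap", "tract", "patch", "band"),
        storageClass="LsstScarletModelData",
        universe=butler.dimensions,
    )
    butler.registry.registerDatasetType(scarlet_models)

As with the existing ``ScarletModelData`` entry, the ``blend_id`` parameter means a single blend can be requested at read time via ``parameters={"blend_id": ...}`` in ``Butler.get``.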
lsst/daf/butler/datastores/fileDatastore.py
CHANGED
@@ -1874,7 +1874,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
         # Have to handle trustGetRequest mode by checking for the existence
         # of the missing refs on disk.
-        if missing_refs:
+        if missing_refs and not predict:
             dataset_existence = self._mexists_check_expected(missing_refs, None)
             really_missing = set()
             not_missing = set()
@@ -3218,7 +3218,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
             return ref
         dataset_type = self._retrieve_dataset_method(ref.datasetType.name)
         if dataset_type is not None:
-            ref = ref.overrideStorageClass(dataset_type.
+            ref = ref.overrideStorageClass(dataset_type.storageClass_name)
         return ref
 
     def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]:
lsst/daf/butler/direct_butler/_direct_butler.py
CHANGED
@@ -1302,7 +1302,7 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
 
         data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
 
-        ref = self.
+        ref = self.registry.findDataset(
             parent_type,
             data_id,
             collections=collections,
@@ -2107,7 +2107,7 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
         dry_run: bool = False,
     ) -> _ImportDatasetsInfo:
         # Docstring inherited.
-        if not self.isWriteable():
+        if not self.isWriteable() and not dry_run:
             raise TypeError("Butler is read-only.")
 
         # Will iterate through the refs multiple times so need to convert
@@ -2312,7 +2312,7 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
     ) -> collections.abc.Collection[DatasetRef]:
         # Docstring inherited.
         source_refs = list(source_refs)
-        if not self.isWriteable():
+        if not self.isWriteable() and not dry_run:
             raise TypeError("Butler is read-only.")
 
         progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
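
For context (not part of the diff): the relaxed guards mean a dry run no longer requires a writeable Butler. A hedged sketch, assuming ``source_butler`` and ``refs`` already exist; the repository path is hypothetical.

    from lsst.daf.butler import Butler

    target = Butler("/repo/main")  # read-only by default
    would_transfer = target.transfer_from(source_butler, refs, dry_run=True)
    print(f"{len(would_transfer)} datasets would be transferred")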
lsst/daf/butler/json.py
CHANGED
@@ -68,7 +68,7 @@ def to_json_pydantic(self: SupportsSimple, minimal: bool = False) -> str:
 
 def from_json_pydantic(
     cls_: type[SupportsSimple],
-    json_str: str,
+    json_str: str | bytes | bytearray,
     universe: DimensionUniverse | None = None,
     registry: Registry | None = None,
 ) -> SupportsSimple:
@@ -78,7 +78,7 @@ def from_json_pydantic(
     ----------
     cls_ : `type` of `SupportsSimple`
         The Python type being created.
-    json_str : `str`
+    json_str : `str` or `bytes` or `bytearray`
        The JSON string representing this object.
    universe : `DimensionUniverse` or `None`, optional
        The universe required to instantiate some models. Required if
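
Illustrative usage (not from the diff): a hedged sketch of the widened input type, assuming ``butler`` supplies a ``DimensionUniverse`` with the usual instrument and visit dimensions.

    from lsst.daf.butler import DataCoordinate

    data_id = DataCoordinate.standardize(instrument="LSSTCam", visit=12345, universe=butler.dimensions)
    payload = data_id.to_json().encode()  # bytes, e.g. straight off the wire
    # from_json previously required str; it now also accepts bytes/bytearray.
    restored = DataCoordinate.from_json(payload, universe=butler.dimensions)
    assert restored == data_id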
lsst/daf/butler/queries/_expression_strings.py
CHANGED
@@ -241,7 +241,7 @@ class _ConversionVisitor(TreeVisitor[_VisitorResult]):
 
     def visitBind(self, name: str, node: Node) -> _VisitorResult:
         if name not in self.context.bind:
-            raise InvalidQueryError("Name {name!r} is not in the bind map.")
+            raise InvalidQueryError(f"Name {name!r} is not in the bind map.")
         # Logic in visitIdentifier handles binds.
         return self.visitIdentifier(name, node)
 
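
For context (not part of the diff): the corrected message appears when a ``where`` string references a bind name that was not supplied. A hedged sketch, with hypothetical dataset type, collection, and values:

    refs = butler.query_datasets(
        "raw",
        collections="LSSTCam/raw/all",
        where="exposure.observation_type = obs_type",
        bind={"obs_type": "science"},
    )
    # If "obs_type" were missing from bind, the query raises InvalidQueryError;
    # with the f-string fix above the offending name is now actually
    # interpolated into the message, e.g. "Name 'obs_type' is not in the bind map."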
lsst/daf/butler/registry/_registry.py
CHANGED
@@ -631,57 +631,55 @@ class Registry(ABC):
         self,
         datasets: Iterable[DatasetRef],
         expand: bool = True,
+        assume_new: bool = False,
     ) -> list[DatasetRef]:
         """Import one or more datasets into the `Registry`.
 
-
-        `DatasetRef` instances which
-        dataset ID. If registry supports globally-unique dataset IDs (e.g.
-        `uuid.UUID`) then datasets which already exist in the registry will be
-        ignored if imported again.
+        This differs from `insertDatasets` method in that this method accepts
+        `DatasetRef` instances, which already have a dataset ID.
 
         Parameters
         ----------
         datasets : `~collections.abc.Iterable` of `DatasetRef`
             Datasets to be inserted. All `DatasetRef` instances must have
-            identical ``
+            identical ``run`` attributes. ``run``
             attribute can be `None` and defaults to ``self.defaults.run``.
             Datasets can specify ``id`` attribute which will be used for
-            inserted datasets.
-
-
-            generated by backend.
+            inserted datasets.
+            Datasets can be of multiple dataset types, but all the dataset
+            types must have the same set of dimensions.
         expand : `bool`, optional
             If `True` (default), expand data IDs as they are inserted. This is
             necessary in general, but it may be disabled if the caller can
             guarantee this is unnecessary.
+        assume_new : `bool`, optional
+            If `True`, assume datasets are new. If `False`, datasets that are
+            identical to an existing one are ignored.
 
         Returns
         -------
         refs : `list` of `DatasetRef`
-
-
-
-
+            `DatasetRef` instances for all given data IDs (in the same order).
+            If any of ``datasets`` has an ID which already exists in the
+            database then it will not be inserted or updated, but a
+            `DatasetRef` will be returned for it in any case.
 
         Raises
         ------
         lsst.daf.butler.registry.NoDefaultCollectionError
             Raised if ``run`` is `None` and ``self.defaults.run`` is `None`.
         lsst.daf.butler.registry.DatasetTypeError
-            Raised if
-            dataset type is not known to registry.
+            Raised if a dataset type is not known to registry.
         lsst.daf.butler.registry.ConflictingDefinitionError
             If a dataset with the same dataset type and data ID as one of those
-            given already exists in ``run
+            given already exists in ``run``, or if ``assume_new=True`` and at
+            least one dataset is not new.
         lsst.daf.butler.registry.MissingCollectionError
             Raised if ``run`` does not exist in the registry.
 
         Notes
         -----
-        This method is considered
-        implementation. Clients outside daf_butler package should not use this
-        method.
+        This method is considered middleware-internal.
         """
         raise NotImplementedError()
 
@@ -702,6 +700,27 @@ class Registry(ABC):
         """
         raise NotImplementedError()
 
+    @abstractmethod
+    def _fetch_run_dataset_ids(self, run: str) -> list[DatasetId]:
+        """Return the IDs of all datasets in the given ``RUN``
+        collection.
+
+        Parameters
+        ----------
+        run : `str`
+            Name of the collection.
+
+        Returns
+        -------
+        dataset_ids : `list` [`uuid.UUID`]
+            List of dataset IDs.
+
+        Notes
+        -----
+        This is a middleware-internal interface.
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
         """Remove datasets from the Registry.
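
For context (not part of the diff): both additions are middleware-internal, as their docstrings say. A hedged sketch of how internal transfer code might use them, assuming ``butler`` is writeable and ``refs`` is a list of ``DatasetRef`` that all belong to one already-registered RUN, with their dataset types and dimension records in place:

    # assume_new=True skips the reconciliation against existing rows; per the
    # docstring above, a pre-existing dataset ID then surfaces as
    # ConflictingDefinitionError rather than being silently skipped.
    imported = butler.registry._importDatasets(refs, expand=False, assume_new=True)

    # The companion helper lists every dataset ID already stored in a RUN.
    existing_ids = butler.registry._fetch_run_dataset_ids(refs[0].run)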
lsst/daf/butler/registry/_registry_base.py
CHANGED
@@ -231,20 +231,28 @@ class RegistryBase(Registry):
         collectionTypes: Iterable[CollectionType] = CollectionType.all(),
         flattenChains: bool = False,
     ) -> Iterator[DatasetAssociation]:
-        # queryCollections only accepts DatasetType.
         if isinstance(datasetType, str):
             datasetType = self.getDatasetType(datasetType)
-        resolved_collections = self.queryCollections(
-            collections, datasetType=datasetType, collectionTypes=collectionTypes, flattenChains=flattenChains
-        )
         with self._butler.query() as query:
+            resolved_collections = self.queryCollections(
+                collections,
+                datasetType=datasetType,
+                collectionTypes=collectionTypes,
+                flattenChains=flattenChains,
+            )
+            # It's annoyingly difficult to just do the collection query once,
+            # since query_info doesn't accept all the expression types that
+            # queryCollections does. But it's all cached anyway.
+            collection_info = {
+                info.name: info for info in self._butler.collections.query_info(resolved_collections)
+            }
             query = query.join_dataset_search(datasetType, resolved_collections)
             result = query.general(
                 datasetType.dimensions,
                 dataset_fields={datasetType.name: {"dataset_id", "run", "collection", "timespan"}},
                 find_first=False,
             )
-            yield from DatasetAssociation.from_query_result(result, datasetType)
+            yield from DatasetAssociation.from_query_result(result, datasetType, collection_info)
 
     def _resolve_dataset_types(self, dataset_types: object | None) -> list[str]:
         if dataset_types is None:
lsst/daf/butler/registry/datasets/byDimensions/_manager.py
CHANGED
@@ -617,6 +617,14 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
         dataset_type_names = set(get_dataset_type_name(dt) for dt in dataset_types)
         return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)
 
+    def fetch_run_dataset_ids(self, run: RunRecord) -> list[DatasetId]:
+        # Docstring inherited.
+        sql = sqlalchemy.select(self._static.dataset.c.id).where(
+            self._static.dataset.c[self._run_key_column] == run.key
+        )
+        with self._db.query(sql) as result:
+            return list(result.scalars())
+
     def ingest_date_dtype(self) -> type:
         """Return type of the ``ingest_date`` column."""
         schema_version = self.newSchemaVersion()
@@ -698,7 +706,7 @@
             for dataId, row in zip(data_id_list, rows, strict=True)
         ]
 
-    def import_(self, run: RunRecord, refs: list[DatasetRef]) -> None:
+    def import_(self, run: RunRecord, refs: list[DatasetRef], assume_new: bool = False) -> None:
         # Docstring inherited from DatasetRecordStorageManager.
         if not refs:
             # Just in case an empty mapping is provided we want to avoid
@@ -721,7 +729,6 @@
                 "Table cache should have been populated when looking up dataset types"
             )
         tags_table = self._get_tags_table(dynamic_tables)
-
         # Current timestamp, type depends on schema version.
         if self._use_astropy_ingest_date:
             # Astropy `now()` precision should be the same as `now()` which
@@ -729,11 +736,8 @@
             timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai)
         else:
             timestamp = sqlalchemy.sql.literal(datetime.datetime.now(datetime.UTC))
-
-        # We'll insert all new rows into a temporary table
-        table_spec = makeTagTableSpec(dimensions, type(self._collections), constraints=False)
         collection_fkey_name = self._collections.getCollectionForeignKeyName()
-
+        tags_rows = [
             {
                 "dataset_type_id": dataset_type_storage[ref.datasetType.name].dataset_type_id,
                 collection_fkey_name: run.key,
@@ -742,9 +746,29 @@
             }
             for ref in refs
         ]
+        if assume_new:
+            self._import_new(run, refs, dataset_type_storage, tags_table, tags_rows, timestamp)
+        else:
+            self._import_guarded(
+                run, refs, dimensions, dataset_type_storage, tags_table, tags_rows, timestamp
+            )
+
+    def _import_guarded(
+        self,
+        run: RunRecord,
+        refs: list[DatasetRef],
+        dimensions: DimensionGroup,
+        dataset_type_storage: dict[str, _DatasetRecordStorage],
+        tags_table: sqlalchemy.Table,
+        tags_rows: list[dict[str, object]],
+        timestamp: sqlalchemy.BindParameter[astropy.time.Time | datetime.datetime],
+    ) -> None:
+        # We'll insert all new rows into a temporary table
+        table_spec = makeTagTableSpec(dimensions, type(self._collections), constraints=False)
+        collection_fkey_name = self._collections.getCollectionForeignKeyName()
         with self._db.transaction(for_temp_tables=True), self._db.temporary_table(table_spec) as tmp_tags:
             # store all incoming data in a temporary table
-            self._db.insert(tmp_tags, *
+            self._db.insert(tmp_tags, *tags_rows)
             # There are some checks that we want to make for consistency
             # of the new datasets with existing ones.
             self._validate_import(dimensions, tags_table, tmp_tags, run)
@@ -764,17 +788,19 @@
                     timestamp.label("ingest_date"),
                 ),
             )
-
-            # is the first time this dataset type or these governor values
-            # will be inserted there.
-            summary = CollectionSummary()
-            summary.add_datasets(refs)
-            self._summaries.update(
-                run, [storage.dataset_type_id for storage in dataset_type_storage.values()], summary
-            )
+            self._update_summaries(run, refs, dataset_type_storage)
             # Copy from temp table into tags table.
             self._db.insert(tags_table, select=tmp_tags.select())
 
+    def _update_summaries(
+        self, run: RunRecord, refs: list[DatasetRef], dataset_type_storage: dict[str, _DatasetRecordStorage]
+    ) -> None:
+        summary = CollectionSummary()
+        summary.add_datasets(refs)
+        self._summaries.update(
+            run, [storage.dataset_type_id for storage in dataset_type_storage.values()], summary
+        )
+
     def _validate_import(
         self,
         dimensions: DimensionGroup,
@@ -899,6 +925,29 @@
                     f"but ID {row.new_dataset_id} in new collection {new_collection!r}."
                 )
 
+    def _import_new(
+        self,
+        run: RunRecord,
+        refs: list[DatasetRef],
+        dataset_type_storage: dict[str, _DatasetRecordStorage],
+        tags_table: sqlalchemy.Table,
+        tags_rows: list[dict[str, object]],
+        timestamp: sqlalchemy.BindParameter[astropy.time.Time | datetime.datetime],
+    ) -> None:
+        static_rows = [
+            {
+                "id": ref.id,
+                "dataset_type_id": dataset_type_storage[ref.datasetType.name].dataset_type_id,
+                self._run_key_column: run.key,
+                "ingest_date": timestamp.value,
+            }
+            for ref in refs
+        ]
+        with self._db.transaction():
+            self._db.insert(self._static.dataset, *static_rows)
+            self._update_summaries(run, refs, dataset_type_storage)
+            self._db.insert(tags_table, *tags_rows)
+
     def delete(self, datasets: Iterable[DatasetId | DatasetRef]) -> None:
         # Docstring inherited from DatasetRecordStorageManager.
         # Only delete from common dataset table; ON DELETE foreign key clauses
@@ -1425,7 +1474,7 @@
             )
             if "timespan" in fields:
                 tags_builder.joins.timespans[fields_key] = self._db.getTimespanRepresentation().fromLiteral(
-                    None
+                    Timespan(None, None)
                )
         calibs_builder: SqlSelectBuilder | None = None
         if CollectionType.CALIBRATION in collection_types:
lsst/daf/butler/registry/interfaces/_datasets.py
CHANGED
@@ -384,6 +384,23 @@ class DatasetRecordStorageManager(VersionedExtension):
         """
         raise NotImplementedError()
 
+    @abstractmethod
+    def fetch_run_dataset_ids(self, run: RunRecord) -> list[DatasetId]:
+        """Return the IDs of all datasets in the given ``RUN``
+        collection.
+
+        Parameters
+        ----------
+        run : `RunRecord`
+            Record describing the collection.
+
+        Returns
+        -------
+        dataset_ids : `list` [`uuid.UUID`]
+            List of dataset IDs.
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def ingest_date_dtype(self) -> type:
         """Return type of the ``ingest_date`` column."""
@@ -424,7 +441,7 @@ class DatasetRecordStorageManager(VersionedExtension):
         raise NotImplementedError()
 
     @abstractmethod
-    def import_(self, run: RunRecord, refs: list[DatasetRef]) -> None:
+    def import_(self, run: RunRecord, refs: list[DatasetRef], assume_new: bool = False) -> None:
         """Insert one or more dataset entries into the database.
 
         Parameters
@@ -435,6 +452,9 @@ class DatasetRecordStorageManager(VersionedExtension):
         refs : `list` [ `DatasetRef` ]
             List of datasets to be be inserted. All of the ``DatasetRef``
             ``run`` attributes must match the ``run`` parameter.
+        assume_new : `bool`, optional
+            If `True`, assume all datasets are new and skip conflict resolution
+            logic.
         """
         raise NotImplementedError()
 
lsst/daf/butler/registry/queries/_query_common.py
CHANGED
@@ -58,6 +58,9 @@ class CommonQueryArguments:
     def replaceCollections(self, collections: list[str]) -> CommonQueryArguments:
         return dataclasses.replace(self, collections=collections)
 
+    def replaceDatasetTypes(self, dataset_types: list[str]) -> CommonQueryArguments:
+        return dataclasses.replace(self, dataset_types=dataset_types)
+
 
 _T = TypeVar("_T", bound=QueryResultsBase)
 _U = TypeVar("_U", bound=QueryResultsBase)