lsst-daf-butler 30.0.0rc2__py3-none-any.whl → 30.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/daf/butler/_butler.py +27 -8
- lsst/daf/butler/_butler_collections.py +4 -4
- lsst/daf/butler/_butler_metrics.py +51 -2
- lsst/daf/butler/_dataset_provenance.py +1 -1
- lsst/daf/butler/_dataset_ref.py +1 -1
- lsst/daf/butler/_exceptions.py +2 -2
- lsst/daf/butler/_file_dataset.py +2 -1
- lsst/daf/butler/_formatter.py +14 -7
- lsst/daf/butler/_labeled_butler_factory.py +28 -8
- lsst/daf/butler/_query_all_datasets.py +2 -0
- lsst/daf/butler/_rubin/temporary_for_ingest.py +207 -0
- lsst/daf/butler/cli/cmd/_remove_runs.py +1 -12
- lsst/daf/butler/column_spec.py +4 -4
- lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
- lsst/daf/butler/configs/storageClasses.yaml +15 -0
- lsst/daf/butler/datastore/_datastore.py +21 -1
- lsst/daf/butler/datastore/record_data.py +1 -1
- lsst/daf/butler/datastore/stored_file_info.py +2 -2
- lsst/daf/butler/datastores/chainedDatastore.py +4 -0
- lsst/daf/butler/datastores/fileDatastore.py +26 -13
- lsst/daf/butler/datastores/file_datastore/get.py +4 -4
- lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +5 -1
- lsst/daf/butler/datastores/file_datastore/transfer.py +2 -2
- lsst/daf/butler/datastores/inMemoryDatastore.py +8 -0
- lsst/daf/butler/ddl.py +2 -2
- lsst/daf/butler/dimensions/_coordinate.py +11 -8
- lsst/daf/butler/dimensions/_record_set.py +1 -1
- lsst/daf/butler/dimensions/_records.py +9 -3
- lsst/daf/butler/direct_butler/_direct_butler.py +85 -51
- lsst/daf/butler/direct_query_driver/_driver.py +5 -4
- lsst/daf/butler/direct_query_driver/_result_page_converter.py +1 -1
- lsst/daf/butler/formatters/parquet.py +6 -6
- lsst/daf/butler/logging.py +9 -3
- lsst/daf/butler/nonempty_mapping.py +1 -1
- lsst/daf/butler/persistence_context.py +8 -5
- lsst/daf/butler/queries/_general_query_results.py +1 -1
- lsst/daf/butler/queries/driver.py +1 -1
- lsst/daf/butler/queries/expression_factory.py +2 -2
- lsst/daf/butler/queries/expressions/parser/exprTree.py +1 -1
- lsst/daf/butler/queries/expressions/parser/parserYacc.py +1 -1
- lsst/daf/butler/queries/overlaps.py +2 -2
- lsst/daf/butler/queries/tree/_column_set.py +1 -1
- lsst/daf/butler/registry/_collection_record_cache.py +1 -1
- lsst/daf/butler/registry/_collection_summary_cache.py +5 -4
- lsst/daf/butler/registry/_registry.py +4 -0
- lsst/daf/butler/registry/bridge/monolithic.py +17 -13
- lsst/daf/butler/registry/databases/postgresql.py +2 -1
- lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py +1 -1
- lsst/daf/butler/registry/datasets/byDimensions/_manager.py +53 -47
- lsst/daf/butler/registry/datasets/byDimensions/summaries.py +3 -2
- lsst/daf/butler/registry/expand_data_ids.py +93 -0
- lsst/daf/butler/registry/interfaces/_database.py +6 -1
- lsst/daf/butler/registry/interfaces/_datasets.py +2 -1
- lsst/daf/butler/registry/interfaces/_obscore.py +1 -1
- lsst/daf/butler/registry/obscore/_records.py +1 -1
- lsst/daf/butler/registry/obscore/_spatial.py +2 -2
- lsst/daf/butler/registry/queries/_results.py +2 -2
- lsst/daf/butler/registry/sql_registry.py +3 -25
- lsst/daf/butler/registry/wildcards.py +5 -5
- lsst/daf/butler/remote_butler/_get.py +1 -1
- lsst/daf/butler/remote_butler/_remote_butler.py +6 -1
- lsst/daf/butler/remote_butler/_remote_file_transfer_source.py +4 -0
- lsst/daf/butler/remote_butler/authentication/cadc.py +4 -3
- lsst/daf/butler/script/_pruneDatasets.py +4 -2
- lsst/daf/butler/script/configValidate.py +2 -2
- lsst/daf/butler/script/queryCollections.py +2 -2
- lsst/daf/butler/script/removeCollections.py +2 -0
- lsst/daf/butler/script/removeRuns.py +2 -0
- lsst/daf/butler/tests/cliCmdTestBase.py +2 -0
- lsst/daf/butler/tests/cliLogTestBase.py +2 -0
- lsst/daf/butler/tests/hybrid_butler.py +10 -2
- lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +191 -0
- lsst/daf/butler/tests/registry_data/spatial.py +4 -2
- lsst/daf/butler/tests/testFormatters.py +2 -2
- lsst/daf/butler/tests/utils.py +1 -1
- lsst/daf/butler/timespan_database_representation.py +3 -3
- lsst/daf/butler/transfers/_context.py +7 -6
- lsst/daf/butler/version.py +1 -1
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/METADATA +3 -2
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/RECORD +88 -85
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/WHEEL +1 -1
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/entry_points.txt +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/LICENSE +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/top_level.txt +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/zip-safe +0 -0
lsst/daf/butler/column_spec.py
CHANGED

@@ -109,12 +109,12 @@ class ColumnValueSerializer(ABC):
 
         Parameters
         ----------
-        value : `Any`
+        value : `typing.Any`
             Column value to be serialized.
 
         Returns
         -------
-        value : `Any`
+        value : `typing.Any`
             Column value in serializable format.
         """
         raise NotImplementedError
@@ -125,12 +125,12 @@ class ColumnValueSerializer(ABC):
 
         Parameters
         ----------
-        value : `Any`
+        value : `typing.Any`
             Serialized column value.
 
         Returns
        -------
-        value : `Any`
+        value : `typing.Any`
            Deserialized column value.
         """
         raise NotImplementedError

lsst/daf/butler/configs/datastores/formatters.yaml
CHANGED

@@ -100,3 +100,4 @@ VisitBackgroundModel: lsst.daf.butler.formatters.json.JsonFormatter
 VignettingCorrection: lsst.ts.observatory.control.utils.extras.vignetting_storage.VignettingCorrectionFormatter
 SSPAuxiliaryFile: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFileFormatter
 VisitGeometry: lsst.daf.butler.formatters.json.JsonFormatter
+ProvenanceQuantumGraph: lsst.pipe.base.quantum_graph.formatter.ProvenanceFormatter

lsst/daf/butler/configs/storageClasses.yaml
CHANGED

@@ -443,3 +443,18 @@ storageClasses:
     pytype: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFile
   VisitGeometry:
     pytype: lsst.obs.base.visit_geometry.VisitGeometry
+  ProvenanceQuantumGraph:
+    pytype: lsst.pipe.base.quantum_graph.ProvenanceQuantumGraph
+    parameters:
+      - import_mode  # lsst.pipe.base.pipeline_graph.TaskImportMode
+      - quanta  # iterable of uuid.UUID; quanta to read
+      - datasets  # iterable of uuid.UUID; datasets to read
+      - read_init_quanta  # bool, defaults to True; whether to read pre-exec-init info
+    derivedComponents:
+      packages: Packages  # ignores node parameters
+
+      # UUID keys can be quantum or data IDs (whichever is passed in via
+      # parameters). Nested lists are attempts to run the quantum (last is
+      # most recent).
+      logs: StructuredDataDict  # dict[uuid.UUID, list[ButlerLogRecords]]
+      metadata: StructuredDataDict  # dict[uuid.UUID, list[TaskMetadata]]

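The parameters listed above are read-time options, so a consumer could request only part of a provenance graph when calling Butler.get. A minimal sketch only: the repository path, collection, dataset type name, and data ID below are hypothetical; only the parameter names come from the storage class definition shown above.

    import uuid

    from lsst.daf.butler import Butler

    butler = Butler("/repo/example", collections=["u/example/run"])  # hypothetical repo and collection

    # Read a subset of quanta from a provenance graph and skip the
    # pre-exec-init information; parameter names match storageClasses.yaml.
    graph = butler.get(
        "provenance_quantum_graph",  # hypothetical dataset type name
        dataId={"instrument": "LSSTCam"},  # hypothetical data ID
        parameters={
            "quanta": [uuid.UUID("12345678-1234-5678-1234-567812345678")],
            "read_init_quanta": False,
        },
    )
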
lsst/daf/butler/datastore/_datastore.py
CHANGED

@@ -284,6 +284,14 @@ class DatasetRefURIs(abc.Sequence):
     def __repr__(self) -> str:
         return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
 
+    def iter_all(self) -> Iterator[ResourcePath]:
+        """Iterate over all URIs without regard to whether they are primary
+        or component.
+        """
+        if self.primaryURI is not None:
+            yield self.primaryURI
+        yield from self.componentURIs.values()
+
 
 class Datastore(FileTransferSource, metaclass=ABCMeta):
     """Datastore interface.
@@ -536,7 +544,7 @@ class Datastore(FileTransferSource, metaclass=ABCMeta):
 
         Returns
         -------
-        exists : `dict`[`DatasetRef`, `bool`]
+        exists : `dict` [`DatasetRef`, `bool`]
             Mapping of dataset to boolean indicating whether the dataset
             is known to the datastore.
         """
@@ -825,6 +833,10 @@ class Datastore(FileTransferSource, metaclass=ABCMeta):
             in an external system or if the file is to be compressed in place.
             It is up to the datastore whether this parameter is relevant.
 
+        Returns
+        -------
+        None
+
         Raises
         ------
         NotImplementedError
@@ -1143,6 +1155,10 @@ class Datastore(FileTransferSource, metaclass=ABCMeta):
             Determine whether errors should be ignored. When multiple
             refs are being trashed there will be no per-ref check.
 
+        Returns
+        -------
+        None
+
         Raises
         ------
         FileNotFoundError
@@ -1278,6 +1294,10 @@ class Datastore(FileTransferSource, metaclass=ABCMeta):
             Entity to compare with configuration retrieved using the
             specified lookup key.
 
+        Returns
+        -------
+        None
+
         Raises
         ------
         DatastoreValidationError

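For reference, the new iter_all helper means calling code no longer has to special-case the primary URI versus component URIs. A short sketch, assuming a repository and a resolved DatasetRef are already available; Butler.getURIs is the existing API that returns a DatasetRefURIs, while the repository path, dataset type, and data ID here are hypothetical.

    from lsst.daf.butler import Butler

    butler = Butler("/repo/example", collections=["u/example/run"])  # hypothetical repo/collection
    ref = butler.find_dataset("calexp", instrument="LSSTCam", visit=123, detector=0)  # hypothetical data ID

    uris = butler.getURIs(ref)  # DatasetRefURIs
    for uri in uris.iter_all():
        # Yields the primary URI (if any) followed by all component URIs.
        print(uri)
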
lsst/daf/butler/datastore/record_data.py
CHANGED

@@ -49,7 +49,7 @@ if TYPE_CHECKING:
 # Pydantic requires the possible value types to be explicitly enumerated in
 # order for `uuid.UUID` in particular to work. `typing.Any` does not work
 # here.
-_Record: TypeAlias = dict[str, int | str |
+_Record: TypeAlias = dict[str, int | str | None]
 
 
 class SerializedDatastoreRecordData(pydantic.BaseModel):

lsst/daf/butler/datastore/stored_file_info.py
CHANGED

@@ -423,8 +423,8 @@ def make_datastore_path_relative(path: str) -> str:
     path : `str`
         The file path from a `StoredFileInfo`.
 
-
-
+    Returns
+    -------
     normalized_path : `str`
         The original path, if it was relative. Otherwise, a version of it that
         was converted to a relative path, stripping URI scheme and netloc from

lsst/daf/butler/datastores/fileDatastore.py
CHANGED

@@ -1068,9 +1068,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
         # Work out the name we want this ingested file to have
         # inside the datastore
         tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
-        if not tgtLocation.uri.dirname().exists():
-            log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
-            tgtLocation.uri.dirname().mkdir()
 
         # if we are transferring from a local file to a remote location
         # it may be more efficient to get the size and checksum of the
@@ -1311,12 +1308,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
                 f"and storage class type ({required_pytype})"
             )
 
-        uri = location.uri
-
-        if not uri.dirname().exists():
-            log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
-            uri.dirname().mkdir()
-
         if self._transaction is None:
             raise RuntimeError("Attempting to write artifact without transaction enabled")
 
@@ -1332,6 +1323,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
         # Register a callback to try to delete the uploaded data if
         # something fails below
+        uri = location.uri
         self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
 
         # Need to record the specified formatter but if this is a V1 formatter
@@ -2160,7 +2152,13 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
         return artifact_map
 
-    def ingest_zip(
+    def ingest_zip(
+        self,
+        zip_path: ResourcePath,
+        transfer: str | None,
+        *,
+        dry_run: bool = False,
+    ) -> None:
         """Ingest an indexed Zip file and contents.
 
         The Zip file must have an index file as created by `retrieveArtifacts`.
@@ -2220,9 +2218,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
         else:
             # Name the zip file based on index contents.
             tgtLocation = self.locationFactory.fromPath(index.calculate_zip_file_path_in_store())
-            if not tgtLocation.uri.dirname().exists():
-                log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
-                tgtLocation.uri.dirname().mkdir()
 
         # Transfer the Zip file into the datastore.
         if not dry_run:
@@ -2987,6 +2982,10 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
             If `True`, output a log message for every validation error
             detected.
 
+        Returns
+        -------
+        None
+
         Raises
         ------
         DatastoreValidationError
@@ -3177,6 +3176,20 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
     def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
         # Docstring inherited from the base class.
+
+        # This call to 'bridge.check' filters out "partially deleted" datasets.
+        # Specifically, ones in the unusual edge state that:
+        # 1. They have an entry in the registry dataset tables
+        # 2. They were "trashed" from the datastore, so they are not
+        #    present in the "dataset_location" table.)
+        # 3. But the trash has not been "emptied", so there are still entries
+        #    in the "opaque" datastore records table.
+        #
+        # As far as I can tell, this can only occur in the case of a concurrent
+        # or aborted call to `Butler.pruneDatasets(unstore=True, purge=False)`.
+        # Datasets (with or without files existing on disk) can persist in
+        # this zombie state indefinitely, until someone manually empties
+        # the trash.
         exported_refs = list(self._bridge.check(refs))
         ids = {ref.id for ref in exported_refs}
         records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}

lsst/daf/butler/datastores/file_datastore/get.py
CHANGED

@@ -97,12 +97,12 @@ def generate_datastore_get_information(
 
     Parameters
     ----------
-    fileLocations : `list`[`DatasetLocationInformation`]
+    fileLocations : `list` [`DatasetLocationInformation`]
         List of file locations for this artifact and their associated datastore
         records.
     ref : `DatasetRef`
         The registry information associated with this artifact.
-    parameters :
+    parameters : `~collections.abc.Mapping` [`str`, `typing.Any`]
         `StorageClass` and `Formatter` parameters.
     readStorageClass : `StorageClass` | `None`, optional
         The StorageClass to use when ultimately returning the resulting object
@@ -255,12 +255,12 @@ def get_dataset_as_python_object_from_get_info(
 
     Parameters
     ----------
-    allGetInfo : `list`[`DatastoreFileGetInformation`]
+    allGetInfo : `list` [`DatastoreFileGetInformation`]
         Pre-processed information about each file associated with this
         artifact.
     ref : `DatasetRef`
         The registry information associated with this artifact.
-    parameters :
+    parameters : `~collections.abc.Mapping` [`str`, `typing.Any`]
         `StorageClass` and `Formatter` parameters.
     cache_manager : `AbstractDatastoreCacheManager`
         The cache manager to use for caching retrieved files.

lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py
CHANGED

@@ -274,7 +274,11 @@ class ZipIndex(BaseModel):
             Path to the Zip file.
         """
         with zip_path.open("rb") as fd, zipfile.ZipFile(fd) as zf:
-
+            return cls.from_open_zip(zf)
+
+    @classmethod
+    def from_open_zip(cls, zf: zipfile.ZipFile) -> Self:
+        json_data = zf.read(cls.index_name)
         return cls.model_validate_json(json_data)
 
 

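The new classmethod splits index parsing from file handling, which is convenient when the archive is already open. A short usage sketch, assuming a Zip produced by the retrieve-artifacts machinery (the local path is hypothetical); calculate_zip_file_path_in_store is the same index method used by fileDatastore.py above.

    import zipfile

    from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex

    # Hypothetical path to a Zip created by retrieveArtifacts, which embeds an index file.
    with zipfile.ZipFile("artifacts.zip") as zf:
        index = ZipIndex.from_open_zip(zf)
        print(index.calculate_zip_file_path_in_store())
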
lsst/daf/butler/datastores/file_datastore/transfer.py
CHANGED

@@ -55,8 +55,8 @@ def retrieve_file_transfer_records(
         Cache mapping datastore artifact to existence. Updated by
         this method with details of all artifacts tested.
 
-
-
+    Returns
+    -------
     files : `FileTransferMap`
         A dictionary from `DatasetId` to a list of `FileTransferRecord`,
         containing information about the files that were found for these

lsst/daf/butler/datastores/inMemoryDatastore.py
CHANGED

@@ -590,6 +590,10 @@ class InMemoryDatastore(GenericBaseDatastore[StoredMemoryItemInfo]):
         ignore_errors : `bool`, optional
             Indicate that errors should be ignored.
 
+        Returns
+        -------
+        None
+
         Raises
         ------
         FileNotFoundError
@@ -721,6 +725,10 @@ class InMemoryDatastore(GenericBaseDatastore[StoredMemoryItemInfo]):
             If `True`, output a log message for every validation error
             detected.
 
+        Returns
+        -------
+        None
+
         Raises
         ------
         DatastoreValidationError

lsst/daf/butler/ddl.py
CHANGED

@@ -537,7 +537,7 @@ class IndexSpec:
     ----------
     *columns : `str`
         Names of the columns to index.
-    **kwargs : `Any`
+    **kwargs : `typing.Any`
         Additional keyword arguments to pass directly to
         `sqlalchemy.schema.Index` constructor. This could be used to provide
         backend-specific options, e.g. to create a ``GIST`` index in PostgreSQL
@@ -556,7 +556,7 @@ class IndexSpec:
 
     kwargs: dict[str, Any]
     """Additional keyword arguments passed directly to
-    `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `Any` ]).
+    `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `typing.Any` ]).
     """
 
 

|
|
|
35
35
|
__all__ = (
|
|
36
36
|
"DataCoordinate",
|
|
37
37
|
"DataId",
|
|
38
|
-
"DataIdKey",
|
|
39
|
-
"DataIdValue",
|
|
40
38
|
"SerializedDataCoordinate",
|
|
41
39
|
"SerializedDataId",
|
|
42
40
|
)
|
|
@@ -55,7 +53,7 @@ from .._timespan import Timespan
|
|
|
55
53
|
from ..json import from_json_pydantic, to_json_pydantic
|
|
56
54
|
from ..persistence_context import PersistenceContextVars
|
|
57
55
|
from ._group import DimensionGroup
|
|
58
|
-
from ._records import
|
|
56
|
+
from ._records import DataIdValue, DimensionRecord, SerializedDimensionRecord
|
|
59
57
|
|
|
60
58
|
if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
|
|
61
59
|
from ..registry import Registry
|
|
@@ -559,11 +557,11 @@ class DataCoordinate:
|
|
|
559
557
|
Returns
|
|
560
558
|
-------
|
|
561
559
|
state : `bool`
|
|
562
|
-
If `True`,
|
|
563
|
-
|
|
564
|
-
for implied dimensions, and the
|
|
560
|
+
If `True`, ``__getitem__``, `get`, and ``__contains__`` (but not
|
|
561
|
+
``keys``!) will act as though the mapping includes key-value pairs
|
|
562
|
+
for implied dimensions, and the ``full`` property may be used. If
|
|
565
563
|
`False`, these operations only include key-value pairs for required
|
|
566
|
-
dimensions, and accessing
|
|
564
|
+
dimensions, and accessing ``full`` is an error. Always `True` if
|
|
567
565
|
there are no implied dimensions.
|
|
568
566
|
"""
|
|
569
567
|
raise NotImplementedError()
|
|
@@ -718,7 +716,7 @@ class DataCoordinate:
|
|
|
718
716
|
|
|
719
717
|
Parameters
|
|
720
718
|
----------
|
|
721
|
-
simple : `dict` of [`str`, `Any`]
|
|
719
|
+
simple : `dict` of [`str`, `typing.Any`]
|
|
722
720
|
The `dict` returned by `to_simple()`.
|
|
723
721
|
universe : `DimensionUniverse`
|
|
724
722
|
Object that manages all known dimensions.
|
|
@@ -755,6 +753,11 @@ class DataCoordinate:
|
|
|
755
753
|
to_json = to_json_pydantic
|
|
756
754
|
from_json: ClassVar[Callable[..., Self]] = cast(Callable[..., Self], classmethod(from_json_pydantic))
|
|
757
755
|
|
|
756
|
+
@property
|
|
757
|
+
def dataId(self) -> Self:
|
|
758
|
+
"""Return this `DataCoordinate` instance, unmodified."""
|
|
759
|
+
return self
|
|
760
|
+
|
|
758
761
|
|
|
759
762
|
DataId = DataCoordinate | Mapping[str, Any]
|
|
760
763
|
"""A type-annotation alias for signatures that accept both informal data ID
|
|
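The new dataId property lets a DataCoordinate answer the same attribute access as a DatasetRef, which fits the relaxed transfer_dimension_records_from signature later in this diff (it now accepts either type). A minimal sketch of the duck-typing this enables:

    from lsst.daf.butler import DataCoordinate, DatasetRef


    def data_id_of(item: DatasetRef | DataCoordinate) -> DataCoordinate:
        # DatasetRef.dataId is the ref's data ID; DataCoordinate.dataId now
        # returns the coordinate itself, so both types can be handled uniformly.
        return item.dataId
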
lsst/daf/butler/dimensions/_records.py
CHANGED

@@ -27,7 +27,13 @@
 
 from __future__ import annotations
 
-__all__ = (
+__all__ = (
+    "DataIdKey",
+    "DataIdValue",
+    "DimensionRecord",
+    "SerializedDimensionRecord",
+    "SerializedKeyValueDimensionRecord",
+)
 
 import itertools
 from collections.abc import Callable, Hashable
@@ -451,8 +457,8 @@ class DimensionRecord:
         registry : `lsst.daf.butler.Registry`, optional
             Registry from which a universe can be extracted. Can be `None`
             if universe is provided explicitly.
-        cacheKey : `Hashable` or `None`
-            If this is not None
+        cacheKey : `collections.abc.Hashable` or `None`
+            If this is not `None`, it will be used as a key for any cached
             reconstruction instead of calculating a value from the serialized
             format.
 

lsst/daf/butler/direct_butler/_direct_butler.py
CHANGED

@@ -884,6 +884,8 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
         if isinstance(datasetRefOrType, DatasetRef):
             if collections is not None:
                 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
+            if predict and not datasetRefOrType.dataId.hasRecords():
+                return datasetRefOrType.expanded(self.registry.expandDataId(datasetRefOrType.dataId))
             # May need to retrieve datastore records if requested.
             if datastore_records and datasetRefOrType._datastore_records is None:
                 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
@@ -936,6 +938,7 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
             run = self.run
             if run is None:
                 raise TypeError("Cannot predict dataset ID/location with run=None.")
+            dataId = self.registry.expandDataId(dataId)
             return DatasetRef(datasetType, dataId, run=run)
         else:
             if collections is None:
@@ -1655,29 +1658,9 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
         *,
         transfer_dimensions: bool = False,
         dry_run: bool = False,
+        skip_existing: bool = False,
     ) -> None:
-
-
-        The Zip file must have been created by `retrieve_artifacts_zip`.
-
-        Parameters
-        ----------
-        zip_file : `lsst.resources.ResourcePathExpression`
-            Path to the Zip file.
-        transfer : `str`, optional
-            Method to use to transfer the Zip into the datastore.
-        transfer_dimensions : `bool`, optional
-            If `True`, dimension record data associated with the new datasets
-            will be transferred from the Zip, if present.
-        dry_run : `bool`, optional
-            If `True` the ingest will be processed without any modifications
-            made to the target butler and as if the target butler did not
-            have any of the datasets.
-
-        Notes
-        -----
-        Run collections and dataset types are created as needed.
-        """
+        # Docstring inherited.
         if not self.isWriteable():
             raise TypeError("Butler is read-only.")
 
@@ -1703,6 +1686,29 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
                 datasets.append(dataset)
             processed_ids.update(unprocessed)
 
+        new_datasets, existing_datasets = self._partition_datasets_by_known(datasets)
+        if existing_datasets:
+            if skip_existing:
+                _LOG.info(
+                    "Skipping %d datasets from zip file %s which already exist in the repository.",
+                    len(existing_datasets),
+                    zip_file,
+                )
+            else:
+                raise ConflictingDefinitionError(
+                    f"Datastore already contains {len(existing_datasets)} of the given datasets."
+                    f" Example: {existing_datasets[0]}"
+                )
+            if new_datasets:
+                # Can not yet support partial zip ingests where a zip contains
+                # some datasets that are already in another zip.
+                raise ValueError(
+                    f"The given zip file from {zip_file} contains {len(new_datasets)} datasets not known "
+                    f"to this butler but also contains {len(existing_datasets)} datasets already known to "
+                    "this butler. Currently butler can not ingest zip files with overlapping content."
+                )
+            return
+
         # Ingest doesn't create the RUN collections so we have to do that
         # here.
         #
@@ -1721,7 +1727,18 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
             datasets, progress, dry_run=dry_run, transfer_dimensions=transfer_dimensions
         )
 
-
+        # Calculate some statistics based on the given list of datasets.
+        n_datasets = 0
+        for d in datasets:
+            n_datasets += len(d.refs)
+        srefs = "s" if n_datasets != 1 else ""
+
+        with (
+            self._metrics.instrument_ingest(
+                n_datasets, _LOG, msg=f"Ingesting zip file {zip_file} with {n_datasets} dataset{srefs}"
+            ),
+            self.transaction(),
+        ):
             # Do not need expanded dataset refs so can ignore the return value.
             self._ingest_file_datasets(datasets, import_info, progress, dry_run=dry_run)
 
@@ -1822,12 +1839,25 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
                 f" Example: {existing_datasets[0]}"
             )
 
+        # Calculate some statistics based on the given list of datasets.
+        n_files = len(datasets)
+        n_datasets = 0
+        for d in datasets:
+            n_datasets += len(d.refs)
+        sfiles = "s" if n_files != 1 else ""
+        srefs = "s" if n_datasets != 1 else ""
+
         # We use `datasets` rather `new_datasets` for the Registry
         # portion of this, to let it confirm that everything matches the
         # existing datasets.
         import_info = self._prepare_ingest_file_datasets(datasets, progress)
 
-        with
+        with (
+            self._metrics.instrument_ingest(
+                n_datasets, _LOG, msg=f"Ingesting {n_files} file{sfiles} with {n_datasets} dataset{srefs}"
+            ),
+            self.transaction(),
+        ):
             self._ingest_file_datasets(datasets, import_info, progress)
 
         # Bulk-insert everything into Datastore.
@@ -1982,7 +2012,7 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
             doImport(filename)  # type: ignore
 
     def transfer_dimension_records_from(
-        self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+        self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
     ) -> None:
         # Allowed dimensions in the target butler.
         elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
@@ -2012,16 +2042,13 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
             source_butler, data_ids, allowed_elements
         )
 
-        can_query = True if isinstance(source_butler, Butler) else False
-
         additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
         for original_element, record_mapping in primary_records.items():
             # Get dimensions that depend on this dimension.
             populated_by = self.dimensions.get_elements_populated_by(
                 self.dimensions[original_element.name]  # type: ignore
             )
-
-            for data_id in record_mapping.keys():
+            if populated_by:
                 for element in populated_by:
                     if element not in allowed_elements:
                         continue
@@ -2040,28 +2067,32 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
                         # have to be scanned.
                         continue
 
-                    if
-
-
-
+                    if record_mapping:
+                        if not isinstance(source_butler, Butler):
+                            raise RuntimeError(
+                                f"Transferring populated_by records like {element.name}"
+                                " requires a full Butler."
+                            )
 
-
-
-
-
-
-
-                            additional_records[record.definition].setdefault(record.dataId, record)
+                        with source_butler.query() as query:
+                            records = query.join_data_coordinates(record_mapping.keys()).dimension_records(
+                                element.name
+                            )
+                            for record in records:
+                                additional_records[record.definition].setdefault(record.dataId, record)
 
         # The next step is to walk back through the additional records to
         # pick up any missing content (such as visit_definition needing to
         # know the exposure). Want to ensure we do not request records we
         # already have.
         missing_data_ids = set()
-        for
+        for record_mapping in additional_records.values():
             for data_id in record_mapping.keys():
-
-
+                for dimension in data_id.dimensions.required:
+                    element = source_butler.dimensions[dimension]
+                    dimension_key = data_id.subset(dimension)
+                    if dimension_key not in primary_records[element]:
+                        missing_data_ids.add(dimension_key)
 
         # Fill out the new records. Assume that these new records do not
         # also need to carry over additional populated_by records.
@@ -2078,19 +2109,19 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
     def _extract_dimension_records_from_data_ids(
         self,
         source_butler: LimitedButler | Butler,
-        data_ids:
+        data_ids: Iterable[DataCoordinate],
         allowed_elements: frozenset[DimensionElement],
     ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
         dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
 
+        data_ids = set(data_ids)
+        if not all(data_id.hasRecords() for data_id in data_ids):
+            if isinstance(source_butler, Butler):
+                data_ids = source_butler._expand_data_ids(data_ids)
+            else:
+                raise TypeError("Input butler needs to be a full butler to expand DataId.")
+
         for data_id in data_ids:
-            # Need an expanded record, if not expanded that we need a full
-            # butler with registry (allow mocks with registry too).
-            if not data_id.hasRecords():
-                if registry := getattr(source_butler, "registry", None):
-                    data_id = registry.expandDataId(data_id)
-                else:
-                    raise TypeError("Input butler needs to be a full butler to expand DataId.")
             # If this butler doesn't know about a dimension in the source
             # butler things will break later.
             for element_name in data_id.dimensions.elements:
@@ -2569,6 +2600,9 @@ class DirectButler(Butler):  # numpydoc ignore=PR02
         """Immediately load caches that are used for common operations."""
         self._registry.preload_cache(load_dimension_record_cache=load_dimension_record_cache)
 
+    def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+        return self._registry.expand_data_ids(data_ids)
+
     _config: ButlerConfig
     """Configuration for this Butler instance."""
 
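The practical effect of the skip_existing keyword shown above is that re-ingesting a Zip whose datasets are already registered can now be tolerated instead of raising ConflictingDefinitionError. A usage sketch; the repository and Zip paths are hypothetical, and only skip_existing is new in this release.

    from lsst.daf.butler import Butler

    butler = Butler("/repo/example", writeable=True)  # hypothetical repository

    # Ingest a Zip produced by retrieve_artifacts_zip; datasets already known
    # to this butler are skipped rather than treated as a conflict.
    butler.ingest_zip("artifacts.zip", transfer="copy", skip_existing=True)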