lsst-daf-butler 30.0.0rc2__py3-none-any.whl → 30.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. lsst/daf/butler/_butler.py +27 -8
  2. lsst/daf/butler/_butler_collections.py +4 -4
  3. lsst/daf/butler/_butler_metrics.py +51 -2
  4. lsst/daf/butler/_dataset_provenance.py +1 -1
  5. lsst/daf/butler/_dataset_ref.py +1 -1
  6. lsst/daf/butler/_exceptions.py +2 -2
  7. lsst/daf/butler/_file_dataset.py +2 -1
  8. lsst/daf/butler/_formatter.py +14 -7
  9. lsst/daf/butler/_labeled_butler_factory.py +28 -8
  10. lsst/daf/butler/_query_all_datasets.py +2 -0
  11. lsst/daf/butler/_rubin/temporary_for_ingest.py +207 -0
  12. lsst/daf/butler/cli/cmd/_remove_runs.py +1 -12
  13. lsst/daf/butler/column_spec.py +4 -4
  14. lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
  15. lsst/daf/butler/configs/storageClasses.yaml +15 -0
  16. lsst/daf/butler/datastore/_datastore.py +21 -1
  17. lsst/daf/butler/datastore/record_data.py +1 -1
  18. lsst/daf/butler/datastore/stored_file_info.py +2 -2
  19. lsst/daf/butler/datastores/chainedDatastore.py +4 -0
  20. lsst/daf/butler/datastores/fileDatastore.py +26 -13
  21. lsst/daf/butler/datastores/file_datastore/get.py +4 -4
  22. lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +5 -1
  23. lsst/daf/butler/datastores/file_datastore/transfer.py +2 -2
  24. lsst/daf/butler/datastores/inMemoryDatastore.py +8 -0
  25. lsst/daf/butler/ddl.py +2 -2
  26. lsst/daf/butler/dimensions/_coordinate.py +11 -8
  27. lsst/daf/butler/dimensions/_record_set.py +1 -1
  28. lsst/daf/butler/dimensions/_records.py +9 -3
  29. lsst/daf/butler/direct_butler/_direct_butler.py +85 -51
  30. lsst/daf/butler/direct_query_driver/_driver.py +5 -4
  31. lsst/daf/butler/direct_query_driver/_result_page_converter.py +1 -1
  32. lsst/daf/butler/formatters/parquet.py +6 -6
  33. lsst/daf/butler/logging.py +9 -3
  34. lsst/daf/butler/nonempty_mapping.py +1 -1
  35. lsst/daf/butler/persistence_context.py +8 -5
  36. lsst/daf/butler/queries/_general_query_results.py +1 -1
  37. lsst/daf/butler/queries/driver.py +1 -1
  38. lsst/daf/butler/queries/expression_factory.py +2 -2
  39. lsst/daf/butler/queries/expressions/parser/exprTree.py +1 -1
  40. lsst/daf/butler/queries/expressions/parser/parserYacc.py +1 -1
  41. lsst/daf/butler/queries/overlaps.py +2 -2
  42. lsst/daf/butler/queries/tree/_column_set.py +1 -1
  43. lsst/daf/butler/registry/_collection_record_cache.py +1 -1
  44. lsst/daf/butler/registry/_collection_summary_cache.py +5 -4
  45. lsst/daf/butler/registry/_registry.py +4 -0
  46. lsst/daf/butler/registry/bridge/monolithic.py +17 -13
  47. lsst/daf/butler/registry/databases/postgresql.py +2 -1
  48. lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py +1 -1
  49. lsst/daf/butler/registry/datasets/byDimensions/_manager.py +53 -47
  50. lsst/daf/butler/registry/datasets/byDimensions/summaries.py +3 -2
  51. lsst/daf/butler/registry/expand_data_ids.py +93 -0
  52. lsst/daf/butler/registry/interfaces/_database.py +6 -1
  53. lsst/daf/butler/registry/interfaces/_datasets.py +2 -1
  54. lsst/daf/butler/registry/interfaces/_obscore.py +1 -1
  55. lsst/daf/butler/registry/obscore/_records.py +1 -1
  56. lsst/daf/butler/registry/obscore/_spatial.py +2 -2
  57. lsst/daf/butler/registry/queries/_results.py +2 -2
  58. lsst/daf/butler/registry/sql_registry.py +3 -25
  59. lsst/daf/butler/registry/wildcards.py +5 -5
  60. lsst/daf/butler/remote_butler/_get.py +1 -1
  61. lsst/daf/butler/remote_butler/_remote_butler.py +6 -1
  62. lsst/daf/butler/remote_butler/_remote_file_transfer_source.py +4 -0
  63. lsst/daf/butler/remote_butler/authentication/cadc.py +4 -3
  64. lsst/daf/butler/script/_pruneDatasets.py +4 -2
  65. lsst/daf/butler/script/configValidate.py +2 -2
  66. lsst/daf/butler/script/queryCollections.py +2 -2
  67. lsst/daf/butler/script/removeCollections.py +2 -0
  68. lsst/daf/butler/script/removeRuns.py +2 -0
  69. lsst/daf/butler/tests/cliCmdTestBase.py +2 -0
  70. lsst/daf/butler/tests/cliLogTestBase.py +2 -0
  71. lsst/daf/butler/tests/hybrid_butler.py +10 -2
  72. lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +191 -0
  73. lsst/daf/butler/tests/registry_data/spatial.py +4 -2
  74. lsst/daf/butler/tests/testFormatters.py +2 -2
  75. lsst/daf/butler/tests/utils.py +1 -1
  76. lsst/daf/butler/timespan_database_representation.py +3 -3
  77. lsst/daf/butler/transfers/_context.py +7 -6
  78. lsst/daf/butler/version.py +1 -1
  79. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/METADATA +3 -2
  80. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/RECORD +88 -85
  81. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/WHEEL +1 -1
  82. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/entry_points.txt +0 -0
  83. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/COPYRIGHT +0 -0
  84. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/LICENSE +0 -0
  85. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/bsd_license.txt +0 -0
  86. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/licenses/gpl-v3.0.txt +0 -0
  87. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/top_level.txt +0 -0
  88. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1.dist-info}/zip-safe +0 -0
@@ -109,12 +109,12 @@ class ColumnValueSerializer(ABC):
 
  Parameters
  ----------
- value : `Any`
+ value : `typing.Any`
  Column value to be serialized.
 
  Returns
  -------
- value : `Any`
+ value : `typing.Any`
  Column value in serializable format.
  """
  raise NotImplementedError
@@ -125,12 +125,12 @@ class ColumnValueSerializer(ABC):
 
  Parameters
  ----------
- value : `Any`
+ value : `typing.Any`
  Serialized column value.
 
  Returns
  -------
- value : `Any`
+ value : `typing.Any`
  Deserialized column value.
  """
  raise NotImplementedError
@@ -100,3 +100,4 @@ VisitBackgroundModel: lsst.daf.butler.formatters.json.JsonFormatter
  VignettingCorrection: lsst.ts.observatory.control.utils.extras.vignetting_storage.VignettingCorrectionFormatter
  SSPAuxiliaryFile: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFileFormatter
  VisitGeometry: lsst.daf.butler.formatters.json.JsonFormatter
+ ProvenanceQuantumGraph: lsst.pipe.base.quantum_graph.formatter.ProvenanceFormatter
@@ -443,3 +443,18 @@ storageClasses:
  pytype: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFile
  VisitGeometry:
  pytype: lsst.obs.base.visit_geometry.VisitGeometry
+ ProvenanceQuantumGraph:
+ pytype: lsst.pipe.base.quantum_graph.ProvenanceQuantumGraph
+ parameters:
+ - import_mode # lsst.pipe.base.pipeline_graph.TaskImportMode
+ - quanta # iterable of uuid.UUID; quanta to read
+ - datasets # iterable of uuid.UUID; datasets to read
+ - read_init_quanta # bool, defaults to True; whether to read pre-exec-init info
+ derivedComponents:
+ packages: Packages # ignores node parameters
+
+ # UUID keys can be quantum or data IDs (whichever is passed in via
+ # parameters). Nested lists are attempts to run the quantum (last is
+ # most recent).
+ logs: StructuredDataDict # dict[uuid.UUID, list[ButlerLogRecords]]
+ metadata: StructuredDataDict # dict[uuid.UUID, list[TaskMetadata]]
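The new ProvenanceQuantumGraph storage class above declares read-time parameters (import_mode, quanta, datasets, read_init_quanta) and a packages derived component. A minimal sketch of how a caller might pass those parameters through Butler.get; the repository path, collection, dataset type name, data ID, and UUID are illustrative placeholders, not taken from this diff:

    import uuid
    from lsst.daf.butler import Butler

    # Placeholder repository and collection.
    butler = Butler("/repo/example", collections=["u/someone/prov"])

    # Ask the formatter to materialize only the listed quanta; the dataset
    # type name "provenance_graph" and the UUID are hypothetical.
    graph = butler.get(
        "provenance_graph",
        instrument="LSSTCam",
        parameters={"quanta": [uuid.UUID("00000000-0000-0000-0000-000000000000")]},
    )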
@@ -284,6 +284,14 @@ class DatasetRefURIs(abc.Sequence):
  def __repr__(self) -> str:
  return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})"
 
+ def iter_all(self) -> Iterator[ResourcePath]:
+ """Iterate over all URIs without regard to whether they are primary
+ or component.
+ """
+ if self.primaryURI is not None:
+ yield self.primaryURI
+ yield from self.componentURIs.values()
+
 
  class Datastore(FileTransferSource, metaclass=ABCMeta):
  """Datastore interface.
@@ -536,7 +544,7 @@ class Datastore(FileTransferSource, metaclass=ABCMeta):
 
  Returns
  -------
- exists : `dict`[`DatasetRef`, `bool`]
+ exists : `dict` [`DatasetRef`, `bool`]
  Mapping of dataset to boolean indicating whether the dataset
  is known to the datastore.
  """
@@ -825,6 +833,10 @@ class Datastore(FileTransferSource, metaclass=ABCMeta):
  in an external system or if the file is to be compressed in place.
  It is up to the datastore whether this parameter is relevant.
 
+ Returns
+ -------
+ None
+
  Raises
  ------
  NotImplementedError
@@ -1143,6 +1155,10 @@ class Datastore(FileTransferSource, metaclass=ABCMeta):
  Determine whether errors should be ignored. When multiple
  refs are being trashed there will be no per-ref check.
 
+ Returns
+ -------
+ None
+
  Raises
  ------
  FileNotFoundError
@@ -1278,6 +1294,10 @@ class Datastore(FileTransferSource, metaclass=ABCMeta):
  Entity to compare with configuration retrieved using the
  specified lookup key.
 
+ Returns
+ -------
+ None
+
  Raises
  ------
  DatastoreValidationError
@@ -49,7 +49,7 @@ if TYPE_CHECKING:
  # Pydantic requires the possible value types to be explicitly enumerated in
  # order for `uuid.UUID` in particular to work. `typing.Any` does not work
  # here.
- _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
+ _Record: TypeAlias = dict[str, int | str | None]
 
 
  class SerializedDatastoreRecordData(pydantic.BaseModel):
@@ -423,8 +423,8 @@ def make_datastore_path_relative(path: str) -> str:
  path : `str`
  The file path from a `StoredFileInfo`.
 
- Return
- ------
+ Returns
+ -------
  normalized_path : `str`
  The original path, if it was relative. Otherwise, a version of it that
  was converted to a relative path, stripping URI scheme and netloc from
@@ -1077,6 +1077,10 @@ class ChainedDatastore(Datastore):
  If `True`, output a log message for every validation error
  detected.
 
+ Returns
+ -------
+ None
+
  Raises
  ------
  DatastoreValidationError
@@ -1068,9 +1068,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  # Work out the name we want this ingested file to have
  # inside the datastore
  tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
- if not tgtLocation.uri.dirname().exists():
- log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
- tgtLocation.uri.dirname().mkdir()
 
  # if we are transferring from a local file to a remote location
  # it may be more efficient to get the size and checksum of the
@@ -1311,12 +1308,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  f"and storage class type ({required_pytype})"
  )
 
- uri = location.uri
-
- if not uri.dirname().exists():
- log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
- uri.dirname().mkdir()
-
  if self._transaction is None:
  raise RuntimeError("Attempting to write artifact without transaction enabled")
 
@@ -1332,6 +1323,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
  # Register a callback to try to delete the uploaded data if
  # something fails below
+ uri = location.uri
  self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)
 
  # Need to record the specified formatter but if this is a V1 formatter
@@ -2160,7 +2152,13 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
  return artifact_map
 
- def ingest_zip(self, zip_path: ResourcePath, transfer: str | None, *, dry_run: bool = False) -> None:
+ def ingest_zip(
+ self,
+ zip_path: ResourcePath,
+ transfer: str | None,
+ *,
+ dry_run: bool = False,
+ ) -> None:
  """Ingest an indexed Zip file and contents.
 
  The Zip file must have an index file as created by `retrieveArtifacts`.
@@ -2220,9 +2218,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  else:
  # Name the zip file based on index contents.
  tgtLocation = self.locationFactory.fromPath(index.calculate_zip_file_path_in_store())
- if not tgtLocation.uri.dirname().exists():
- log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
- tgtLocation.uri.dirname().mkdir()
 
  # Transfer the Zip file into the datastore.
  if not dry_run:
@@ -2987,6 +2982,10 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  If `True`, output a log message for every validation error
  detected.
 
+ Returns
+ -------
+ None
+
  Raises
  ------
  DatastoreValidationError
@@ -3177,6 +3176,20 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
  def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
  # Docstring inherited from the base class.
+
+ # This call to 'bridge.check' filters out "partially deleted" datasets.
+ # Specifically, ones in the unusual edge state that:
+ # 1. They have an entry in the registry dataset tables
+ # 2. They were "trashed" from the datastore, so they are not
+ # present in the "dataset_location" table.)
+ # 3. But the trash has not been "emptied", so there are still entries
+ # in the "opaque" datastore records table.
+ #
+ # As far as I can tell, this can only occur in the case of a concurrent
+ # or aborted call to `Butler.pruneDatasets(unstore=True, purge=False)`.
+ # Datasets (with or without files existing on disk) can persist in
+ # this zombie state indefinitely, until someone manually empties
+ # the trash.
  exported_refs = list(self._bridge.check(refs))
  ids = {ref.id for ref in exported_refs}
  records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
@@ -97,12 +97,12 @@ def generate_datastore_get_information(
 
  Parameters
  ----------
- fileLocations : `list`[`DatasetLocationInformation`]
+ fileLocations : `list` [`DatasetLocationInformation`]
  List of file locations for this artifact and their associated datastore
  records.
  ref : `DatasetRef`
  The registry information associated with this artifact.
- parameters : `Mapping`[`str`, `Any`]
+ parameters : `~collections.abc.Mapping` [`str`, `typing.Any`]
  `StorageClass` and `Formatter` parameters.
  readStorageClass : `StorageClass` | `None`, optional
  The StorageClass to use when ultimately returning the resulting object
@@ -255,12 +255,12 @@ def get_dataset_as_python_object_from_get_info(
 
  Parameters
  ----------
- allGetInfo : `list`[`DatastoreFileGetInformation`]
+ allGetInfo : `list` [`DatastoreFileGetInformation`]
  Pre-processed information about each file associated with this
  artifact.
  ref : `DatasetRef`
  The registry information associated with this artifact.
- parameters : `Mapping`[`str`, `Any`]
+ parameters : `~collections.abc.Mapping` [`str`, `typing.Any`]
  `StorageClass` and `Formatter` parameters.
  cache_manager : `AbstractDatastoreCacheManager`
  The cache manager to use for caching retrieved files.
@@ -274,7 +274,11 @@ class ZipIndex(BaseModel):
  Path to the Zip file.
  """
  with zip_path.open("rb") as fd, zipfile.ZipFile(fd) as zf:
- json_data = zf.read(cls.index_name)
+ return cls.from_open_zip(zf)
+
+ @classmethod
+ def from_open_zip(cls, zf: zipfile.ZipFile) -> Self:
+ json_data = zf.read(cls.index_name)
  return cls.model_validate_json(json_data)
 
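The ZipIndex refactoring above splits index loading into from_open_zip so an already-open zipfile.ZipFile can be reused. A hedged sketch; the import path is inferred from the file list above and the artifact path is a placeholder:

    import zipfile
    from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex

    # "artifacts.zip" stands in for a Zip produced by retrieveArtifacts.
    with zipfile.ZipFile("artifacts.zip") as zf:
        index = ZipIndex.from_open_zip(zf)  # read and validate the embedded JSON index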
@@ -55,8 +55,8 @@ def retrieve_file_transfer_records(
  Cache mapping datastore artifact to existence. Updated by
  this method with details of all artifacts tested.
 
- Return
- ------
+ Returns
+ -------
  files : `FileTransferMap`
  A dictionary from `DatasetId` to a list of `FileTransferRecord`,
  containing information about the files that were found for these
@@ -590,6 +590,10 @@ class InMemoryDatastore(GenericBaseDatastore[StoredMemoryItemInfo]):
  ignore_errors : `bool`, optional
  Indicate that errors should be ignored.
 
+ Returns
+ -------
+ None
+
  Raises
  ------
  FileNotFoundError
@@ -721,6 +725,10 @@ class InMemoryDatastore(GenericBaseDatastore[StoredMemoryItemInfo]):
  If `True`, output a log message for every validation error
  detected.
 
+ Returns
+ -------
+ None
+
  Raises
  ------
  DatastoreValidationError
lsst/daf/butler/ddl.py CHANGED
@@ -537,7 +537,7 @@ class IndexSpec:
  ----------
  *columns : `str`
  Names of the columns to index.
- **kwargs : `Any`
+ **kwargs : `typing.Any`
  Additional keyword arguments to pass directly to
  `sqlalchemy.schema.Index` constructor. This could be used to provide
  backend-specific options, e.g. to create a ``GIST`` index in PostgreSQL
@@ -556,7 +556,7 @@ class IndexSpec:
 
  kwargs: dict[str, Any]
  """Additional keyword arguments passed directly to
- `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `Any` ]).
+ `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `typing.Any` ]).
  """
 
 
@@ -35,8 +35,6 @@ from __future__ import annotations
  __all__ = (
  "DataCoordinate",
  "DataId",
- "DataIdKey",
- "DataIdValue",
  "SerializedDataCoordinate",
  "SerializedDataId",
  )
@@ -55,7 +53,7 @@ from .._timespan import Timespan
  from ..json import from_json_pydantic, to_json_pydantic
  from ..persistence_context import PersistenceContextVars
  from ._group import DimensionGroup
- from ._records import DataIdKey, DataIdValue, DimensionRecord, SerializedDimensionRecord
+ from ._records import DataIdValue, DimensionRecord, SerializedDimensionRecord
 
  if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
  from ..registry import Registry
@@ -559,11 +557,11 @@ class DataCoordinate:
  Returns
  -------
  state : `bool`
- If `True`, `__getitem__`, `get`, and `__contains__` (but not
- `keys`!) will act as though the mapping includes key-value pairs
- for implied dimensions, and the `full` property may be used. If
+ If `True`, ``__getitem__``, `get`, and ``__contains__`` (but not
+ ``keys``!) will act as though the mapping includes key-value pairs
+ for implied dimensions, and the ``full`` property may be used. If
  `False`, these operations only include key-value pairs for required
- dimensions, and accessing `full` is an error. Always `True` if
+ dimensions, and accessing ``full`` is an error. Always `True` if
  there are no implied dimensions.
  """
  raise NotImplementedError()
@@ -718,7 +716,7 @@ class DataCoordinate:
 
  Parameters
  ----------
- simple : `dict` of [`str`, `Any`]
+ simple : `dict` of [`str`, `typing.Any`]
  The `dict` returned by `to_simple()`.
  universe : `DimensionUniverse`
  Object that manages all known dimensions.
@@ -755,6 +753,11 @@ class DataCoordinate:
  to_json = to_json_pydantic
  from_json: ClassVar[Callable[..., Self]] = cast(Callable[..., Self], classmethod(from_json_pydantic))
 
+ @property
+ def dataId(self) -> Self:
+ """Return this `DataCoordinate` instance, unmodified."""
+ return self
+
 
  DataId = DataCoordinate | Mapping[str, Any]
  """A type-annotation alias for signatures that accept both informal data ID
@@ -97,7 +97,7 @@ def fail_record_lookup(
 
  Returns
  -------
- record : `DimensionRecord`
+ record : `DimensionRecord`
  Never returned; this function always raises `LookupError`.
  """
  raise LookupError(
@@ -27,7 +27,13 @@
 
  from __future__ import annotations
 
- __all__ = ("DimensionRecord", "SerializedDimensionRecord", "SerializedKeyValueDimensionRecord")
+ __all__ = (
+ "DataIdKey",
+ "DataIdValue",
+ "DimensionRecord",
+ "SerializedDimensionRecord",
+ "SerializedKeyValueDimensionRecord",
+ )
 
  import itertools
  from collections.abc import Callable, Hashable
@@ -451,8 +457,8 @@ class DimensionRecord:
  registry : `lsst.daf.butler.Registry`, optional
  Registry from which a universe can be extracted. Can be `None`
  if universe is provided explicitly.
- cacheKey : `Hashable` or `None`
- If this is not None, it will be used as a key for any cached
+ cacheKey : `collections.abc.Hashable` or `None`
+ If this is not `None`, it will be used as a key for any cached
  reconstruction instead of calculating a value from the serialized
  format.
 
@@ -884,6 +884,8 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  if isinstance(datasetRefOrType, DatasetRef):
  if collections is not None:
  warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
+ if predict and not datasetRefOrType.dataId.hasRecords():
+ return datasetRefOrType.expanded(self.registry.expandDataId(datasetRefOrType.dataId))
  # May need to retrieve datastore records if requested.
  if datastore_records and datasetRefOrType._datastore_records is None:
  datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
@@ -936,6 +938,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  run = self.run
  if run is None:
  raise TypeError("Cannot predict dataset ID/location with run=None.")
+ dataId = self.registry.expandDataId(dataId)
  return DatasetRef(datasetType, dataId, run=run)
  else:
  if collections is None:
@@ -1655,29 +1658,9 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  *,
  transfer_dimensions: bool = False,
  dry_run: bool = False,
+ skip_existing: bool = False,
  ) -> None:
- """Ingest a Zip file into this butler.
-
- The Zip file must have been created by `retrieve_artifacts_zip`.
-
- Parameters
- ----------
- zip_file : `lsst.resources.ResourcePathExpression`
- Path to the Zip file.
- transfer : `str`, optional
- Method to use to transfer the Zip into the datastore.
- transfer_dimensions : `bool`, optional
- If `True`, dimension record data associated with the new datasets
- will be transferred from the Zip, if present.
- dry_run : `bool`, optional
- If `True` the ingest will be processed without any modifications
- made to the target butler and as if the target butler did not
- have any of the datasets.
-
- Notes
- -----
- Run collections and dataset types are created as needed.
- """
+ # Docstring inherited.
  if not self.isWriteable():
  raise TypeError("Butler is read-only.")
 
@@ -1703,6 +1686,29 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  datasets.append(dataset)
  processed_ids.update(unprocessed)
 
+ new_datasets, existing_datasets = self._partition_datasets_by_known(datasets)
+ if existing_datasets:
+ if skip_existing:
+ _LOG.info(
+ "Skipping %d datasets from zip file %s which already exist in the repository.",
+ len(existing_datasets),
+ zip_file,
+ )
+ else:
+ raise ConflictingDefinitionError(
+ f"Datastore already contains {len(existing_datasets)} of the given datasets."
+ f" Example: {existing_datasets[0]}"
+ )
+ if new_datasets:
+ # Can not yet support partial zip ingests where a zip contains
+ # some datasets that are already in another zip.
+ raise ValueError(
+ f"The given zip file from {zip_file} contains {len(new_datasets)} datasets not known "
+ f"to this butler but also contains {len(existing_datasets)} datasets already known to "
+ "this butler. Currently butler can not ingest zip files with overlapping content."
+ )
+ return
+
  # Ingest doesn't create the RUN collections so we have to do that
  # here.
  #
@@ -1721,7 +1727,18 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  datasets, progress, dry_run=dry_run, transfer_dimensions=transfer_dimensions
  )
 
- with self.transaction():
+ # Calculate some statistics based on the given list of datasets.
+ n_datasets = 0
+ for d in datasets:
+ n_datasets += len(d.refs)
+ srefs = "s" if n_datasets != 1 else ""
+
+ with (
+ self._metrics.instrument_ingest(
+ n_datasets, _LOG, msg=f"Ingesting zip file {zip_file} with {n_datasets} dataset{srefs}"
+ ),
+ self.transaction(),
+ ):
  # Do not need expanded dataset refs so can ignore the return value.
  self._ingest_file_datasets(datasets, import_info, progress, dry_run=dry_run)
 
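With the skip_existing flag and ingest metrics added above, re-ingesting a Zip whose datasets are already fully present is skipped with a log message instead of raising ConflictingDefinitionError (a mix of known and unknown datasets still raises). A hedged usage sketch, assuming the keyword is also exposed on the public Butler.ingest_zip; the repository path and transfer mode are placeholders:

    from lsst.daf.butler import Butler

    butler = Butler("/repo/example", writeable=True)  # placeholder repository

    # Ingest a Zip created by retrieve_artifacts_zip(); datasets already known
    # to this repository are skipped rather than treated as a conflict.
    butler.ingest_zip("artifacts.zip", transfer="copy", skip_existing=True)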
@@ -1822,12 +1839,25 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  f" Example: {existing_datasets[0]}"
  )
 
+ # Calculate some statistics based on the given list of datasets.
+ n_files = len(datasets)
+ n_datasets = 0
+ for d in datasets:
+ n_datasets += len(d.refs)
+ sfiles = "s" if n_files != 1 else ""
+ srefs = "s" if n_datasets != 1 else ""
+
  # We use `datasets` rather `new_datasets` for the Registry
  # portion of this, to let it confirm that everything matches the
  # existing datasets.
  import_info = self._prepare_ingest_file_datasets(datasets, progress)
 
- with self.transaction():
+ with (
+ self._metrics.instrument_ingest(
+ n_datasets, _LOG, msg=f"Ingesting {n_files} file{sfiles} with {n_datasets} dataset{srefs}"
+ ),
+ self.transaction(),
+ ):
  self._ingest_file_datasets(datasets, import_info, progress)
 
  # Bulk-insert everything into Datastore.
@@ -1982,7 +2012,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  doImport(filename) # type: ignore
 
  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
  ) -> None:
  # Allowed dimensions in the target butler.
  elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
@@ -2012,16 +2042,13 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  source_butler, data_ids, allowed_elements
  )
 
- can_query = True if isinstance(source_butler, Butler) else False
-
  additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
  for original_element, record_mapping in primary_records.items():
  # Get dimensions that depend on this dimension.
  populated_by = self.dimensions.get_elements_populated_by(
  self.dimensions[original_element.name] # type: ignore
  )
-
- for data_id in record_mapping.keys():
+ if populated_by:
  for element in populated_by:
  if element not in allowed_elements:
  continue
@@ -2040,28 +2067,32 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  # have to be scanned.
  continue
 
- if not can_query:
- raise RuntimeError(
- f"Transferring populated_by records like {element.name} requires a full Butler."
- )
+ if record_mapping:
+ if not isinstance(source_butler, Butler):
+ raise RuntimeError(
+ f"Transferring populated_by records like {element.name}"
+ " requires a full Butler."
+ )
 
- records = source_butler.query_dimension_records( # type: ignore
- element.name,
- explain=False,
- **data_id.mapping, # type: ignore
- )
- for record in records:
- additional_records[record.definition].setdefault(record.dataId, record)
+ with source_butler.query() as query:
+ records = query.join_data_coordinates(record_mapping.keys()).dimension_records(
+ element.name
+ )
+ for record in records:
+ additional_records[record.definition].setdefault(record.dataId, record)
 
  # The next step is to walk back through the additional records to
  # pick up any missing content (such as visit_definition needing to
  # know the exposure). Want to ensure we do not request records we
  # already have.
  missing_data_ids = set()
- for name, record_mapping in additional_records.items():
+ for record_mapping in additional_records.values():
  for data_id in record_mapping.keys():
- if data_id not in primary_records[name]:
- missing_data_ids.add(data_id)
+ for dimension in data_id.dimensions.required:
+ element = source_butler.dimensions[dimension]
+ dimension_key = data_id.subset(dimension)
+ if dimension_key not in primary_records[element]:
+ missing_data_ids.add(dimension_key)
 
  # Fill out the new records. Assume that these new records do not
  # also need to carry over additional populated_by records.
@@ -2078,19 +2109,19 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  def _extract_dimension_records_from_data_ids(
  self,
  source_butler: LimitedButler | Butler,
- data_ids: set[DataCoordinate],
+ data_ids: Iterable[DataCoordinate],
  allowed_elements: frozenset[DimensionElement],
  ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
  dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
 
+ data_ids = set(data_ids)
+ if not all(data_id.hasRecords() for data_id in data_ids):
+ if isinstance(source_butler, Butler):
+ data_ids = source_butler._expand_data_ids(data_ids)
+ else:
+ raise TypeError("Input butler needs to be a full butler to expand DataId.")
+
  for data_id in data_ids:
- # Need an expanded record, if not expanded that we need a full
- # butler with registry (allow mocks with registry too).
- if not data_id.hasRecords():
- if registry := getattr(source_butler, "registry", None):
- data_id = registry.expandDataId(data_id)
- else:
- raise TypeError("Input butler needs to be a full butler to expand DataId.")
  # If this butler doesn't know about a dimension in the source
  # butler things will break later.
  for element_name in data_id.dimensions.elements:
@@ -2569,6 +2600,9 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  """Immediately load caches that are used for common operations."""
  self._registry.preload_cache(load_dimension_record_cache=load_dimension_record_cache)
 
+ def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+ return self._registry.expand_data_ids(data_ids)
+
  _config: ButlerConfig
  """Configuration for this Butler instance."""
2574
2608