lsst-daf-butler 29.1.0rc2__py3-none-any.whl → 29.1.0rc4__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (34)
  1. lsst/daf/butler/_limited_butler.py +8 -1
  2. lsst/daf/butler/cli/utils.py +1 -1
  3. lsst/daf/butler/datastore/__init__.py +1 -0
  4. lsst/daf/butler/datastore/_datastore.py +18 -15
  5. lsst/daf/butler/datastore/_transfer.py +102 -0
  6. lsst/daf/butler/datastore/stored_file_info.py +34 -0
  7. lsst/daf/butler/datastores/chainedDatastore.py +53 -7
  8. lsst/daf/butler/datastores/fileDatastore.py +51 -180
  9. lsst/daf/butler/datastores/file_datastore/transfer.py +104 -0
  10. lsst/daf/butler/dimensions/_coordinate.py +3 -0
  11. lsst/daf/butler/direct_butler/_direct_butler.py +31 -28
  12. lsst/daf/butler/formatters/parquet.py +7 -3
  13. lsst/daf/butler/registry/interfaces/_database.py +1 -2
  14. lsst/daf/butler/registry/obscore/_config.py +5 -0
  15. lsst/daf/butler/registry/obscore/_records.py +4 -2
  16. lsst/daf/butler/remote_butler/_http_connection.py +6 -2
  17. lsst/daf/butler/remote_butler/_remote_butler.py +5 -0
  18. lsst/daf/butler/remote_butler/_remote_file_transfer_source.py +124 -0
  19. lsst/daf/butler/remote_butler/server/_factory.py +4 -0
  20. lsst/daf/butler/remote_butler/server/handlers/_external.py +90 -3
  21. lsst/daf/butler/remote_butler/server/handlers/_utils.py +15 -1
  22. lsst/daf/butler/remote_butler/server_models.py +17 -1
  23. lsst/daf/butler/tests/hybrid_butler.py +5 -1
  24. lsst/daf/butler/version.py +1 -1
  25. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/METADATA +1 -1
  26. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/RECORD +34 -31
  27. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/WHEEL +0 -0
  28. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/entry_points.txt +0 -0
  29. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/licenses/COPYRIGHT +0 -0
  30. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/licenses/LICENSE +0 -0
  31. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/licenses/bsd_license.txt +0 -0
  32. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/licenses/gpl-v3.0.txt +0 -0
  33. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/top_level.txt +0 -0
  34. {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/zip-safe +0 -0
@@ -109,6 +109,9 @@ from lsst.utils.iteration import chunk_iterable
  from lsst.utils.logging import VERBOSE, getLogger
  from lsst.utils.timer import time_this
 
+ from ..datastore import FileTransferMap, FileTransferRecord
+ from ..datastore.stored_file_info import make_datastore_path_relative
+
  if TYPE_CHECKING:
  from lsst.daf.butler import DatasetProvenance, LookupKey
  from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
@@ -1972,12 +1975,12 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
  return uris
 
- @staticmethod
  def _find_missing_records(
- datastore: FileDatastore,
+ self,
  refs: Iterable[DatasetRef],
  missing_ids: set[DatasetId],
  artifact_existence: dict[ResourcePath, bool] | None = None,
+ warn_for_missing: bool = True,
  ) -> dict[DatasetId, list[StoredFileInfo]]:
  if not missing_ids:
  return {}
@@ -1998,7 +2001,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  # Ask the source datastore where the missing artifacts
  # should be. An execution butler might not know about the
  # artifacts even if they are there.
- expected = datastore._get_expected_dataset_locations_info(id_to_ref[missing])
+ expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
  records[missing] = [info for _, info in expected]
 
  # Call the mexist helper method in case we have not already
@@ -2007,17 +2010,18 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  # datastore.mexists() itself does not give us access to the
  # derived datastore record.
  log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
- ref_exists = datastore._process_mexists_records(
+ ref_exists = self._process_mexists_records(
  id_to_ref, records, False, artifact_existence=artifact_existence
  )
 
  # Now go through the records and propagate the ones that exist.
- location_factory = datastore.locationFactory
+ location_factory = self.locationFactory
  for missing, record_list in records.items():
  # Skip completely if the ref does not exist.
  ref = id_to_ref[missing]
  if not ref_exists[ref]:
- log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
+ if warn_for_missing:
+ log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
  continue
  # Check for file artifact to decide which parts of a
  # disassembled composite do exist. If there is only a
@@ -2107,7 +2111,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  if missing_ids and not self.trustGetRequest:
  raise ValueError(f"Number of datasets missing from this datastore: {len(missing_ids)}")
 
- missing_records = self._find_missing_records(self, refs, missing_ids)
+ missing_records = self._find_missing_records(refs, missing_ids)
  records.update(missing_records)
 
  # One artifact can be used by multiple DatasetRef.
@@ -2784,13 +2788,13 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  @transactional
  def transfer_from(
  self,
- source_datastore: Datastore,
+ source_records: FileTransferMap,
  refs: Collection[DatasetRef],
  transfer: str = "auto",
  artifact_existence: dict[ResourcePath, bool] | None = None,
  dry_run: bool = False,
  ) -> tuple[set[DatasetRef], set[DatasetRef]]:
- log.verbose("Transferring %d datasets from %s to %s", len(refs), source_datastore.name, self.name)
+ log.verbose("Transferring %d datasets to %s", len(refs), self.name)
 
  # Stop early if "direct" transfer mode is requested. That would
  # require that the URI inside the source datastore should be stored
@@ -2805,125 +2809,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  if not refs:
  return set(), set()
 
- # Potentially can be transferring from a chain.
- datastores = getattr(source_datastore, "datastores", [source_datastore])
-
- incompatible: list[Datastore] = []
- acceptable: list[FileDatastore] = []
- for current_source in datastores:
- if not isinstance(current_source, FileDatastore):
- incompatible.append(current_source)
- else:
- acceptable.append(current_source)
-
- if len(incompatible) == len(datastores):
- if len(datastores) == 1:
- raise TypeError(
- "Can only transfer to a FileDatastore from another FileDatastore, not"
- f" {get_full_type_name(source_datastore)}"
- )
- else:
- types = [get_full_type_name(d) for d in datastores]
- raise TypeError(
- f"ChainedDatastore encountered that had no FileDatastores. Had {','.join(types)}"
- )
-
- if len(acceptable) == 1:
- # No need to filter in advance since there is only one usable
- # source datastore.
- return self._transfer_from(
- acceptable[0], refs, transfer=transfer, artifact_existence=artifact_existence, dry_run=dry_run
- )
-
- # To avoid complaints from the transfer that the source does not have
- # a ref, partition refs by source datastores, and any unknown to both
- # are sent to any that support trustGetRequest.
- unassigned_refs: set[DatasetRef] = set(refs)
- known_refs: list[set[DatasetRef]] = []
- for datastore in acceptable:
- known_to_datastore = {ref for ref, known in datastore.knows_these(refs).items() if known}
- known_refs.append(known_to_datastore)
- unassigned_refs -= known_to_datastore
-
- if unassigned_refs:
- for datastore, refs_known_to_datastore in zip(acceptable, known_refs, strict=True):
- if datastore.trustGetRequest:
- # Have to check each datastore in turn. If we do not do
- # this warnings will be issued further down for datasets
- # that are in one and not the other. The existence cache
- # will prevent repeat checks.
- exist_in_store = datastore.mexists(unassigned_refs, artifact_existence=artifact_existence)
- present = {ref for ref, exists in exist_in_store.items() if exists}
- refs_known_to_datastore.update(present)
- # Only transferring once so do not need to check later
- # datastores.
- unassigned_refs -= present
- log.debug(
- "Adding %d missing refs to list for transfer from %s", len(present), datastore.name
- )
-
- if unassigned_refs:
- log.warning(
- "Encountered %d dataset%s where no file artifacts exist from the "
- "source datastore and will be skipped.",
- len(unassigned_refs),
- "s" if len(unassigned_refs) != 1 else "",
- )
-
- # Once we have accepted refs from one datastore, do not need to try to
- # transfer them again.
- accepted: set[DatasetRef] = set()
- rejected: set[DatasetRef] = set()
- if artifact_existence is None:
- artifact_existence = {}
-
- for current_source, refs_to_transfer in zip(acceptable, known_refs, strict=True):
- # Do not transfer if already transferred.
- refs_to_transfer -= accepted
- # No need to retry something that has already been rejected.
- refs_to_transfer -= rejected
-
- if not refs_to_transfer:
- continue
-
- log.verbose(
- "Requesting transfer of %d dataset%s from datastore %s to %s",
- len(refs_to_transfer),
- "s" if len(refs_to_transfer) != 1 else "",
- current_source.name,
- self.name,
- )
- current_accepted, current_rejected = self._transfer_from(
- current_source,
- refs_to_transfer,
- transfer=transfer,
- artifact_existence=artifact_existence,
- dry_run=dry_run,
- )
-
- accepted.update(current_accepted)
- rejected.update(current_rejected)
-
- log.verbose(
- "Finished transfer_from %s to %s with %d accepted, %d rejected, %d requested",
- source_datastore.name,
- self.name,
- len(accepted),
- len(rejected),
- len(refs),
- )
-
- return accepted, rejected
-
- @transactional
- def _transfer_from(
- self,
- source_datastore: FileDatastore,
- refs: Collection[DatasetRef],
- transfer: str = "auto",
- artifact_existence: dict[ResourcePath, bool] | None = None,
- dry_run: bool = False,
- ) -> tuple[set[DatasetRef], set[DatasetRef]]:
  # Empty existence lookup if none given.
  if artifact_existence is None:
  artifact_existence = {}
@@ -2941,46 +2826,8 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  # the dataset should be transferred. This will only happen if
  # the detached Butler has had a local ingest.
 
- # What we really want is all the records in the source datastore
- # associated with these refs. Or derived ones if they don't exist
- # in the source.
- log.verbose("Looking up source datastore records in %s", source_datastore.name)
- source_records = source_datastore._get_stored_records_associated_with_refs(
- refs, ignore_datastore_records=True
- )
-
- # The source dataset_ids are the keys in these records
- source_ids = set(source_records)
- log.debug("Number of datastore records found in source: %d", len(source_ids))
-
- requested_ids = {ref.id for ref in refs}
- missing_ids = requested_ids - source_ids
-
- # Missing IDs can be okay if that datastore has allowed
- # gets based on file existence. Should we transfer what we can
- # or complain about it and warn?
- if missing_ids and not source_datastore.trustGetRequest:
- raise ValueError(
- f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
- )
-
- # Need to map these missing IDs to a DatasetRef so we can guess
- # the details.
- if missing_ids:
- log.info(
- "Number of expected datasets missing from source datastore records: %d out of %d",
- len(missing_ids),
- len(requested_ids),
- )
- found_records = self._find_missing_records(
- source_datastore, refs, missing_ids, artifact_existence
- )
- source_records.update(found_records)
-
  # See if we already have these records
- log.verbose(
- "Looking up existing datastore records in target %s for %d refs", self.name, len(requested_ids)
- )
+ log.verbose("Looking up existing datastore records in target %s for %d refs", self.name, len(refs))
  target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
 
  # The artifacts to register
@@ -3017,8 +2864,9 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  continue
 
  # mypy needs to know these are always resolved refs
- for info in source_records[ref.id]:
- source_location = info.file_location(source_datastore.locationFactory)
+ for transfer_info in source_records.get(ref.id, []):
+ info = transfer_info.file_info
+ source_location = transfer_info.location
  target_location = info.file_location(self.locationFactory)
  if source_location == target_location and not source_location.pathInStore.isabs():
  # Artifact is already in the target location.
@@ -3096,14 +2944,45 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
  )
 
  log.verbose(
- "Finished transfer_from %s to %s with %d accepted, %d rejected",
- source_datastore.name,
+ "Finished transfer_from to %s with %d accepted, %d rejected",
  self.name,
  len(accepted),
  len(rejected),
  )
  return accepted, rejected
 
+ def get_file_info_for_transfer(self, dataset_ids: Iterable[DatasetId]) -> FileTransferMap:
+ source_records = self._get_stored_records_associated_with_refs(
+ [FakeDatasetRef(id) for id in dataset_ids], ignore_datastore_records=True
+ )
+ return self._convert_stored_file_info_to_file_transfer_record(source_records)
+
+ def locate_missing_files_for_transfer(
+ self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool]
+ ) -> FileTransferMap:
+ missing_ids = {ref.id for ref in refs}
+ # Missing IDs can be okay if that datastore has allowed
+ # gets based on file existence. Should we transfer what we can
+ # or complain about it and warn?
+ if not self.trustGetRequest:
+ return {}
+
+ found_records = self._find_missing_records(
+ refs, missing_ids, artifact_existence, warn_for_missing=False
+ )
+ return self._convert_stored_file_info_to_file_transfer_record(found_records)
+
+ def _convert_stored_file_info_to_file_transfer_record(
+ self, info_map: dict[DatasetId, list[StoredFileInfo]]
+ ) -> FileTransferMap:
+ output: dict[DatasetId, list[FileTransferRecord]] = {}
+ for k, file_info_list in info_map.items():
+ output[k] = [
+ FileTransferRecord(file_info=info, location=info.file_location(self.locationFactory))
+ for info in file_info_list
+ ]
+ return output
+
  @transactional
  def forget(self, refs: Iterable[DatasetRef]) -> None:
  # Docstring inherited.
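The conversion helper above implies the shape of the new transfer types: a FileTransferMap maps each DatasetId to the FileTransferRecord objects describing its artifacts, and each record pairs a StoredFileInfo with a resolved Location in the source datastore. A minimal consumer sketch, with the attribute access assumed from this hunk rather than from the published API docs:

    # Rough sketch of walking a FileTransferMap; "file_info" and "location"
    # come from the diff above, the ".uri"/".formatter" accesses are assumed.
    def summarize_transfer_map(transfer_map) -> None:
        for dataset_id, records in transfer_map.items():
            for record in records:
                # Each record pairs stored-file metadata with where the
                # artifact lives in the source datastore.
                print(dataset_id, record.location.uri, record.file_info.formatter)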
@@ -3357,16 +3236,8 @@ def _to_file_info_payload(
  ) -> FileDatastoreGetPayloadFileInfo:
  location, file_info = info
 
- # Make sure that we send only relative paths, to avoid leaking
- # details of our configuration to the client.
- path = location.pathInStore
- if path.isabs():
- relative_path = path.relativeToPathRoot
- else:
- relative_path = str(path)
-
  datastoreRecords = file_info.to_simple()
- datastoreRecords.path = relative_path
+ datastoreRecords.path = make_datastore_path_relative(datastoreRecords.path)
 
  return FileDatastoreGetPayloadFileInfo(
  url=location.uri.generate_presigned_get_url(expiration_time_seconds=url_expiration_time_seconds),
@@ -0,0 +1,104 @@
+ # This file is part of daf_butler.
+ #
+ # Developed for the LSST Data Management System.
+ # This product includes software developed by the LSST Project
+ # (http://www.lsst.org).
+ # See the COPYRIGHT file at the top-level directory of this distribution
+ # for details of code ownership.
+ #
+ # This software is dual licensed under the GNU General Public License and also
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+ # respectively. If you choose the GPL option then the following text applies
+ # (but note that there is still no warranty even if you opt for BSD instead):
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ from __future__ import annotations
+
+ from collections.abc import Iterable
+
+ from lsst.resources import ResourcePath
+ from lsst.utils.logging import getLogger
+
+ from ..._dataset_ref import DatasetRef
+ from ...datastore import FileTransferMap, FileTransferSource
+
+ log = getLogger(__name__)
+
+
+ def retrieve_file_transfer_records(
+ source_datastore: FileTransferSource,
+ refs: Iterable[DatasetRef],
+ artifact_existence: dict[ResourcePath, bool],
+ ) -> FileTransferMap:
+ """Look up the datastore records corresponding to the given datasets.
+
+ Parameters
+ ----------
+ source_datastore : `FileTransferSource`
+ Object used to look up records.
+ refs : `~collections.abc.Iterable` [ `DatasetRef` ]
+ List of datasets to retrieve records for.
+ artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
+ Cache mapping datastore artifact to existence. Updated by
+ this method with details of all artifacts tested.
+
+ Return
+ ------
+ files : `FileTransferMap`
+ A dictionary from `DatasetId` to a list of `FileTransferRecord`,
+ containing information about the files that were found for these
+ artifacts. If files were not found for a given `DatasetRef`, there
+ will be no entry for it in this dictionary.
+
+ Notes
+ -----
+ This will first attempt to look up records using the database, and then
+ fall back to searching the filesystem if the transfer source is configured
+ to do so.
+ """
+ log.verbose("Looking up source datastore records in %s", source_datastore.name)
+ refs_by_id = {ref.id: ref for ref in refs}
+ source_records = source_datastore.get_file_info_for_transfer(refs_by_id.keys())
+
+ log.debug("Number of datastore records found in source: %d", len(source_records))
+
+ # If we couldn't find all of the datasets in the database, continue
+ # searching. Some datastores may have artifacts on disk that do not have
+ # corresponding records in the database.
+ missing_ids = refs_by_id.keys() - source_records.keys()
+ if missing_ids:
+ log.info(
+ "Number of expected datasets missing from source datastore records: %d out of %d",
+ len(missing_ids),
+ len(refs_by_id),
+ )
+ missing_refs = {refs_by_id[id] for id in missing_ids}
+ found_records = source_datastore.locate_missing_files_for_transfer(missing_refs, artifact_existence)
+ source_records |= found_records
+
+ still_missing = len(missing_refs) - len(found_records)
+ if still_missing:
+ for ref in missing_refs:
+ if ref.id not in found_records:
+ log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
+ log.warning(
+ "Encountered %d dataset%s where no file artifacts exist from the "
+ "source datastore and will be skipped.",
+ still_missing,
+ "s" if still_missing != 1 else "",
+ )
+
+ return source_records
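A hedged usage sketch of this new helper, mirroring how DirectButler.transfer_from calls it later in this diff; source_butler and refs are assumed to already exist:

    from lsst.resources import ResourcePath

    # Existence cache shared with the subsequent datastore transfer.
    artifact_existence: dict[ResourcePath, bool] = {}
    transfer_records = retrieve_file_transfer_records(
        source_butler._file_transfer_source, refs, artifact_existence
    )
    # Datasets with no discoverable artifacts simply have no entry in the map.
    missing = {ref.id for ref in refs} - transfer_records.keys()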
@@ -427,6 +427,9 @@ class DataCoordinate:
  @overload
  def get(self, key: str, default: str) -> str: ...
 
+ @overload
+ def get(self, key: str, default: DataIdValue | None) -> DataIdValue | None: ...
+
  def get(self, key: str, default: DataIdValue | None = None) -> DataIdValue | None:
  try:
  return self.__getitem__(key)
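The new overload lets the return type of get track the default that was supplied. A small illustration (the data ID keys are hypothetical):

    # With a str default the result is typed as str; with no default (or an
    # explicit DataIdValue | None default) it is DataIdValue | None.
    band = data_id.get("band", "unknown")
    visit = data_id.get("visit")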
@@ -75,6 +75,7 @@ from .._storage_class import StorageClass, StorageClassFactory
  from .._timespan import Timespan
  from ..datastore import Datastore, NullDatastore
  from ..datastores.file_datastore.retrieve_artifacts import ZipIndex, retrieve_and_zip
+ from ..datastores.file_datastore.transfer import retrieve_file_transfer_records
  from ..dimensions import DataCoordinate, Dimension, DimensionGroup
  from ..direct_query_driver import DirectQueryDriver
  from ..progress import Progress
@@ -1765,7 +1766,6 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  import_info = self._prepare_for_import_refs(
  self,
  refs,
- skip_missing=False,
  register_dataset_types=True,
  dry_run=dry_run,
  transfer_dimensions=transfer_dimensions,
@@ -2063,7 +2063,6 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  source_butler: LimitedButler,
  source_refs: Iterable[DatasetRef],
  *,
- skip_missing: bool = True,
  register_dataset_types: bool = False,
  transfer_dimensions: bool = False,
  dry_run: bool = False,
@@ -2087,27 +2086,6 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  str(self),
  )
 
- # In some situations the datastore artifact may be missing
- # and we do not want that registry entry to be imported.
- # Asking datastore is not sufficient, the records may have been
- # purged, we have to ask for the (predicted) URI and check
- # existence explicitly. Execution butler is set up exactly like
- # this with no datastore records.
- artifact_existence: dict[ResourcePath, bool] = {}
- if skip_missing:
- dataset_existence = source_butler._datastore.mexists(
- source_refs, artifact_existence=artifact_existence
- )
- source_refs = [ref for ref, exists in dataset_existence.items() if exists]
- filtered_count = len(source_refs)
- n_missing = original_count - filtered_count
- _LOG.verbose(
- "%d dataset%s removed because the artifact does not exist. Now have %d.",
- n_missing,
- "" if n_missing == 1 else "s",
- filtered_count,
- )
-
  # Importing requires that we group the refs by dimension group and run
  # before doing the import.
  source_dataset_types = set()
@@ -2207,7 +2185,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  dimension_records = self._extract_all_dimension_records_from_data_ids(
  source_butler, dataIds, elements
  )
- return _ImportDatasetsInfo(grouped_refs, dimension_records, artifact_existence)
+ return _ImportDatasetsInfo(grouped_refs, dimension_records)
 
  def _import_dimension_records(
  self,
@@ -2294,15 +2272,40 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  dry_run: bool = False,
  ) -> collections.abc.Collection[DatasetRef]:
  # Docstring inherited.
+ source_refs = list(source_refs)
  if not self.isWriteable():
  raise TypeError("Butler is read-only.")
 
  progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
 
+ artifact_existence: dict[ResourcePath, bool] = {}
+ file_transfer_source = source_butler._file_transfer_source
+ transfer_records = retrieve_file_transfer_records(
+ file_transfer_source, source_refs, artifact_existence
+ )
+ # In some situations the datastore artifact may be missing and we do
+ # not want that registry entry to be imported. For example, this can
+ # happen if a file was removed but the dataset was left in the registry
+ # for provenance, or if a pipeline task didn't create all of the
+ # possible files in a QuantumBackedButler.
+ if skip_missing:
+ original_ids = {ref.id for ref in source_refs}
+ missing_ids = original_ids - transfer_records.keys()
+ if missing_ids:
+ original_count = len(source_refs)
+ source_refs = [ref for ref in source_refs if ref.id not in missing_ids]
+ filtered_count = len(source_refs)
+ n_missing = original_count - filtered_count
+ _LOG.verbose(
+ "%d dataset%s removed because the artifact does not exist. Now have %d.",
+ n_missing,
+ "" if n_missing == 1 else "s",
+ filtered_count,
+ )
+
  import_info = self._prepare_for_import_refs(
  source_butler,
  source_refs,
- skip_missing=skip_missing,
  register_dataset_types=register_dataset_types,
  dry_run=dry_run,
  transfer_dimensions=transfer_dimensions,
@@ -2317,11 +2320,12 @@ class DirectButler(Butler): # numpydoc ignore=PR02
 
  # Ask the datastore to transfer. The datastore has to check that
  # the source datastore is compatible with the target datastore.
+ _LOG.verbose("Transferring %d datasets from %s", len(transfer_records), file_transfer_source.name)
  accepted, rejected = self._datastore.transfer_from(
- source_butler._datastore,
+ transfer_records,
  imported_refs,
  transfer=transfer,
- artifact_existence=import_info.artifact_existence,
+ artifact_existence=artifact_existence,
  dry_run=dry_run,
  )
  if rejected:
@@ -2567,4 +2571,3 @@ class _ImportDatasetsInfo(NamedTuple):
 
  grouped_refs: defaultdict[_RefGroup, list[DatasetRef]]
  dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]
- artifact_existence: dict[ResourcePath, bool]
@@ -295,7 +295,7 @@ def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray | np.ma.MaskedArray:
  numpy_dict = arrow_to_numpy_dict(arrow_table)
 
  has_mask = False
- dtype = []
+ dtype: list[tuple] = []
  for name, col in numpy_dict.items():
  if len(shape := numpy_dict[name].shape) <= 1:
  dtype.append((name, col.dtype))
@@ -429,7 +429,11 @@ def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
  md = {}
  md[b"lsst::arrow::rowcount"] = str(len(np_array))
 
- for name in np_array.dtype.names:
+ names = np_array.dtype.names
+ if names is None:
+ names = ()
+
+ for name in names:
  _append_numpy_string_metadata(md, name, np_array.dtype[name])
  _append_numpy_multidim_metadata(md, name, np_array.dtype[name])
 
@@ -1379,7 +1383,7 @@ def _numpy_dict_to_dtype(numpy_dict: dict[str, np.ndarray]) -> tuple[np.dtype, i
  """
  import numpy as np
 
- dtype_list = []
+ dtype_list: list[tuple] = []
  rowcount = 0
  for name, col in numpy_dict.items():
  if rowcount == 0:
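The numpy_to_arrow hunk above guards against np_array.dtype.names being None, which is what NumPy reports for arrays without named fields. A quick illustration:

    import numpy as np

    plain = np.zeros(3, dtype=np.float64)
    print(plain.dtype.names)  # None: no named fields to attach metadata to

    structured = np.zeros(3, dtype=[("a", "f8"), ("b", "i4")])
    print(structured.dtype.names)  # ('a', 'b')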
@@ -1696,9 +1696,8 @@ class Database(ABC):
  for k, v in content.items():
  if k == name:
  continue
- column = table.columns[k]
  # The set only has one element
- clauses.append(column == v.pop())
+ clauses.append(table.columns[k] == v.pop())
 
  # The IN operator will not work for "infinite" numbers of
  # rows so must batch it up into distinct calls.
@@ -178,6 +178,11 @@ class ObsCoreConfig(pydantic.BaseModel):
  indexing support, but a standard ``s_region`` column is always included.
  """
 
+ fallback_instrument: str | None = None
+ """Instrument to use if a dataset type does not have an instrument
+ dimension. Will be left unset if `None`. Can be dangerous to set this
+ in a repository containing data from multiple instruments."""
+
 
  class ConfigCollectionType(str, enum.Enum):
  """Enum class defining possible values for configuration attributes."""
@@ -394,12 +394,14 @@ class DafButlerRecordFactory(RecordFactory):
  dataId = ref.dataId
  record: dict[str, str | int | float | UUID | None] = {}
 
- instrument_name = cast(str, dataId.get("instrument"))
+ instrument_name = cast(str | None, dataId.get("instrument", self.config.fallback_instrument))
  record["instrument_name"] = instrument_name
  if self.schema.dataset_fk is not None:
  record[self.schema.dataset_fk.name] = ref.id
 
- record["facility_name"] = self.config.facility_map.get(instrument_name, self.config.facility_name)
+ record["facility_name"] = self.config.facility_map.get(
+ instrument_name or "", self.config.facility_name
+ )
 
  timespan = dataId.timespan
  if timespan is not None:
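Taken together with the new fallback_instrument option above, the record factory resolves instrument and facility roughly as sketched below; the configuration values shown are hypothetical and only fallback_instrument, facility_map, and facility_name are implied by this diff:

    # Hypothetical obscore configuration fragment (other required settings omitted).
    config_fragment = {
        "facility_name": "Some-Facility",
        "facility_map": {"SomeInstrument": "Some-Other-Facility"},
        "fallback_instrument": "SomeInstrument",
    }

    # The data ID's instrument wins; the fallback is only used when the
    # dataset type has no instrument dimension.
    instrument_name = data_id.get("instrument", config.fallback_instrument)
    facility_name = config.facility_map.get(instrument_name or "", config.facility_name)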
@@ -69,10 +69,14 @@ class RemoteButlerHttpConnection:
  self.server_url = server_url
  self._access_token = access_token
 
- auth_headers = get_authentication_headers(access_token)
+ self._auth_headers = get_authentication_headers(access_token)
  headers = {"user-agent": f"RemoteButler/{__version__}"}
 
- self._headers = auth_headers | headers
+ self._headers = self._auth_headers | headers
+
+ @property
+ def authentication_headers(self) -> dict[str, str]:
+ return self._auth_headers
 
  def post(self, path: str, model: BaseModel) -> httpx.Response:
  """Send a POST request to the Butler server.
@@ -76,6 +76,7 @@ from ._query_results import convert_dataset_ref_results, read_query_results
  from ._ref_utils import apply_storage_class_override, normalize_dataset_type_name, simplify_dataId
  from ._registry import RemoteButlerRegistry
  from ._remote_butler_collections import RemoteButlerCollections
+ from ._remote_file_transfer_source import RemoteFileTransferSource
  from .server_models import (
  CollectionList,
  FindDatasetRequestModel,
@@ -713,6 +714,10 @@ class RemoteButler(Butler): # numpydoc ignore=PR02
  connection=self._connection, cache=self._cache, defaults=defaults, metrics=metrics
  )
 
+ @property
+ def _file_transfer_source(self) -> RemoteFileTransferSource:
+ return RemoteFileTransferSource(self._connection)
+
  def __str__(self) -> str:
  return f"RemoteButler({self._connection.server_url})"