lsst-daf-butler 29.1.0rc2__py3-none-any.whl → 29.1.0rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/daf/butler/_limited_butler.py +8 -1
- lsst/daf/butler/cli/utils.py +1 -1
- lsst/daf/butler/datastore/__init__.py +1 -0
- lsst/daf/butler/datastore/_datastore.py +18 -15
- lsst/daf/butler/datastore/_transfer.py +102 -0
- lsst/daf/butler/datastore/stored_file_info.py +34 -0
- lsst/daf/butler/datastores/chainedDatastore.py +53 -7
- lsst/daf/butler/datastores/fileDatastore.py +51 -180
- lsst/daf/butler/datastores/file_datastore/transfer.py +104 -0
- lsst/daf/butler/dimensions/_coordinate.py +3 -0
- lsst/daf/butler/direct_butler/_direct_butler.py +31 -28
- lsst/daf/butler/formatters/parquet.py +7 -3
- lsst/daf/butler/registry/interfaces/_database.py +1 -2
- lsst/daf/butler/registry/obscore/_config.py +5 -0
- lsst/daf/butler/registry/obscore/_records.py +4 -2
- lsst/daf/butler/remote_butler/_http_connection.py +6 -2
- lsst/daf/butler/remote_butler/_remote_butler.py +5 -0
- lsst/daf/butler/remote_butler/_remote_file_transfer_source.py +124 -0
- lsst/daf/butler/remote_butler/server/_factory.py +4 -0
- lsst/daf/butler/remote_butler/server/handlers/_external.py +90 -3
- lsst/daf/butler/remote_butler/server/handlers/_utils.py +15 -1
- lsst/daf/butler/remote_butler/server_models.py +17 -1
- lsst/daf/butler/tests/hybrid_butler.py +5 -1
- lsst/daf/butler/version.py +1 -1
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/METADATA +1 -1
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/RECORD +34 -31
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/WHEEL +0 -0
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/entry_points.txt +0 -0
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/licenses/LICENSE +0 -0
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/top_level.txt +0 -0
- {lsst_daf_butler-29.1.0rc2.dist-info → lsst_daf_butler-29.1.0rc4.dist-info}/zip-safe +0 -0

lsst/daf/butler/datastores/fileDatastore.py

@@ -109,6 +109,9 @@ from lsst.utils.iteration import chunk_iterable
 from lsst.utils.logging import VERBOSE, getLogger
 from lsst.utils.timer import time_this
 
+from ..datastore import FileTransferMap, FileTransferRecord
+from ..datastore.stored_file_info import make_datastore_path_relative
+
 if TYPE_CHECKING:
     from lsst.daf.butler import DatasetProvenance, LookupKey
     from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
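
The `FileTransferMap` and `FileTransferRecord` types imported above come from the new `lsst/daf/butler/datastore/_transfer.py` module listed at the top of this diff; their definitions are not shown here. A rough sketch of the shape implied by how they are used later in this diff (field types are guesses, and the names below are illustrative only):

```python
# Illustrative sketch only; the real classes live in
# lsst/daf/butler/datastore/_transfer.py and may differ in detail.
import dataclasses
import uuid
from typing import Any


@dataclasses.dataclass(frozen=True)
class FileTransferRecordSketch:
    """Pairs a datastore record with the location of its file artifact."""

    file_info: Any  # StoredFileInfo in the real code
    location: Any  # Location (from the datastore's LocationFactory) in the real code


# One dataset can be backed by several files (disassembled composites),
# hence a list of records per dataset ID.
FileTransferMapSketch = dict[uuid.UUID, list[FileTransferRecordSketch]]
```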
@@ -1972,12 +1975,12 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
         return uris
 
-    @staticmethod
     def _find_missing_records(
-
+        self,
         refs: Iterable[DatasetRef],
         missing_ids: set[DatasetId],
         artifact_existence: dict[ResourcePath, bool] | None = None,
+        warn_for_missing: bool = True,
     ) -> dict[DatasetId, list[StoredFileInfo]]:
         if not missing_ids:
             return {}
@@ -1998,7 +2001,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
             # Ask the source datastore where the missing artifacts
             # should be. An execution butler might not know about the
             # artifacts even if they are there.
-            expected =
+            expected = self._get_expected_dataset_locations_info(id_to_ref[missing])
             records[missing] = [info for _, info in expected]
 
         # Call the mexist helper method in case we have not already
@@ -2007,17 +2010,18 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
         # datastore.mexists() itself does not give us access to the
         # derived datastore record.
         log.verbose("Checking existence of %d datasets unknown to datastore", len(records))
-        ref_exists =
+        ref_exists = self._process_mexists_records(
             id_to_ref, records, False, artifact_existence=artifact_existence
         )
 
         # Now go through the records and propagate the ones that exist.
-        location_factory =
+        location_factory = self.locationFactory
         for missing, record_list in records.items():
             # Skip completely if the ref does not exist.
             ref = id_to_ref[missing]
             if not ref_exists[ref]:
-
+                if warn_for_missing:
+                    log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
                 continue
             # Check for file artifact to decide which parts of a
             # disassembled composite do exist. If there is only a
@@ -2107,7 +2111,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
         if missing_ids and not self.trustGetRequest:
             raise ValueError(f"Number of datasets missing from this datastore: {len(missing_ids)}")
 
-        missing_records = self._find_missing_records(
+        missing_records = self._find_missing_records(refs, missing_ids)
         records.update(missing_records)
 
         # One artifact can be used by multiple DatasetRef.
@@ -2784,13 +2788,13 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
     @transactional
     def transfer_from(
         self,
-
+        source_records: FileTransferMap,
         refs: Collection[DatasetRef],
         transfer: str = "auto",
         artifact_existence: dict[ResourcePath, bool] | None = None,
         dry_run: bool = False,
     ) -> tuple[set[DatasetRef], set[DatasetRef]]:
-        log.verbose("Transferring %d datasets
+        log.verbose("Transferring %d datasets to %s", len(refs), self.name)
 
         # Stop early if "direct" transfer mode is requested. That would
         # require that the URI inside the source datastore should be stored
@@ -2805,125 +2809,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
         if not refs:
             return set(), set()
 
-        # Potentially can be transferring from a chain.
-        datastores = getattr(source_datastore, "datastores", [source_datastore])
-
-        incompatible: list[Datastore] = []
-        acceptable: list[FileDatastore] = []
-        for current_source in datastores:
-            if not isinstance(current_source, FileDatastore):
-                incompatible.append(current_source)
-            else:
-                acceptable.append(current_source)
-
-        if len(incompatible) == len(datastores):
-            if len(datastores) == 1:
-                raise TypeError(
-                    "Can only transfer to a FileDatastore from another FileDatastore, not"
-                    f" {get_full_type_name(source_datastore)}"
-                )
-            else:
-                types = [get_full_type_name(d) for d in datastores]
-                raise TypeError(
-                    f"ChainedDatastore encountered that had no FileDatastores. Had {','.join(types)}"
-                )
-
-        if len(acceptable) == 1:
-            # No need to filter in advance since there is only one usable
-            # source datastore.
-            return self._transfer_from(
-                acceptable[0], refs, transfer=transfer, artifact_existence=artifact_existence, dry_run=dry_run
-            )
-
-        # To avoid complaints from the transfer that the source does not have
-        # a ref, partition refs by source datastores, and any unknown to both
-        # are sent to any that support trustGetRequest.
-        unassigned_refs: set[DatasetRef] = set(refs)
-        known_refs: list[set[DatasetRef]] = []
-        for datastore in acceptable:
-            known_to_datastore = {ref for ref, known in datastore.knows_these(refs).items() if known}
-            known_refs.append(known_to_datastore)
-            unassigned_refs -= known_to_datastore
-
-        if unassigned_refs:
-            for datastore, refs_known_to_datastore in zip(acceptable, known_refs, strict=True):
-                if datastore.trustGetRequest:
-                    # Have to check each datastore in turn. If we do not do
-                    # this warnings will be issued further down for datasets
-                    # that are in one and not the other. The existence cache
-                    # will prevent repeat checks.
-                    exist_in_store = datastore.mexists(unassigned_refs, artifact_existence=artifact_existence)
-                    present = {ref for ref, exists in exist_in_store.items() if exists}
-                    refs_known_to_datastore.update(present)
-                    # Only transferring once so do not need to check later
-                    # datastores.
-                    unassigned_refs -= present
-                    log.debug(
-                        "Adding %d missing refs to list for transfer from %s", len(present), datastore.name
-                    )
-
-        if unassigned_refs:
-            log.warning(
-                "Encountered %d dataset%s where no file artifacts exist from the "
-                "source datastore and will be skipped.",
-                len(unassigned_refs),
-                "s" if len(unassigned_refs) != 1 else "",
-            )
-
-        # Once we have accepted refs from one datastore, do not need to try to
-        # transfer them again.
-        accepted: set[DatasetRef] = set()
-        rejected: set[DatasetRef] = set()
-        if artifact_existence is None:
-            artifact_existence = {}
-
-        for current_source, refs_to_transfer in zip(acceptable, known_refs, strict=True):
-            # Do not transfer if already transferred.
-            refs_to_transfer -= accepted
-            # No need to retry something that has already been rejected.
-            refs_to_transfer -= rejected
-
-            if not refs_to_transfer:
-                continue
-
-            log.verbose(
-                "Requesting transfer of %d dataset%s from datastore %s to %s",
-                len(refs_to_transfer),
-                "s" if len(refs_to_transfer) != 1 else "",
-                current_source.name,
-                self.name,
-            )
-            current_accepted, current_rejected = self._transfer_from(
-                current_source,
-                refs_to_transfer,
-                transfer=transfer,
-                artifact_existence=artifact_existence,
-                dry_run=dry_run,
-            )
-
-            accepted.update(current_accepted)
-            rejected.update(current_rejected)
-
-        log.verbose(
-            "Finished transfer_from %s to %s with %d accepted, %d rejected, %d requested",
-            source_datastore.name,
-            self.name,
-            len(accepted),
-            len(rejected),
-            len(refs),
-        )
-
-        return accepted, rejected
-
-    @transactional
-    def _transfer_from(
-        self,
-        source_datastore: FileDatastore,
-        refs: Collection[DatasetRef],
-        transfer: str = "auto",
-        artifact_existence: dict[ResourcePath, bool] | None = None,
-        dry_run: bool = False,
-    ) -> tuple[set[DatasetRef], set[DatasetRef]]:
         # Empty existence lookup if none given.
         if artifact_existence is None:
             artifact_existence = {}
@@ -2941,46 +2826,8 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
         # the dataset should be transferred. This will only happen if
         # the detached Butler has had a local ingest.
 
-        # What we really want is all the records in the source datastore
-        # associated with these refs. Or derived ones if they don't exist
-        # in the source.
-        log.verbose("Looking up source datastore records in %s", source_datastore.name)
-        source_records = source_datastore._get_stored_records_associated_with_refs(
-            refs, ignore_datastore_records=True
-        )
-
-        # The source dataset_ids are the keys in these records
-        source_ids = set(source_records)
-        log.debug("Number of datastore records found in source: %d", len(source_ids))
-
-        requested_ids = {ref.id for ref in refs}
-        missing_ids = requested_ids - source_ids
-
-        # Missing IDs can be okay if that datastore has allowed
-        # gets based on file existence. Should we transfer what we can
-        # or complain about it and warn?
-        if missing_ids and not source_datastore.trustGetRequest:
-            raise ValueError(
-                f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}"
-            )
-
-        # Need to map these missing IDs to a DatasetRef so we can guess
-        # the details.
-        if missing_ids:
-            log.info(
-                "Number of expected datasets missing from source datastore records: %d out of %d",
-                len(missing_ids),
-                len(requested_ids),
-            )
-            found_records = self._find_missing_records(
-                source_datastore, refs, missing_ids, artifact_existence
-            )
-            source_records.update(found_records)
-
         # See if we already have these records
-        log.verbose(
-            "Looking up existing datastore records in target %s for %d refs", self.name, len(requested_ids)
-        )
+        log.verbose("Looking up existing datastore records in target %s for %d refs", self.name, len(refs))
         target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True)
 
         # The artifacts to register
@@ -3017,8 +2864,9 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
                 continue
 
             # mypy needs to know these are always resolved refs
-            for
-
+            for transfer_info in source_records.get(ref.id, []):
+                info = transfer_info.file_info
+                source_location = transfer_info.location
                 target_location = info.file_location(self.locationFactory)
                 if source_location == target_location and not source_location.pathInStore.isabs():
                     # Artifact is already in the target location.
@@ -3096,14 +2944,45 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
             )
 
         log.verbose(
-            "Finished transfer_from
-            source_datastore.name,
+            "Finished transfer_from to %s with %d accepted, %d rejected",
             self.name,
             len(accepted),
             len(rejected),
         )
         return accepted, rejected
 
+    def get_file_info_for_transfer(self, dataset_ids: Iterable[DatasetId]) -> FileTransferMap:
+        source_records = self._get_stored_records_associated_with_refs(
+            [FakeDatasetRef(id) for id in dataset_ids], ignore_datastore_records=True
+        )
+        return self._convert_stored_file_info_to_file_transfer_record(source_records)
+
+    def locate_missing_files_for_transfer(
+        self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool]
+    ) -> FileTransferMap:
+        missing_ids = {ref.id for ref in refs}
+        # Missing IDs can be okay if that datastore has allowed
+        # gets based on file existence. Should we transfer what we can
+        # or complain about it and warn?
+        if not self.trustGetRequest:
+            return {}
+
+        found_records = self._find_missing_records(
+            refs, missing_ids, artifact_existence, warn_for_missing=False
+        )
+        return self._convert_stored_file_info_to_file_transfer_record(found_records)
+
+    def _convert_stored_file_info_to_file_transfer_record(
+        self, info_map: dict[DatasetId, list[StoredFileInfo]]
+    ) -> FileTransferMap:
+        output: dict[DatasetId, list[FileTransferRecord]] = {}
+        for k, file_info_list in info_map.items():
+            output[k] = [
+                FileTransferRecord(file_info=info, location=info.file_location(self.locationFactory))
+                for info in file_info_list
+            ]
+        return output
+
     @transactional
     def forget(self, refs: Iterable[DatasetRef]) -> None:
         # Docstring inherited.
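
Taken together with the new `transfer_from` signature above, these helpers change the flow from "pass the source datastore" to "pass the source's file records". A hedged sketch of the new calling convention; the wrapper function itself is not part of the package:

```python
from collections.abc import Collection

from lsst.daf.butler import DatasetRef
from lsst.daf.butler.datastore import FileTransferMap, FileTransferSource
from lsst.daf.butler.datastores.fileDatastore import FileDatastore


def copy_artifacts(
    source: FileTransferSource, target: FileDatastore, refs: Collection[DatasetRef]
) -> tuple[set[DatasetRef], set[DatasetRef]]:
    # Step 1: resolve the source file records up front (database lookup).
    records: FileTransferMap = source.get_file_info_for_transfer([ref.id for ref in refs])
    # Step 2: hand the pre-resolved records, not the source datastore itself,
    # to the target datastore.
    return target.transfer_from(records, refs, transfer="auto")
```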
@@ -3357,16 +3236,8 @@ def _to_file_info_payload(
 ) -> FileDatastoreGetPayloadFileInfo:
     location, file_info = info
 
-    # Make sure that we send only relative paths, to avoid leaking
-    # details of our configuration to the client.
-    path = location.pathInStore
-    if path.isabs():
-        relative_path = path.relativeToPathRoot
-    else:
-        relative_path = str(path)
-
     datastoreRecords = file_info.to_simple()
-    datastoreRecords.path =
+    datastoreRecords.path = make_datastore_path_relative(datastoreRecords.path)
 
     return FileDatastoreGetPayloadFileInfo(
         url=location.uri.generate_presigned_get_url(expiration_time_seconds=url_expiration_time_seconds),
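
`make_datastore_path_relative` is added to `lsst/daf/butler/datastore/stored_file_info.py` (also touched in this diff) and its implementation is not shown here. Judging from the inline logic it replaces above, an equivalent sketch, assuming the argument behaves like the `ResourcePath` that was previously inspected:

```python
from lsst.resources import ResourcePath


def make_path_relative_sketch(path: ResourcePath) -> str:
    # Absolute paths are reduced to their path-root-relative form so that
    # datastore configuration details are not leaked to clients; relative
    # paths pass through unchanged.
    return path.relativeToPathRoot if path.isabs() else str(path)
```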

lsst/daf/butler/datastores/file_datastore/transfer.py (new file)

@@ -0,0 +1,104 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+from lsst.resources import ResourcePath
+from lsst.utils.logging import getLogger
+
+from ..._dataset_ref import DatasetRef
+from ...datastore import FileTransferMap, FileTransferSource
+
+log = getLogger(__name__)
+
+
+def retrieve_file_transfer_records(
+    source_datastore: FileTransferSource,
+    refs: Iterable[DatasetRef],
+    artifact_existence: dict[ResourcePath, bool],
+) -> FileTransferMap:
+    """Look up the datastore records corresponding to the given datasets.
+
+    Parameters
+    ----------
+    source_datastore : `FileTransferSource`
+        Object used to look up records.
+    refs : `~collections.abc.Iterable` [ `DatasetRef` ]
+        List of datasets to retrieve records for.
+    artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`]
+        Cache mapping datastore artifact to existence. Updated by
+        this method with details of all artifacts tested.
+
+    Return
+    ------
+    files : `FileTransferMap`
+        A dictionary from `DatasetId` to a list of `FileTransferRecord`,
+        containing information about the files that were found for these
+        artifacts. If files were not found for a given `DatasetRef`, there
+        will be no entry for it in this dictionary.
+
+    Notes
+    -----
+    This will first attempt to look up records using the database, and then
+    fall back to searching the filesystem if the transfer source is configured
+    to do so.
+    """
+    log.verbose("Looking up source datastore records in %s", source_datastore.name)
+    refs_by_id = {ref.id: ref for ref in refs}
+    source_records = source_datastore.get_file_info_for_transfer(refs_by_id.keys())
+
+    log.debug("Number of datastore records found in source: %d", len(source_records))
+
+    # If we couldn't find all of the datasets in the database, continue
+    # searching. Some datastores may have artifacts on disk that do not have
+    # corresponding records in the database.
+    missing_ids = refs_by_id.keys() - source_records.keys()
+    if missing_ids:
+        log.info(
+            "Number of expected datasets missing from source datastore records: %d out of %d",
+            len(missing_ids),
+            len(refs_by_id),
+        )
+        missing_refs = {refs_by_id[id] for id in missing_ids}
+        found_records = source_datastore.locate_missing_files_for_transfer(missing_refs, artifact_existence)
+        source_records |= found_records
+
+        still_missing = len(missing_refs) - len(found_records)
+        if still_missing:
+            for ref in missing_refs:
+                if ref.id not in found_records:
+                    log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref)
+            log.warning(
+                "Encountered %d dataset%s where no file artifacts exist from the "
+                "source datastore and will be skipped.",
+                still_missing,
+                "s" if still_missing != 1 else "",
+            )
+
+    return source_records
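
A sketch of how the new helper is consumed, mirroring the `DirectButler.transfer_from` change further down in this diff; `_file_transfer_source` is the butler-internal attribute used there, and the wrapper function here is illustrative only:

```python
from collections.abc import Iterable

from lsst.daf.butler import DatasetRef, LimitedButler
from lsst.daf.butler.datastore import FileTransferMap
from lsst.daf.butler.datastores.file_datastore.transfer import retrieve_file_transfer_records
from lsst.resources import ResourcePath


def lookup_records(source_butler: LimitedButler, refs: Iterable[DatasetRef]) -> FileTransferMap:
    # The cache is filled in with every artifact whose existence gets checked,
    # so later transfer steps can reuse the answers.
    artifact_existence: dict[ResourcePath, bool] = {}
    return retrieve_file_transfer_records(
        source_butler._file_transfer_source, list(refs), artifact_existence
    )
```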

lsst/daf/butler/dimensions/_coordinate.py

@@ -427,6 +427,9 @@ class DataCoordinate:
     @overload
     def get(self, key: str, default: str) -> str: ...
 
+    @overload
+    def get(self, key: str, default: DataIdValue | None) -> DataIdValue | None: ...
+
     def get(self, key: str, default: DataIdValue | None = None) -> DataIdValue | None:
         try:
             return self.__getitem__(key)
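
The practical effect of the extra overload is that non-`str` defaults now type-check; a small sketch, with `data_id` assumed to be a resolved `DataCoordinate`:

```python
from lsst.daf.butler import DataCoordinate


def visit_or_default(data_id: DataCoordinate) -> int | str | None:
    # Previously only get(key) and get(key, default: str) were declared, so an
    # int default like this had no matching overload for static type checkers.
    return data_id.get("visit", 0)
```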

lsst/daf/butler/direct_butler/_direct_butler.py

@@ -75,6 +75,7 @@ from .._storage_class import StorageClass, StorageClassFactory
 from .._timespan import Timespan
 from ..datastore import Datastore, NullDatastore
 from ..datastores.file_datastore.retrieve_artifacts import ZipIndex, retrieve_and_zip
+from ..datastores.file_datastore.transfer import retrieve_file_transfer_records
 from ..dimensions import DataCoordinate, Dimension, DimensionGroup
 from ..direct_query_driver import DirectQueryDriver
 from ..progress import Progress
@@ -1765,7 +1766,6 @@ class DirectButler(Butler): # numpydoc ignore=PR02
         import_info = self._prepare_for_import_refs(
             self,
             refs,
-            skip_missing=False,
             register_dataset_types=True,
             dry_run=dry_run,
             transfer_dimensions=transfer_dimensions,
@@ -2063,7 +2063,6 @@
         source_butler: LimitedButler,
         source_refs: Iterable[DatasetRef],
         *,
-        skip_missing: bool = True,
         register_dataset_types: bool = False,
         transfer_dimensions: bool = False,
         dry_run: bool = False,
@@ -2087,27 +2086,6 @@
             str(self),
         )
 
-        # In some situations the datastore artifact may be missing
-        # and we do not want that registry entry to be imported.
-        # Asking datastore is not sufficient, the records may have been
-        # purged, we have to ask for the (predicted) URI and check
-        # existence explicitly. Execution butler is set up exactly like
-        # this with no datastore records.
-        artifact_existence: dict[ResourcePath, bool] = {}
-        if skip_missing:
-            dataset_existence = source_butler._datastore.mexists(
-                source_refs, artifact_existence=artifact_existence
-            )
-            source_refs = [ref for ref, exists in dataset_existence.items() if exists]
-            filtered_count = len(source_refs)
-            n_missing = original_count - filtered_count
-            _LOG.verbose(
-                "%d dataset%s removed because the artifact does not exist. Now have %d.",
-                n_missing,
-                "" if n_missing == 1 else "s",
-                filtered_count,
-            )
-
         # Importing requires that we group the refs by dimension group and run
         # before doing the import.
         source_dataset_types = set()
@@ -2207,7 +2185,7 @@
         dimension_records = self._extract_all_dimension_records_from_data_ids(
             source_butler, dataIds, elements
         )
-        return _ImportDatasetsInfo(grouped_refs, dimension_records
+        return _ImportDatasetsInfo(grouped_refs, dimension_records)
 
     def _import_dimension_records(
         self,
@@ -2294,15 +2272,40 @@
         dry_run: bool = False,
     ) -> collections.abc.Collection[DatasetRef]:
         # Docstring inherited.
+        source_refs = list(source_refs)
         if not self.isWriteable():
             raise TypeError("Butler is read-only.")
 
         progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
 
+        artifact_existence: dict[ResourcePath, bool] = {}
+        file_transfer_source = source_butler._file_transfer_source
+        transfer_records = retrieve_file_transfer_records(
+            file_transfer_source, source_refs, artifact_existence
+        )
+        # In some situations the datastore artifact may be missing and we do
+        # not want that registry entry to be imported. For example, this can
+        # happen if a file was removed but the dataset was left in the registry
+        # for provenance, or if a pipeline task didn't create all of the
+        # possible files in a QuantumBackedButler.
+        if skip_missing:
+            original_ids = {ref.id for ref in source_refs}
+            missing_ids = original_ids - transfer_records.keys()
+            if missing_ids:
+                original_count = len(source_refs)
+                source_refs = [ref for ref in source_refs if ref.id not in missing_ids]
+                filtered_count = len(source_refs)
+                n_missing = original_count - filtered_count
+                _LOG.verbose(
+                    "%d dataset%s removed because the artifact does not exist. Now have %d.",
+                    n_missing,
+                    "" if n_missing == 1 else "s",
+                    filtered_count,
+                )
+
         import_info = self._prepare_for_import_refs(
             source_butler,
             source_refs,
-            skip_missing=skip_missing,
             register_dataset_types=register_dataset_types,
             dry_run=dry_run,
             transfer_dimensions=transfer_dimensions,
@@ -2317,11 +2320,12 @@
 
         # Ask the datastore to transfer. The datastore has to check that
         # the source datastore is compatible with the target datastore.
+        _LOG.verbose("Transferring %d datasets from %s", len(transfer_records), file_transfer_source.name)
         accepted, rejected = self._datastore.transfer_from(
-
+            transfer_records,
             imported_refs,
             transfer=transfer,
-            artifact_existence=
+            artifact_existence=artifact_existence,
             dry_run=dry_run,
         )
         if rejected:
@@ -2567,4 +2571,3 @@ class _ImportDatasetsInfo(NamedTuple):
 
     grouped_refs: defaultdict[_RefGroup, list[DatasetRef]]
     dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]
-    artifact_existence: dict[ResourcePath, bool]

lsst/daf/butler/formatters/parquet.py

@@ -295,7 +295,7 @@ def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray | np.ma.MaskedArray:
     numpy_dict = arrow_to_numpy_dict(arrow_table)
 
     has_mask = False
-    dtype = []
+    dtype: list[tuple] = []
     for name, col in numpy_dict.items():
         if len(shape := numpy_dict[name].shape) <= 1:
             dtype.append((name, col.dtype))
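
The annotated list accumulates `(name, dtype)` or `(name, dtype, shape)` tuples that NumPy assembles into a structured dtype; a standalone illustration:

```python
import numpy as np

# Mixture of scalar and fixed-shape columns, as built up by the loop above.
dtype = [("a", np.float64), ("b", np.int32, (3,))]
arr = np.zeros(2, dtype=dtype)
print(arr.dtype)  # [('a', '<f8'), ('b', '<i4', (3,))]
```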
@@ -429,7 +429,11 @@ def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
     md = {}
     md[b"lsst::arrow::rowcount"] = str(len(np_array))
 
-
+    names = np_array.dtype.names
+    if names is None:
+        names = ()
+
+    for name in names:
         _append_numpy_string_metadata(md, name, np_array.dtype[name])
         _append_numpy_multidim_metadata(md, name, np_array.dtype[name])
 
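
The new guard covers plain (non-structured) arrays, whose dtype has no field names; iterating `dtype.names` directly would raise `TypeError` in that case:

```python
import numpy as np

structured = np.zeros(3, dtype=[("a", "f8"), ("b", "i4")])
plain = np.zeros(3)

print(structured.dtype.names)  # ('a', 'b')
print(plain.dtype.names)       # None
```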
@@ -1379,7 +1383,7 @@ def _numpy_dict_to_dtype(numpy_dict: dict[str, np.ndarray]) -> tuple[np.dtype, i
     """
     import numpy as np
 
-    dtype_list = []
+    dtype_list: list[tuple] = []
     rowcount = 0
     for name, col in numpy_dict.items():
         if rowcount == 0:

lsst/daf/butler/registry/interfaces/_database.py

@@ -1696,9 +1696,8 @@ class Database(ABC):
         for k, v in content.items():
             if k == name:
                 continue
-            column = table.columns[k]
             # The set only has one element
-            clauses.append(
+            clauses.append(table.columns[k] == v.pop())
 
         # The IN operator will not work for "infinite" numbers of
         # rows so must batch it up into distinct calls.
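
The clause being appended is a plain SQLAlchemy column-equality expression; a minimal standalone illustration (table and column names are placeholders):

```python
import sqlalchemy

metadata = sqlalchemy.MetaData()
table = sqlalchemy.Table("example", metadata, sqlalchemy.Column("k", sqlalchemy.Integer))

values = {3}  # "The set only has one element"
clause = table.columns["k"] == values.pop()
print(clause)  # example.k = :k_1
```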

lsst/daf/butler/registry/obscore/_config.py

@@ -178,6 +178,11 @@ class ObsCoreConfig(pydantic.BaseModel):
     indexing support, but a standard ``s_region`` column is always included.
     """
 
+    fallback_instrument: str | None = None
+    """Instrument to use if a dataset type does not have an instrument
+    dimension. Will be left unset if `None`. Can be dangerous to set this
+    in a repository containing data from multiple instruments."""
+
 
 class ConfigCollectionType(str, enum.Enum):
     """Enum class defining possible values for configuration attributes."""

lsst/daf/butler/registry/obscore/_records.py

@@ -394,12 +394,14 @@ class DafButlerRecordFactory(RecordFactory):
         dataId = ref.dataId
         record: dict[str, str | int | float | UUID | None] = {}
 
-        instrument_name = cast(str, dataId.get("instrument"))
+        instrument_name = cast(str | None, dataId.get("instrument", self.config.fallback_instrument))
         record["instrument_name"] = instrument_name
         if self.schema.dataset_fk is not None:
             record[self.schema.dataset_fk.name] = ref.id
 
-        record["facility_name"] = self.config.facility_map.get(
+        record["facility_name"] = self.config.facility_map.get(
+            instrument_name or "", self.config.facility_name
+        )
 
         timespan = dataId.timespan
         if timespan is not None:
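
A standalone illustration of the fallback behaviour introduced by the new `fallback_instrument` option and the two hunks above, using plain dicts in place of the `DataCoordinate` and `ObsCoreConfig` objects (all values are hypothetical):

```python
data_id = {"detector": 42}                     # data ID without an "instrument" dimension
fallback_instrument = "SomeCamera"             # hypothetical ObsCoreConfig.fallback_instrument
facility_map = {"SomeCamera": "SomeFacility"}  # hypothetical ObsCoreConfig.facility_map
default_facility = "Unknown facility"          # hypothetical ObsCoreConfig.facility_name

instrument_name = data_id.get("instrument", fallback_instrument)
facility_name = facility_map.get(instrument_name or "", default_facility)
print(instrument_name, facility_name)  # SomeCamera SomeFacility
```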

lsst/daf/butler/remote_butler/_http_connection.py

@@ -69,10 +69,14 @@ class RemoteButlerHttpConnection:
         self.server_url = server_url
         self._access_token = access_token
 
-
+        self._auth_headers = get_authentication_headers(access_token)
         headers = {"user-agent": f"RemoteButler/{__version__}"}
 
-        self._headers =
+        self._headers = self._auth_headers | headers
+
+    @property
+    def authentication_headers(self) -> dict[str, str]:
+        return self._auth_headers
 
     def post(self, path: str, model: BaseModel) -> httpx.Response:
         """Send a POST request to the Butler server.
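
A hypothetical use of the new accessor: code holding a `RemoteButlerHttpConnection` (such as the new `RemoteFileTransferSource`, whose implementation is not shown in this diff) can reuse the connection's auth headers for requests it issues itself:

```python
import httpx


def fetch(connection, url: str) -> bytes:
    # `connection` is assumed to be a RemoteButlerHttpConnection instance.
    response = httpx.get(url, headers=connection.authentication_headers)
    response.raise_for_status()
    return response.content
```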

lsst/daf/butler/remote_butler/_remote_butler.py

@@ -76,6 +76,7 @@ from ._query_results import convert_dataset_ref_results, read_query_results
 from ._ref_utils import apply_storage_class_override, normalize_dataset_type_name, simplify_dataId
 from ._registry import RemoteButlerRegistry
 from ._remote_butler_collections import RemoteButlerCollections
+from ._remote_file_transfer_source import RemoteFileTransferSource
 from .server_models import (
     CollectionList,
     FindDatasetRequestModel,
@@ -713,6 +714,10 @@ class RemoteButler(Butler): # numpydoc ignore=PR02
             connection=self._connection, cache=self._cache, defaults=defaults, metrics=metrics
         )
 
+    @property
+    def _file_transfer_source(self) -> RemoteFileTransferSource:
+        return RemoteFileTransferSource(self._connection)
+
     def __str__(self) -> str:
         return f"RemoteButler({self._connection.server_url})"
 
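
Taken together, these changes route a butler-to-butler `transfer_from()` (including one whose source is a `RemoteButler`) through the new `FileTransferSource` machinery. A hedged end-to-end sketch; repository URIs, collection and dataset type names are placeholders:

```python
from lsst.daf.butler import Butler

source = Butler("https://data.example.org/api/butler/repo/main")  # RemoteButler
target = Butler("/path/to/local/repo", writeable=True)            # DirectButler

refs = source.query_datasets("calexp", collections="some/collection")
target.transfer_from(source, refs, transfer="copy", skip_missing=True)
```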