lsst-resources 29.2025.1700-py3-none-any.whl → 29.2025.1900-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lsst/resources/_resourceHandles/_httpResourceHandle.py CHANGED
@@ -168,7 +168,9 @@ class HttpReadResourceHandle(BaseResourceHandle[bytes]):
         # return the result
         self._completeBuffer = io.BytesIO()
         with time_this(self._log, msg="Read from remote resource %s", args=(self._url,)):
-            resp = self._session.get(_dav_to_http(self._url), stream=False, timeout=self._timeout)
+            with self._session as session:
+                resp = session.get(_dav_to_http(self._url), stream=False, timeout=self._timeout)
+
         if (code := resp.status_code) not in (requests.codes.ok, requests.codes.partial):
             raise FileNotFoundError(f"Unable to read resource {self._url}; status code: {code}")
         self._completeBuffer.write(resp.content)
@@ -190,9 +192,10 @@ class HttpReadResourceHandle(BaseResourceHandle[bytes]):
         with time_this(
             self._log, msg="Read from remote resource %s using headers %s", args=(self._url, headers)
         ):
-            resp = self._session.get(
-                _dav_to_http(self._url), stream=False, timeout=self._timeout, headers=headers
-            )
+            with self._session as session:
+                resp = session.get(
+                    _dav_to_http(self._url), stream=False, timeout=self._timeout, headers=headers
+                )
 
         if resp.status_code == requests.codes.range_not_satisfiable:
             # Must have run off the end of the file. A standard file handle
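Both hunks above replace a bare `self._session.get(...)` with a `with self._session as session:` block, so each GET now goes through a session obtained and released via a context manager. A minimal, hedged sketch of the same pattern using plain `requests` (placeholder URL; this is not the handle's actual API):

    import io
    import requests

    def read_all(url: str, timeout: float = 60.0) -> bytes:
        # requests.Session supports the context-manager protocol; leaving the
        # block closes the session and its pooled connections.
        buffer = io.BytesIO()
        with requests.Session() as session:
            resp = session.get(url, stream=False, timeout=timeout)
        if resp.status_code not in (requests.codes.ok, requests.codes.partial):
            raise FileNotFoundError(f"Unable to read {url}; status code: {resp.status_code}")
        buffer.write(resp.content)
        return buffer.getvalue()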
lsst/resources/_resourcePath.py CHANGED
@@ -19,14 +19,14 @@ import copy
 import io
 import locale
 import logging
-import multiprocessing
 import os
 import posixpath
 import re
 import urllib.parse
-from functools import cache
+from collections import defaultdict
 from pathlib import Path, PurePath, PurePosixPath
 from random import Random
+from typing import TypeAlias
 
 try:
     import fsspec
@@ -39,7 +39,7 @@ from collections.abc import Iterable, Iterator
 from typing import TYPE_CHECKING, Any, Literal, NamedTuple, overload
 
 from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
-from .utils import get_tempdir
+from .utils import _get_num_workers, get_tempdir
 
 if TYPE_CHECKING:
     from .utils import TransactionProtocol
@@ -53,49 +53,81 @@ ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")
 # Precomputed escaped hash
 ESCAPED_HASH = urllib.parse.quote("#")
 
-# Maximum number of worker threads for parallelized operations.
-# If greater than 10, be aware that this number has to be consistent
-# with connection pool sizing (for example in urllib3).
-MAX_WORKERS = 10
 
-
-class MTransferResult(NamedTuple):
-    """Report on a bulk transfer."""
+class MBulkResult(NamedTuple):
+    """Report on a bulk operation."""
 
     success: bool
     exception: Exception | None
 
 
-def _get_int_env_var(env_var: str) -> int | None:
-    int_value = None
-    env_value = os.getenv(env_var)
-    if env_value is not None:
-        with contextlib.suppress(TypeError):
-            int_value = int(env_value)
-    return int_value
+_EXECUTOR_TYPE: TypeAlias = type[
+    concurrent.futures.ThreadPoolExecutor | concurrent.futures.ProcessPoolExecutor
+]
+
+# Cache value for executor class so as not to issue warning multiple
+# times but still allow tests to override the value.
+_POOL_EXECUTOR_CLASS: _EXECUTOR_TYPE | None = None
 
 
-@cache
-def _get_num_workers() -> int:
-    f"""Calculate the number of workers to use.
+def _get_executor_class() -> _EXECUTOR_TYPE:
+    """Return the executor class used for parallelized execution.
 
     Returns
     -------
-    num : `int`
-        The number of workers to use. Will use the value of the
-        ``LSST_RESOURCES_NUM_WORKERS`` environment variable if set. Will fall
-        back to using the CPU count (plus 2) but capped at {MAX_WORKERS}.
+    cls : `concurrent.futures.Executor`
+        The ``Executor`` class. Default is
+        `concurrent.futures.ThreadPoolExecutor`. Can be set explicitly by
+        setting the ``$LSST_RESOURCES_EXECUTOR`` environment variable to
+        "thread" or "process". Returns "thread" pool if the value of the
+        variable is not recognized.
+    """
+    global _POOL_EXECUTOR_CLASS
+
+    if _POOL_EXECUTOR_CLASS is not None:
+        return _POOL_EXECUTOR_CLASS
+
+    pool_executor_classes = {
+        "threads": concurrent.futures.ThreadPoolExecutor,
+        "process": concurrent.futures.ProcessPoolExecutor,
+    }
+    default_executor = "threads"
+    external = os.getenv("LSST_RESOURCES_EXECUTOR", default_executor)
+    if not external:
+        external = default_executor
+    if external not in pool_executor_classes:
+        log.warning(
+            "Unrecognized value of '%s' for LSST_RESOURCES_EXECUTOR env var. Using '%s'",
+            external,
+            default_executor,
+        )
+        external = default_executor
+    _POOL_EXECUTOR_CLASS = pool_executor_classes[external]
+    return _POOL_EXECUTOR_CLASS
+
+
+@contextlib.contextmanager
+def _patch_environ(new_values: dict[str, str]) -> Iterator[None]:
+    """Patch os.environ temporarily using the supplied values.
+
+    Parameters
+    ----------
+    new_values : `dict` [ `str`, `str` ]
+        New values to be stored in the environment.
     """
-    num_workers: int | None = None
-    num_workers = _get_int_env_var("LSST_RESOURCES_NUM_WORKERS")
-    if num_workers is None:
-        # CPU_LIMIT is used on nublado.
-        cpu_limit = _get_int_env_var("CPU_LIMIT") or multiprocessing.cpu_count()
-        if cpu_limit is not None:
-            num_workers = cpu_limit + 2
+    old_values: dict[str, str] = {}
+    for k, v in new_values.items():
+        if k in os.environ:
+            old_values[k] = os.environ[k]
+        os.environ[k] = v
 
-    # But don't ever return more than the maximum allowed.
-    return min([num_workers, MAX_WORKERS])
+    try:
+        yield
+    finally:
+        for k in new_values:
+            del os.environ[k]
+            if k in old_values:
+                os.environ[k] = old_values[k]
 
 
 class ResourcePath:  # numpydoc ignore=PR02
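The new module-level helpers make the pool type configurable: `_get_executor_class()` reads `LSST_RESOURCES_EXECUTOR` once and caches the result, and `_patch_environ()` lets the process-pool code path set `LSST_RESOURCES_NUM_WORKERS=1` so each worker subprocess sees a single-worker configuration. A hedged usage sketch (placeholder URIs; per the lookup table above, the recognized values are "threads" and "process"):

    import os

    # Must be set before the executor class is first resolved and cached.
    os.environ["LSST_RESOURCES_EXECUTOR"] = "process"

    from lsst.resources import ResourcePath

    uris = [ResourcePath(f"file:///tmp/demo/file{i}.txt") for i in range(4)]
    existence = ResourcePath.mexists(uris)  # exists() calls run in a process pool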
@@ -882,51 +914,101 @@ class ResourcePath: # numpydoc ignore=PR02
         raise NotImplementedError()
 
     @classmethod
-    def mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
+    def _group_uris(cls, uris: Iterable[ResourcePath]) -> dict[type[ResourcePath], list[ResourcePath]]:
+        """Group URIs by class/scheme."""
+        grouped: dict[type, list[ResourcePath]] = defaultdict(list)
+        for uri in uris:
+            grouped[uri.__class__].append(uri)
+        return grouped
+
+    @classmethod
+    def mexists(
+        cls, uris: Iterable[ResourcePath], *, num_workers: int | None = None
+    ) -> dict[ResourcePath, bool]:
         """Check for existence of multiple URIs at once.
 
         Parameters
         ----------
         uris : iterable of `ResourcePath`
             The URIs to test.
+        num_workers : `int` or `None`, optional
+            The number of parallel workers to use when checking for existence
+            If `None`, the default value will be taken from the environment.
+            If this number is higher than the default and a thread pool is
+            used, there may not be enough cached connections available.
 
         Returns
         -------
         existence : `dict` of [`ResourcePath`, `bool`]
             Mapping of original URI to boolean indicating existence.
         """
-        # Group by scheme to allow a subclass to be able to use
-        # specialized implementations.
-        grouped: dict[type, list[ResourcePath]] = {}
-        for uri in uris:
-            uri_class = uri.__class__
-            if uri_class not in grouped:
-                grouped[uri_class] = []
-            grouped[uri_class].append(uri)
-
         existence: dict[ResourcePath, bool] = {}
-        for uri_class in grouped:
-            existence.update(uri_class._mexists(grouped[uri_class]))
+        for uri_class, group in cls._group_uris(uris).items():
+            existence.update(uri_class._mexists(group, num_workers=num_workers))
 
         return existence
 
     @classmethod
-    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
+    def _mexists(
+        cls, uris: Iterable[ResourcePath], *, num_workers: int | None = None
+    ) -> dict[ResourcePath, bool]:
         """Check for existence of multiple URIs at once.
 
         Implementation helper method for `mexists`.
 
+
         Parameters
         ----------
         uris : iterable of `ResourcePath`
             The URIs to test.
+        num_workers : `int` or `None`, optional
+            The number of parallel workers to use when checking for existence
+            If `None`, the default value will be taken from the environment.
 
         Returns
         -------
         existence : `dict` of [`ResourcePath`, `bool`]
             Mapping of original URI to boolean indicating existence.
         """
-        with concurrent.futures.ThreadPoolExecutor(max_workers=_get_num_workers()) as exists_executor:
+        pool_executor_class = _get_executor_class()
+        if issubclass(pool_executor_class, concurrent.futures.ProcessPoolExecutor):
+            # Patch the environment to make it think there is only one worker
+            # for each subprocess.
+            with _patch_environ({"LSST_RESOURCES_NUM_WORKERS": "1"}):
+                return cls._mexists_pool(pool_executor_class, uris)
+        else:
+            return cls._mexists_pool(pool_executor_class, uris, num_workers=num_workers)
+
+    @classmethod
+    def _mexists_pool(
+        cls,
+        pool_executor_class: _EXECUTOR_TYPE,
+        uris: Iterable[ResourcePath],
+        *,
+        num_workers: int | None = None,
+    ) -> dict[ResourcePath, bool]:
+        """Check for existence of multiple URIs at once using specified pool
+        executor.
+
+        Implementation helper method for `_mexists`.
+
+        Parameters
+        ----------
+        pool_executor_class : `type` [ `concurrent.futures.Executor` ]
+            Type of executor pool to use.
+        uris : iterable of `ResourcePath`
+            The URIs to test.
+        num_workers : `int` or `None`, optional
+            The number of parallel workers to use when checking for existence
+            If `None`, the default value will be taken from the environment.
+
+        Returns
+        -------
+        existence : `dict` of [`ResourcePath`, `bool`]
+            Mapping of original URI to boolean indicating existence.
+        """
+        max_workers = num_workers if num_workers is not None else _get_num_workers()
+        with pool_executor_class(max_workers=max_workers) as exists_executor:
             future_exists = {exists_executor.submit(uri.exists): uri for uri in uris}
 
             results: dict[ResourcePath, bool] = {}
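A hedged usage sketch of the updated `mexists` (placeholder paths): the URIs are grouped by scheme via `_group_uris` and checked in parallel, with the pool capped at `num_workers` when one is given:

    from lsst.resources import ResourcePath

    uris = [ResourcePath(f"file:///tmp/demo/file{i}.txt") for i in range(10)]
    existence = ResourcePath.mexists(uris, num_workers=4)
    missing = [uri for uri, found in existence.items() if not found]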
@@ -947,7 +1029,68 @@ class ResourcePath: # numpydoc ignore=PR02
         overwrite: bool = False,
         transaction: TransactionProtocol | None = None,
         do_raise: bool = True,
-    ) -> dict[ResourcePath, MTransferResult]:
+    ) -> dict[ResourcePath, MBulkResult]:
+        """Transfer many files in bulk.
+
+        Parameters
+        ----------
+        transfer : `str`
+            Mode to use for transferring the resource. Generically there are
+            many standard options: copy, link, symlink, hardlink, relsymlink.
+            Not all URIs support all modes.
+        from_to : `list` [ `tuple` [ `ResourcePath`, `ResourcePath` ] ]
+            A sequence of the source URIs and the target URIs.
+        overwrite : `bool`, optional
+            Allow an existing file to be overwritten. Defaults to `False`.
+        transaction : `~lsst.resources.utils.TransactionProtocol`, optional
+            A transaction object that can (depending on implementation)
+            rollback transfers on error. Not guaranteed to be implemented.
+            The transaction object must be thread safe.
+        do_raise : `bool`, optional
+            If `True` an `ExceptionGroup` will be raised containing any
+            exceptions raised by the individual transfers. If `False`, or if
+            there were no exceptions, a dict reporting the status of each
+            `ResourcePath` will be returned.
+
+        Returns
+        -------
+        copy_status : `dict` [ `ResourcePath`, `MBulkResult` ]
+            A dict of all the transfer attempts with a value indicating
+            whether the transfer succeeded for the target URI. If ``do_raise``
+            is `True`, this will only be returned if there are no errors.
+        """
+        pool_executor_class = _get_executor_class()
+        if issubclass(pool_executor_class, concurrent.futures.ProcessPoolExecutor):
+            # Patch the environment to make it think there is only one worker
+            # for each subprocess.
+            with _patch_environ({"LSST_RESOURCES_NUM_WORKERS": "1"}):
+                return cls._mtransfer(
+                    pool_executor_class,
+                    transfer,
+                    from_to,
+                    overwrite=overwrite,
+                    transaction=transaction,
+                    do_raise=do_raise,
+                )
+        return cls._mtransfer(
+            pool_executor_class,
+            transfer,
+            from_to,
+            overwrite=overwrite,
+            transaction=transaction,
+            do_raise=do_raise,
+        )
+
+    @classmethod
+    def _mtransfer(
+        cls,
+        pool_executor_class: _EXECUTOR_TYPE,
+        transfer: str,
+        from_to: Iterable[tuple[ResourcePath, ResourcePath]],
+        overwrite: bool = False,
+        transaction: TransactionProtocol | None = None,
+        do_raise: bool = True,
+    ) -> dict[ResourcePath, MBulkResult]:
         """Transfer many files in bulk.
 
         Parameters
@@ -971,11 +1114,11 @@ class ResourcePath: # numpydoc ignore=PR02
 
         Returns
         -------
-        copy_status : `dict` [ `ResourcePath`, `MTransferResult` ]
+        copy_status : `dict` [ `ResourcePath`, `MBulkResult` ]
             A dict of all the transfer attempts with a value indicating
             whether the transfer succeeded for the target URI.
         """
-        with concurrent.futures.ThreadPoolExecutor(max_workers=_get_num_workers()) as transfer_executor:
+        with pool_executor_class(max_workers=_get_num_workers()) as transfer_executor:
             future_transfers = {
                 transfer_executor.submit(
                     to_uri.transfer_from,
@@ -987,17 +1130,17 @@ class ResourcePath: # numpydoc ignore=PR02
                 ): to_uri
                 for from_uri, to_uri in from_to
             }
-            results: dict[ResourcePath, MTransferResult] = {}
+            results: dict[ResourcePath, MBulkResult] = {}
             failed = False
             for future in concurrent.futures.as_completed(future_transfers):
                 to_uri = future_transfers[future]
                 try:
                     future.result()
                 except Exception as e:
-                    transferred = MTransferResult(False, e)
+                    transferred = MBulkResult(False, e)
                     failed = True
                 else:
-                    transferred = MTransferResult(True, None)
+                    transferred = MBulkResult(True, None)
                 results[to_uri] = transferred
 
         if do_raise and failed:
@@ -1012,6 +1155,81 @@ class ResourcePath: # numpydoc ignore=PR02
         """Remove the resource."""
         raise NotImplementedError()
 
+    @classmethod
+    def mremove(
+        cls, uris: Iterable[ResourcePath], *, do_raise: bool = True
+    ) -> dict[ResourcePath, MBulkResult]:
+        """Remove multiple URIs at once.
+
+        Parameters
+        ----------
+        uris : iterable of `ResourcePath`
+            URIs to remove.
+        do_raise : `bool`, optional
+            If `True` an `ExceptionGroup` will be raised containing any
+            exceptions raised by the individual transfers. If `False`, or if
+            there were no exceptions, a dict reporting the status of each
+            `ResourcePath` will be returned.
+
+        Returns
+        -------
+        results : `dict` [ `ResourcePath`, `MBulkResult` ]
+            Dictionary mapping each URI to a result object indicating whether
+            the removal succeeded or resulted in an exception. If ``do_raise``
+            is `True` this will only be returned if everything succeeded.
+        """
+        # Group URIs by scheme since some URI schemes support native bulk
+        # APIs.
+        results: dict[ResourcePath, MBulkResult] = {}
+        for uri_class, group in cls._group_uris(uris).items():
+            results.update(uri_class._mremove(group))
+        if do_raise:
+            failed = any(not r.success for r in results.values())
+            if failed:
+                s = "s" if len(results) != 1 else ""
+                raise ExceptionGroup(
+                    f"Error{s} removing {len(results)} artifact{s}",
+                    tuple(res.exception for res in results.values() if res.exception is not None),
+                )
+
+        return results
+
+    @classmethod
+    def _mremove(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, MBulkResult]:
+        """Remove multiple URIs using futures."""
+        pool_executor_class = _get_executor_class()
+        if issubclass(pool_executor_class, concurrent.futures.ProcessPoolExecutor):
+            # Patch the environment to make it think there is only one worker
+            # for each subprocess.
+            with _patch_environ({"LSST_RESOURCES_NUM_WORKERS": "1"}):
+                return cls._mremove_pool(pool_executor_class, uris)
+        else:
+            return cls._mremove_pool(pool_executor_class, uris)
+
+    @classmethod
+    def _mremove_pool(
+        cls,
+        pool_executor_class: _EXECUTOR_TYPE,
+        uris: Iterable[ResourcePath],
+        *,
+        num_workers: int | None = None,
+    ) -> dict[ResourcePath, MBulkResult]:
+        """Remove URIs using a futures pool."""
+        max_workers = num_workers if num_workers is not None else _get_num_workers()
+        results: dict[ResourcePath, MBulkResult] = {}
+        with pool_executor_class(max_workers=max_workers) as remove_executor:
+            future_remove = {remove_executor.submit(uri.remove): uri for uri in uris}
+            for future in concurrent.futures.as_completed(future_remove):
+                try:
+                    future.result()
+                except Exception as e:
+                    removed = MBulkResult(False, e)
+                else:
+                    removed = MBulkResult(True, None)
+                uri = future_remove[future]
+                results[uri] = removed
+        return results
+
     def isabs(self) -> bool:
         """Indicate that the resource is fully specified.
 
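A hedged sketch of the new bulk-removal API (placeholder URIs). With `do_raise=False` the per-URI `MBulkResult` entries carry any exception instead of it being raised as part of an `ExceptionGroup`:

    from lsst.resources import ResourcePath

    uris = [ResourcePath(f"file:///tmp/demo/file{i}.txt") for i in range(5)]
    results = ResourcePath.mremove(uris, do_raise=False)
    for uri, result in results.items():
        if not result.success:
            print(f"failed to remove {uri}: {result.exception}")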
lsst/resources/http.py CHANGED
@@ -59,7 +59,7 @@ from lsst.utils.timer import time_this
 from ._resourceHandles import ResourceHandleProtocol
 from ._resourceHandles._httpResourceHandle import HttpReadResourceHandle, parse_content_range_header
 from ._resourcePath import ResourcePath
-from .utils import get_tempdir
+from .utils import _get_num_workers, get_tempdir
 
 if TYPE_CHECKING:
     from .utils import TransactionProtocol
@@ -165,14 +165,14 @@ class HttpResourcePathConfig:
         if self._front_end_connections is not None:
             return self._front_end_connections
 
+        default_pool_size = max(_get_num_workers(), self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS)
+
         try:
             self._front_end_connections = int(
-                os.environ.get(
-                    "LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
-                )
+                os.environ.get("LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS", default_pool_size)
             )
         except ValueError:
-            self._front_end_connections = self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS
+            self._front_end_connections = default_pool_size
 
         return self._front_end_connections
 
@@ -182,14 +182,14 @@ class HttpResourcePathConfig:
         if self._back_end_connections is not None:
             return self._back_end_connections
 
+        default_pool_size = max(_get_num_workers(), self.DEFAULT_FRONTEND_PERSISTENT_CONNECTIONS)
+
         try:
             self._back_end_connections = int(
-                os.environ.get(
-                    "LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
-                )
+                os.environ.get("LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS", default_pool_size)
             )
         except ValueError:
-            self._back_end_connections = self.DEFAULT_BACKEND_PERSISTENT_CONNECTIONS
+            self._back_end_connections = default_pool_size
 
         return self._back_end_connections
 
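The two connection-pool properties above now default to `max(_get_num_workers(), ...)` rather than a fixed constant, so the HTTP pools track the worker count unless the environment variables override them. A hedged sketch of pinning everything explicitly when raising the worker count (values are illustrative and must be set before the configuration is first read and cached):

    import os

    os.environ["LSST_RESOURCES_NUM_WORKERS"] = "16"
    os.environ["LSST_HTTP_FRONTEND_PERSISTENT_CONNECTIONS"] = "16"
    os.environ["LSST_HTTP_BACKEND_PERSISTENT_CONNECTIONS"] = "16"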
@@ -587,7 +587,7 @@ class SessionStore:
         Note that "https://www.example.org" and "https://www.example.org:12345"
         will have different sessions since the port number is not identical.
         """
-        root_uri = str(rpath.root_uri())
+        root_uri = _dav_to_http(str(rpath.root_uri()))
         if root_uri not in self._sessions:
             # We don't have yet a session for this endpoint: create a new one.
             self._sessions[root_uri] = self._make_session(rpath)
lsst/resources/s3.py CHANGED
@@ -20,17 +20,19 @@ import os
 import re
 import sys
 import threading
+from collections import defaultdict
 from collections.abc import Iterable, Iterator
 from functools import cache, cached_property
 from typing import IO, TYPE_CHECKING, cast
 
 from botocore.exceptions import ClientError
 
+from lsst.utils.iteration import chunk_iterable
 from lsst.utils.timer import time_this
 
 from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
 from ._resourceHandles._s3ResourceHandle import S3ResourceHandle
-from ._resourcePath import ResourcePath
+from ._resourcePath import MBulkResult, ResourcePath
 from .s3utils import (
     _get_s3_connection_parameters,
     _s3_disable_bucket_validation,
@@ -220,7 +222,9 @@ class S3ResourcePath(ResourcePath):
         return bucket
 
     @classmethod
-    def _mexists(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, bool]:
+    def _mexists(
+        cls, uris: Iterable[ResourcePath], *, num_workers: int | None = None
+    ) -> dict[ResourcePath, bool]:
         # Force client to be created for each profile before creating threads.
         profiles = set[str | None]()
         for path in uris:
@@ -230,7 +234,56 @@ class S3ResourcePath(ResourcePath):
         for profile in profiles:
             getS3Client(profile)
 
-        return super()._mexists(uris)
+        return super()._mexists(uris, num_workers=num_workers)
+
+    @classmethod
+    def _mremove(cls, uris: Iterable[ResourcePath]) -> dict[ResourcePath, MBulkResult]:
+        # Delete multiple objects in one API call.
+        # Must group by profile and bucket.
+        grouped_uris: dict[tuple[str | None, str], list[S3ResourcePath]] = defaultdict(list)
+        for uri in uris:
+            uri = cast(S3ResourcePath, uri)
+            grouped_uris[uri._profile, uri._bucket].append(uri)
+
+        results: dict[ResourcePath, MBulkResult] = {}
+        for related_uris in grouped_uris.values():
+            # The client and bucket are the same for each of the remaining
+            # URIs.
+            first_uri = related_uris[0]
+            # API requires no more than 1000 per call.
+            for chunk in chunk_iterable(related_uris, chunk_size=1_000):
+                key_to_uri: dict[str, ResourcePath] = {}
+                keys: list[dict[str, str]] = []
+                for uri in chunk:
+                    key = uri.relativeToPathRoot
+                    key_to_uri[key] = uri
+                    keys.append({"Key": key})
+                    # Default to assuming everything worked.
+                    results[uri] = MBulkResult(True, None)
+                errored = cls._delete_related_objects(first_uri.client, first_uri._bucket, keys)
+
+                # Update with error information.
+                for key, bulk_result in errored.items():
+                    results[key_to_uri[key]] = bulk_result
+
+        return results
+
+    @classmethod
+    @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
+    def _delete_related_objects(
+        cls, client: boto3.client, bucket: str, keys: list[dict[str, str]]
+    ) -> dict[str, MBulkResult]:
+        # Delete multiple objects from the same bucket, allowing for backoff
+        # retry.
+        response = client.delete_objects(Bucket=bucket, Delete={"Objects": keys, "Quiet": True})
+        # Use Quiet mode so we assume everything worked unless told otherwise.
+        # Only returning errors -- indexed by Key name.
+        errors: dict[str, MBulkResult] = {}
+        for errored_key in response.get("Errors", []):
+            errors[errored_key["Key"]] = MBulkResult(
+                False, ClientError({"Error": errored_key}, f"delete_objects: {errored_key['Key']}")
+            )
+        return errors
 
     @backoff.on_exception(backoff.expo, retryable_io_errors, max_time=max_retry_time)
     def exists(self) -> bool:
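The `_delete_related_objects` helper above issues one `delete_objects` call per chunk of at most 1000 keys and, because of Quiet mode, only gets back the failures. A hedged sketch of that call pattern with plain boto3 (placeholder bucket and keys):

    import boto3

    client = boto3.client("s3")
    keys = [{"Key": f"scratch/file{i}.dat"} for i in range(5)]

    # Quiet mode: the response lists only the keys that failed to delete.
    response = client.delete_objects(
        Bucket="some-bucket", Delete={"Objects": keys, "Quiet": True}
    )
    for error in response.get("Errors", []):
        print(error["Key"], error.get("Code"), error.get("Message"))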
lsst/resources/s3utils.py CHANGED
@@ -53,6 +53,7 @@ except ImportError:
 
 from ._resourcePath import ResourcePath
 from .location import Location
+from .utils import _get_num_workers
 
 # https://pypi.org/project/backoff/
 try:
@@ -246,7 +247,13 @@ def _s3_disable_bucket_validation(client: boto3.client) -> None:
 @functools.lru_cache
 def _get_s3_client(endpoint_config: _EndpointConfig, skip_validation: bool) -> boto3.client:
     # Helper function to cache the client for this endpoint
-    config = botocore.config.Config(read_timeout=180, retries={"mode": "adaptive", "max_attempts": 10})
+    # boto seems to assume it will always have at least 10 available.
+    max_pool_size = max(_get_num_workers(), 10)
+    config = botocore.config.Config(
+        read_timeout=180,
+        max_pool_connections=max_pool_size,
+        retries={"mode": "adaptive", "max_attempts": 10},
+    )
 
     session = boto3.Session(profile_name=endpoint_config.profile)
 
lsst/resources/tests.py CHANGED
@@ -1027,6 +1027,8 @@ class GenericReadWriteTestCase(_GenericTestCase):
         expected_files = {
             "dir1/a.yaml",
             "dir1/b.yaml",
+            "dir1/c.yaml",
+            "dir1/d.yaml",
             "dir2/e.yaml",
         }
         expected_uris = {root.join(f) for f in expected_files}
@@ -1035,10 +1037,20 @@
             self.assertTrue(uri.exists())
         expected_uris.add(file)
 
-        multi = ResourcePath.mexists(expected_uris)
+        # Force to run with fewer workers than there are files.
+        multi = ResourcePath.mexists(expected_uris, num_workers=3)
 
         for uri, is_there in multi.items():
             if uri == file:
                 self.assertFalse(is_there)
             else:
                 self.assertTrue(is_there)
+
+        # Clean up. Unfortunately POSIX raises a FileNotFoundError but
+        # S3 boto does not complain if there is no key.
+        ResourcePath.mremove(expected_uris, do_raise=False)
+
+        # Check they were really removed.
+        multi = ResourcePath.mexists(expected_uris, num_workers=3)
+        for uri, is_there in multi.items():
+            self.assertFalse(is_there)
lsst/resources/utils.py CHANGED
@@ -15,6 +15,7 @@ __all__ = ("NoTransaction", "TransactionProtocol", "get_tempdir", "os2posix", "p
 
 import contextlib
 import logging
+import multiprocessing
 import os
 import posixpath
 import shutil
@@ -33,6 +34,11 @@ IS_POSIX = os.sep == posixpath.sep
 # posix means posix and only determine explicitly in the non-posix case.
 OS_ROOT_PATH = posixpath.sep if IS_POSIX else Path().resolve().root
 
+# Maximum number of worker threads for parallelized operations.
+# If greater than 10, be aware that this number has to be consistent
+# with connection pool sizing (for example in urllib3).
+MAX_WORKERS = 10
+
 log = logging.getLogger(__name__)
 
 
@@ -226,3 +232,40 @@ def ensure_directory_is_writeable(directory_path: str | bytes) -> None:
     desired_mode = current_mode | stat.S_IWUSR | stat.S_IXUSR
     if current_mode != desired_mode:
         os.chmod(directory_path, desired_mode)
+
+
+def _get_int_env_var(env_var: str) -> int | None:
+    int_value = None
+    env_value = os.getenv(env_var)
+    if env_value is not None:
+        with contextlib.suppress(TypeError):
+            int_value = int(env_value)
+    return int_value
+
+
+@cache
+def _get_num_workers() -> int:
+    f"""Calculate the number of workers to use.
+
+    Returns
+    -------
+    num : `int`
+        The number of workers to use. Will use the value of the
+        ``LSST_RESOURCES_NUM_WORKERS`` environment variable if set. Will fall
+        back to using the CPU count (plus 2) but capped at {MAX_WORKERS}.
+    """
+    num_workers: int | None = None
+    num_workers = _get_int_env_var("LSST_RESOURCES_NUM_WORKERS")
+
+    # If someone is explicitly specifying a number, let them use that number.
+    if num_workers is not None:
+        return num_workers
+
+    if num_workers is None:
+        # CPU_LIMIT is used on nublado.
+        cpu_limit = _get_int_env_var("CPU_LIMIT") or multiprocessing.cpu_count()
+        if cpu_limit is not None:
+            num_workers = cpu_limit + 2
+
+    # But don't ever return more than the maximum allowed.
+    return min([num_workers, MAX_WORKERS])
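The worker-count logic now lives in `lsst/resources/utils.py`. A hedged sketch of the precedence it implements (illustrative value; the function is cached, so the variable must be set before the first call): an integer `LSST_RESOURCES_NUM_WORKERS` is used as-is, otherwise `CPU_LIMIT` (used on nublado) or the CPU count plus 2, capped at `MAX_WORKERS`:

    import os

    os.environ["LSST_RESOURCES_NUM_WORKERS"] = "16"

    from lsst.resources.utils import _get_num_workers

    print(_get_num_workers())  # -> 16 (explicit values are not capped)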
lsst/resources/version.py CHANGED
@@ -1,2 +1,2 @@
 __all__ = ["__version__"]
-__version__ = "29.2025.1700"
+__version__ = "29.2025.1900"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lsst-resources
-Version: 29.2025.1700
+Version: 29.2025.1900
 Summary: An abstraction layer for reading and writing from URI file resources.
 Author-email: Rubin Observatory Data Management <dm-admin@lists.lsst.org>
 License: BSD 3-Clause License
@@ -1,28 +1,28 @@
 lsst/__init__.py,sha256=9I6UQ9gj-ZcPlvsa0OPBo76UujxXVehVzw9yMAOQvyM,466
 lsst/resources/__init__.py,sha256=BDj6uokvd0ZQNGl-Xgz5gZd83Z0L2gFqGSk0KJpylP8,778
-lsst/resources/_resourcePath.py,sha256=xTVyDHD-UHlF5FeDvSXXnsmOuoSFnORZD_wMksxiFfA,64926
+lsst/resources/_resourcePath.py,sha256=AnyqWz6TVgDjuZO4DjFj6q4J0-zbIduCFuxwopSS6Nk,73905
 lsst/resources/file.py,sha256=-jPuoHvTEtx5tnDyNkfwhWAyX0cTwkuMd-JvJn9EGdE,23226
 lsst/resources/gs.py,sha256=Lpo5GAzH7R7HG8E5RMGOdP4j4hjWJn-k6M3OXj0nHQM,12783
-lsst/resources/http.py,sha256=JW3cBe4MERyjopFKkELui1BRr4b4Mkgp0Iqt9YFIxuc,88227
+lsst/resources/http.py,sha256=9a_VadSabznPC0FTQtDtfV041zH25wZgXFtJ7HkvHp0,88275
 lsst/resources/location.py,sha256=x3Tq0x5o1OXYmZDxYBenUG1N71wtDhnjVAr3s2ZEiu8,7937
 lsst/resources/mem.py,sha256=VOWh7XxJPfqKcFdLZSjKEAfORQ2AHZHpxmjT8LniV60,1008
 lsst/resources/packageresource.py,sha256=vnfeRlpVwpC5cDQZE6Lnh8EH6oZy1sH2vLz9ONYjJ4k,6817
 lsst/resources/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-lsst/resources/s3.py,sha256=wrOMdFWltxpGEWeL--kPCbk5Le_viCIsEn4lOPZbXhM,24124
-lsst/resources/s3utils.py,sha256=cKJ9GWHETHhn1djezyikWwAaw4k1B3hFvfif96THHDQ,14355
+lsst/resources/s3.py,sha256=KH9oPThUMG6bvkE_gEgJU3KXY8RgIVbKsLu0orgv1Mo,26634
+lsst/resources/s3utils.py,sha256=ojWf9BPrK9mhGQ8jvs4_8Nsqf9360e79U5FnPTxe24A,14576
 lsst/resources/schemeless.py,sha256=GfJcKzZ0XIeepfQdW4HPZWiZlSp_ej0SEtSiJTrDUQs,10666
-lsst/resources/tests.py,sha256=MLB8hERKuNredzzg3Qq9M_U7IesV3xrbcjFwKuMp3Ok,43513
-lsst/resources/utils.py,sha256=IHVrOdj0szNWxiXk-jbZu1RhTR8WXks1vI9JCpBxeBA,6706
-lsst/resources/version.py,sha256=k6PcnewHb2vETZCsCS6q8mJ_uyMg-otfwuQ8B0oDDwQ,55
+lsst/resources/tests.py,sha256=SqYLbDG6QkZTB-0UvrsiPtfmdL1TcglGeqBTPQxu9GE,44027
+lsst/resources/utils.py,sha256=6O3Mq7JbPEtqyD2lM77pRpwcPMfV5SxiNMknw-F2vNs,8097
+lsst/resources/version.py,sha256=duakYcQBT5t4KBaOX14WCtq_sSvpoXvNcxi3sUb8GvI,55
 lsst/resources/_resourceHandles/__init__.py,sha256=zOcZ8gVEBdAWcHJaZabA8Vdq-wAVcxjbmA_1b1IWM6M,76
 lsst/resources/_resourceHandles/_baseResourceHandle.py,sha256=lQwxDOmFUNJndTxsjpz-HxrQBL0L-z4aXQocHdOEI7c,4676
 lsst/resources/_resourceHandles/_fileResourceHandle.py,sha256=A7_WQPzD0ZlOzNmaI_TPdZybrNxrXPkNHWVla3UFxfs,3676
-lsst/resources/_resourceHandles/_httpResourceHandle.py,sha256=JRjpE-ZQfgKX5OyVLulIbzW38FdhovcoOd1D4rhb5vk,10900
+lsst/resources/_resourceHandles/_httpResourceHandle.py,sha256=Yami8IVGeru4bLQCag-OvGG0ltz1qyEg57FY4IEB87Y,10995
 lsst/resources/_resourceHandles/_s3ResourceHandle.py,sha256=NkDmPb9bm_zMvr6mMnb-tBmqJDt0yUJrt2gZXR8l7ok,12923
-lsst_resources-29.2025.1700.dist-info/licenses/COPYRIGHT,sha256=yazVsoMmFwhiw5itGrdT4YPmXbpsQyUFjlpOyZIa77M,148
-lsst_resources-29.2025.1700.dist-info/licenses/LICENSE,sha256=7wrtgl8meQ0_RIuv2TjIKpAnNrl-ODH-QLwyHe9citI,1516
-lsst_resources-29.2025.1700.dist-info/METADATA,sha256=gybCKJmJyKdf0oqRgCbF6IKA0SzNalaw1nhptr1Y51I,2237
-lsst_resources-29.2025.1700.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
-lsst_resources-29.2025.1700.dist-info/top_level.txt,sha256=eUWiOuVVm9wwTrnAgiJT6tp6HQHXxIhj2QSZ7NYZH80,5
-lsst_resources-29.2025.1700.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-lsst_resources-29.2025.1700.dist-info/RECORD,,
+lsst_resources-29.2025.1900.dist-info/licenses/COPYRIGHT,sha256=yazVsoMmFwhiw5itGrdT4YPmXbpsQyUFjlpOyZIa77M,148
+lsst_resources-29.2025.1900.dist-info/licenses/LICENSE,sha256=7wrtgl8meQ0_RIuv2TjIKpAnNrl-ODH-QLwyHe9citI,1516
+lsst_resources-29.2025.1900.dist-info/METADATA,sha256=AsHgImBvDFy_E373VzU-E3sARIx2y1E7-gfLP-991is,2237
+lsst_resources-29.2025.1900.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+lsst_resources-29.2025.1900.dist-info/top_level.txt,sha256=eUWiOuVVm9wwTrnAgiJT6tp6HQHXxIhj2QSZ7NYZH80,5
+lsst_resources-29.2025.1900.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+lsst_resources-29.2025.1900.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (79.0.1)
+Generator: setuptools (80.3.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 