rucio-clients 37.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (104)
  1. rucio/__init__.py +17 -0
  2. rucio/alembicrevision.py +15 -0
  3. rucio/cli/__init__.py +14 -0
  4. rucio/cli/account.py +216 -0
  5. rucio/cli/bin_legacy/__init__.py +13 -0
  6. rucio/cli/bin_legacy/rucio.py +2825 -0
  7. rucio/cli/bin_legacy/rucio_admin.py +2500 -0
  8. rucio/cli/command.py +272 -0
  9. rucio/cli/config.py +72 -0
  10. rucio/cli/did.py +191 -0
  11. rucio/cli/download.py +128 -0
  12. rucio/cli/lifetime_exception.py +33 -0
  13. rucio/cli/replica.py +162 -0
  14. rucio/cli/rse.py +293 -0
  15. rucio/cli/rule.py +158 -0
  16. rucio/cli/scope.py +40 -0
  17. rucio/cli/subscription.py +73 -0
  18. rucio/cli/upload.py +60 -0
  19. rucio/cli/utils.py +226 -0
  20. rucio/client/__init__.py +15 -0
  21. rucio/client/accountclient.py +432 -0
  22. rucio/client/accountlimitclient.py +183 -0
  23. rucio/client/baseclient.py +983 -0
  24. rucio/client/client.py +120 -0
  25. rucio/client/configclient.py +126 -0
  26. rucio/client/credentialclient.py +59 -0
  27. rucio/client/didclient.py +868 -0
  28. rucio/client/diracclient.py +56 -0
  29. rucio/client/downloadclient.py +1783 -0
  30. rucio/client/exportclient.py +44 -0
  31. rucio/client/fileclient.py +50 -0
  32. rucio/client/importclient.py +42 -0
  33. rucio/client/lifetimeclient.py +90 -0
  34. rucio/client/lockclient.py +109 -0
  35. rucio/client/metaconventionsclient.py +140 -0
  36. rucio/client/pingclient.py +44 -0
  37. rucio/client/replicaclient.py +452 -0
  38. rucio/client/requestclient.py +125 -0
  39. rucio/client/richclient.py +317 -0
  40. rucio/client/rseclient.py +746 -0
  41. rucio/client/ruleclient.py +294 -0
  42. rucio/client/scopeclient.py +90 -0
  43. rucio/client/subscriptionclient.py +173 -0
  44. rucio/client/touchclient.py +82 -0
  45. rucio/client/uploadclient.py +969 -0
  46. rucio/common/__init__.py +13 -0
  47. rucio/common/bittorrent.py +234 -0
  48. rucio/common/cache.py +111 -0
  49. rucio/common/checksum.py +168 -0
  50. rucio/common/client.py +122 -0
  51. rucio/common/config.py +788 -0
  52. rucio/common/constants.py +217 -0
  53. rucio/common/constraints.py +17 -0
  54. rucio/common/didtype.py +237 -0
  55. rucio/common/exception.py +1208 -0
  56. rucio/common/extra.py +31 -0
  57. rucio/common/logging.py +420 -0
  58. rucio/common/pcache.py +1409 -0
  59. rucio/common/plugins.py +185 -0
  60. rucio/common/policy.py +93 -0
  61. rucio/common/schema/__init__.py +200 -0
  62. rucio/common/schema/generic.py +416 -0
  63. rucio/common/schema/generic_multi_vo.py +395 -0
  64. rucio/common/stomp_utils.py +423 -0
  65. rucio/common/stopwatch.py +55 -0
  66. rucio/common/test_rucio_server.py +154 -0
  67. rucio/common/types.py +483 -0
  68. rucio/common/utils.py +1688 -0
  69. rucio/rse/__init__.py +96 -0
  70. rucio/rse/protocols/__init__.py +13 -0
  71. rucio/rse/protocols/bittorrent.py +194 -0
  72. rucio/rse/protocols/cache.py +111 -0
  73. rucio/rse/protocols/dummy.py +100 -0
  74. rucio/rse/protocols/gfal.py +708 -0
  75. rucio/rse/protocols/globus.py +243 -0
  76. rucio/rse/protocols/http_cache.py +82 -0
  77. rucio/rse/protocols/mock.py +123 -0
  78. rucio/rse/protocols/ngarc.py +209 -0
  79. rucio/rse/protocols/posix.py +250 -0
  80. rucio/rse/protocols/protocol.py +361 -0
  81. rucio/rse/protocols/rclone.py +365 -0
  82. rucio/rse/protocols/rfio.py +145 -0
  83. rucio/rse/protocols/srm.py +338 -0
  84. rucio/rse/protocols/ssh.py +414 -0
  85. rucio/rse/protocols/storm.py +195 -0
  86. rucio/rse/protocols/webdav.py +594 -0
  87. rucio/rse/protocols/xrootd.py +302 -0
  88. rucio/rse/rsemanager.py +881 -0
  89. rucio/rse/translation.py +260 -0
  90. rucio/vcsversion.py +11 -0
  91. rucio/version.py +45 -0
  92. rucio_clients-37.0.0rc1.data/data/etc/rse-accounts.cfg.template +25 -0
  93. rucio_clients-37.0.0rc1.data/data/etc/rucio.cfg.atlas.client.template +43 -0
  94. rucio_clients-37.0.0rc1.data/data/etc/rucio.cfg.template +241 -0
  95. rucio_clients-37.0.0rc1.data/data/requirements.client.txt +19 -0
  96. rucio_clients-37.0.0rc1.data/data/rucio_client/merge_rucio_configs.py +144 -0
  97. rucio_clients-37.0.0rc1.data/scripts/rucio +133 -0
  98. rucio_clients-37.0.0rc1.data/scripts/rucio-admin +97 -0
  99. rucio_clients-37.0.0rc1.dist-info/METADATA +54 -0
  100. rucio_clients-37.0.0rc1.dist-info/RECORD +104 -0
  101. rucio_clients-37.0.0rc1.dist-info/WHEEL +5 -0
  102. rucio_clients-37.0.0rc1.dist-info/licenses/AUTHORS.rst +100 -0
  103. rucio_clients-37.0.0rc1.dist-info/licenses/LICENSE +201 -0
  104. rucio_clients-37.0.0rc1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1783 @@
+ # Copyright European Organization for Nuclear Research (CERN) since 2012
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import copy
+ import enum
+ import itertools
+ import logging
+ import os
+ import random
+ import secrets
+ import shutil
+ import signal
+ import subprocess
+ import time
+ from collections import deque
+ from queue import Empty, Queue
+ from threading import Thread
+ from typing import TYPE_CHECKING, Any, Optional
+
+ from rucio import version
+ from rucio.client.client import Client
+ from rucio.common.checksum import CHECKSUM_ALGO_DICT, GLOBALLY_SUPPORTED_CHECKSUMS, PREFERRED_CHECKSUM, adler32
+ from rucio.common.client import detect_client_location
+ from rucio.common.config import config_get
+ from rucio.common.didtype import DID
+ from rucio.common.exception import InputValidationError, NoFilesDownloaded, NotAllFilesDownloaded, RucioException
+ from rucio.common.pcache import Pcache
+ from rucio.common.utils import execute, extract_scope, generate_uuid, parse_replicas_from_file, parse_replicas_from_string, send_trace, sizefmt
+ from rucio.rse import rsemanager as rsemgr
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterable, Iterator
+     from xmlrpc.client import ServerProxy as RPCServerProxy
+
+     from rucio.common.constants import SORTING_ALGORITHMS_LITERAL
+     from rucio.common.types import LoggerFunction
+
+
+ @enum.unique
+ class FileDownloadState(str, enum.Enum):
+     """
+     The state a file can be in before/while/after downloading.
+     """
+     PROCESSING = "PROCESSING"
+     DOWNLOAD_ATTEMPT = "DOWNLOAD_ATTEMPT"
+     DONE = "DONE"
+     ALREADY_DONE = "ALREADY_DONE"
+     FOUND_IN_PCACHE = "FOUND_IN_PCACHE"
+     FILE_NOT_FOUND = "FILE_NOT_FOUND"
+     FAIL_VALIDATE = "FAIL_VALIDATE"
+     FAILED = "FAILED"
+
+
+ class BaseExtractionTool:
+
+     def __init__(
+         self,
+         program_name: str,
+         useability_check_args: str,
+         extract_args: str,
+         logger: "LoggerFunction" = logging.log
+     ):
+         """
+         Initialises an extraction tool object.
+
+         :param program_name: the name of the archive extraction program, e.g., unzip
+         :param useability_check_args: the arguments of the extraction program to test if it is installed, e.g., --version
+         :param extract_args: the arguments that will be passed to the program for extraction
+         :param logger: optional decorated logging.log object that can be passed from the calling daemon or client.
+         """
+         self.program_name = program_name
+         self.useability_check_args = useability_check_args
+         self.extract_args = extract_args
+         self.logger = logger
+         self.is_useable_result = None
+
+     def is_useable(self) -> bool:
+         """
+         Checks if the extraction tool is installed and usable.
+
+         :returns: True if it is usable otherwise False
+         """
+         if self.is_useable_result is not None:
+             return self.is_useable_result
+         self.is_useable_result = False
+         cmd = '%s %s' % (self.program_name, self.useability_check_args)
+         try:
+             exitcode, out, err = execute(cmd)
+             exitcode = int(exitcode)
+             self.logger(logging.DEBUG, '"%s" returned with exitcode %d' % (cmd, exitcode))
+             self.is_useable_result = (exitcode == 0)
+         except Exception as error:
+             self.logger(logging.DEBUG, 'Failed to execute: "%s"' % cmd)
+             self.logger(logging.DEBUG, error)
+         return self.is_useable_result
+
+     def try_extraction(
+         self,
+         archive_file_path: str,
+         file_to_extract: str,
+         dest_dir_path: str
+     ) -> bool:
+         """
+         Calls the extraction program to extract a file from an archive.
+
+         :param archive_file_path: path to the archive
+         :param file_to_extract: file name to extract from the archive
+         :param dest_dir_path: destination directory where the extracted file will be stored
+
+         :returns: True on success otherwise False
+         """
+         if not self.is_useable():
+             return False
+         args_map = {'archive_file_path': archive_file_path,
+                     'file_to_extract': file_to_extract,
+                     'dest_dir_path': dest_dir_path}
+         extract_args = self.extract_args % args_map
+         cmd = '%s %s' % (self.program_name, extract_args)
+         try:
+             exitcode, out, err = execute(cmd)
+             exitcode = int(exitcode)
+             self.logger(logging.DEBUG, '"%s" returned with exitcode %d' % (cmd, exitcode))
+             return (exitcode == 0)
+         except Exception as error:
+             self.logger(logging.DEBUG, 'Failed to execute: "%s"' % cmd)
+             self.logger(logging.DEBUG, error)
+         return False
+
+
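As a usage sketch (illustrative, not part of the packaged file; the archive and member paths are hypothetical), this mirrors the unzip configuration that DownloadClient registers below:

    import logging
    from rucio.client.downloadclient import BaseExtractionTool

    # Same argument template as used by DownloadClient.__init__ below
    extract_args = '%(archive_file_path)s %(file_to_extract)s -d %(dest_dir_path)s'
    unzip_tool = BaseExtractionTool('unzip', '-v', extract_args, logger=logging.log)
    if unzip_tool.is_useable():  # runs `unzip -v` once and caches the result
        ok = unzip_tool.try_extraction('/tmp/data.zip', 'member.root', '/tmp/out')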
+ class DownloadClient:
+
+     def __init__(
+         self,
+         client: Optional[Client] = None,
+         logger: Optional["LoggerFunction"] = None,
+         tracing: bool = True,
+         check_admin: bool = False,
+         check_pcache: bool = False
+     ):
+         """
+         Initialises the basic settings for a DownloadClient object.
+
+         :param client: Optional: rucio.client.client.Client object. If None, a new object will be created.
+         :param logger: Optional: logging.Logger object or decorated logging.log function. If None, the default logger will be used.
+         :param tracing: whether traces should be sent for downloads
+         :param check_admin: if True, check whether the account has the admin attribute and, if so, do not exclude tape RSEs
+         :param check_pcache: if True, check the local pcache before downloading
+         """
+         self.check_pcache = check_pcache
+         if logger is None:
+             self.logger = logging.log
+         else:
+             if hasattr(logger, "debug"):
+                 self.logger = logger.log
+             else:
+                 self.logger = logger
+
+         self.tracing = tracing
+         if not self.tracing:
+             self.logger(logging.DEBUG, 'Tracing is turned off.')
+
+         self.is_human_readable = True
+         self.client = client if client else Client()
+         # if a token should be used, use only JWT tokens
+         self.auth_token = self.client.auth_token if len(self.client.auth_token.split(".")) == 3 else None
+
+         self.client_location = detect_client_location()
+
+         self.is_tape_excluded = True
+         self.is_admin = False
+         if check_admin:
+             account_attributes = list(self.client.list_account_attributes(self.client.account))
+             for attr in account_attributes[0]:
+                 if attr['key'] == 'admin':
+                     self.is_admin = attr['value'] is True
+                     break
+         if self.is_admin:
+             self.is_tape_excluded = False
+             self.logger(logging.DEBUG, 'Admin mode enabled')
+
+         self.trace_tpl = {}
+         self.trace_tpl['hostname'] = self.client_location['fqdn']
+         self.trace_tpl['localSite'] = self.client_location['site']
+         self.trace_tpl['account'] = self.client.account
+         if self.client.vo != 'def':
+             self.trace_tpl['vo'] = self.client.vo
+         self.trace_tpl['eventType'] = 'download'
+         self.trace_tpl['eventVersion'] = 'api_%s' % version.RUCIO_VERSION[0]
+
+         self.use_cea_threshold = 10
+         self.extraction_tools = []
+
+         # unzip <archive_file_path> <did_name> -d <dest_dir_path>
+         extract_args = '%(archive_file_path)s %(file_to_extract)s -d %(dest_dir_path)s'
+         self.extraction_tools.append(BaseExtractionTool('unzip', '-v', extract_args, logger=self.logger))
+
+         # tar -C <dest_dir_path> -xf <archive_file_path> <did_name>
+         extract_args = '-C %(dest_dir_path)s -xf %(archive_file_path)s %(file_to_extract)s'
+         self.extraction_tools.append(BaseExtractionTool('tar', '--version', extract_args, logger=self.logger))
+         self.extract_scope_convention = config_get('common', 'extract_scope', False, None)
+
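A minimal construction sketch (illustrative, not part of the packaged file): a plain logging.Logger is accepted because the constructor detects it via hasattr(logger, "debug") and uses its .log method.

    import logging
    from rucio.client.client import Client
    from rucio.client.downloadclient import DownloadClient

    logger = logging.getLogger('my_download')  # hypothetical logger name
    download_client = DownloadClient(client=Client(), logger=logger, tracing=False)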
+     def download_pfns(
+         self,
+         items: list[dict[str, Any]],
+         num_threads: int = 2,
+         trace_custom_fields: Optional[dict[str, Any]] = None,
+         traces_copy_out: Optional[list[dict[str, Any]]] = None,
+         deactivate_file_download_exceptions: bool = False
+     ) -> list[dict[str, Any]]:
+         """
+         Download items with a given PFN. This function can only download files, no datasets.
+
+         :param items: List of dictionaries. Each dictionary describes a file to download. Keys:
+             pfn - PFN string of this file
+             did - DID string of this file (e.g. 'scope:file.name'). Wildcards are not allowed
+             rse - rse name (e.g. 'CERN-PROD_DATADISK'). RSE expressions are not allowed
+             base_dir - Optional: base directory where the downloaded files will be stored. (Default: '.')
+             no_subdir - Optional: If true, files are written directly into base_dir. (Default: False)
+             adler32 - Optional: the adler32 checksum to compare the downloaded file's adler32 checksum with
+             md5 - Optional: the md5 checksum to compare the downloaded file's md5 checksum with
+             transfer_timeout - Optional: timeout time for the download protocols. (Default: None)
+             check_local_with_filesize_only - Optional: If true, already downloaded files will not be validated by checksum.
+         :param num_threads: Suggestion of number of threads to use for the download. It will be lowered if it's too high.
+         :param trace_custom_fields: Custom key/value pairs to send with the traces
+         :param traces_copy_out: reference to an external list, where the traces should be uploaded
+         :param deactivate_file_download_exceptions: Boolean, if file download exceptions shouldn't be raised
+
+         :returns: a list of dictionaries with an entry for each file, containing the input options, the DID, and the clientState
+                   clientState can be one of the following: ALREADY_DONE, DONE, FILE_NOT_FOUND, FAIL_VALIDATE, FAILED
+
+         :raises InputValidationError: if one of the input items is in the wrong format
+         :raises NoFilesDownloaded: if no files could be downloaded
+         :raises NotAllFilesDownloaded: if not all files could be downloaded
+         :raises RucioException: if something unexpected went wrong during the download
+         """
+         trace_custom_fields = trace_custom_fields or {}
+         logger = self.logger
+         trace_custom_fields['uuid'] = generate_uuid()
+
+         logger(logging.INFO, 'Processing %d item(s) for input' % len(items))
+         input_items = []
+         for item in items:
+             did_str = item.get('did')
+             pfn = item.get('pfn')
+             rse = item.get('rse')
+
+             if not did_str or not pfn or not rse:
+                 logger(logging.DEBUG, item)
+                 raise InputValidationError('The keys did, pfn, and rse are mandatory')
+
+             item['input_dids'] = {DID(did_str): {}}
+
+             logger(logging.DEBUG, 'Preparing PFN download of %s (%s) from %s' % (did_str, pfn, rse))
+
+             if '*' in did_str:
+                 logger(logging.DEBUG, did_str)
+                 raise InputValidationError('Cannot use PFN download with wildcard in DID')
+
+             did_scope, did_name = self._split_did_str(did_str)
+             dest_dir_path = self._prepare_dest_dir(item.get('base_dir', '.'), did_scope, item.get('no_subdir'))
+
+             item['scope'] = did_scope
+             item['name'] = did_name
+             item['sources'] = [{'pfn': pfn, 'rse': rse}]
+             did_path_name = did_name
+             if did_name.startswith('/'):
+                 did_path_name = did_name[1:]
+             dest_file_path = os.path.join(dest_dir_path, did_path_name)
+             item['dest_file_paths'] = [dest_file_path]
+             item['temp_file_path'] = '%s.part' % dest_file_path
+             options = item.setdefault('merged_options', {})
+             options['ignore_checksum'] = 'adler32' not in item and 'md5' not in item
+             options.setdefault('transfer_timeout', item.pop('transfer_timeout', None))
+
+             input_items.append(item)
+
+         num_files_in = len(input_items)
+         output_items = self._download_multithreaded(input_items, num_threads, trace_custom_fields, traces_copy_out)
+         num_files_out = len(output_items)
+
+         if not deactivate_file_download_exceptions and num_files_in != num_files_out:
+             raise RucioException('%d items were in the input queue but only %d are in the output queue' % (num_files_in, num_files_out))
+
+         return self._check_output(output_items, deactivate_file_download_exceptions=deactivate_file_download_exceptions)
+
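An illustrative call, with hypothetical DID, PFN and RSE values; did, pfn and rse are the mandatory keys checked above:

    items = [{
        'did': 'user.jdoe:file.name',
        'pfn': 'https://storage.example.org/rucio/user.jdoe/file.name',
        'rse': 'EXAMPLE_DATADISK',
        'adler32': '01234567',  # optional; omitting adler32/md5 disables checksum validation
    }]
    results = download_client.download_pfns(items, num_threads=2)
    for result in results:
        print(result['did'], result['clientState'])  # e.g. DONE, ALREADY_DONE, FAILED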
+     def download_dids(
+         self,
+         items: list[dict[str, Any]],
+         num_threads: int = 2,
+         trace_custom_fields: Optional[dict[str, Any]] = None,
+         traces_copy_out: Optional[list[dict[str, Any]]] = None,
+         deactivate_file_download_exceptions: bool = False,
+         sort: Optional["SORTING_ALGORITHMS_LITERAL"] = None
+     ) -> list[dict[str, Any]]:
+         """
+         Download items with given DIDs. This function can also download datasets and wildcarded DIDs.
+
+         :param items: List of dictionaries. Each dictionary describes an item to download. Keys:
+             did - DID string of this file (e.g. 'scope:file.name')
+             filters - filter to select DIDs for download. Optional if a DID is given
+             rse - Optional: rse name (e.g. 'CERN-PROD_DATADISK') or rse expression from where to download
+             impl - Optional: name of the protocol implementation to be used to download this item.
+             no_resolve_archives - Optional: bool indicating whether archives should not be considered for download (Default: False)
+             resolve_archives - Deprecated: use no_resolve_archives instead
+             force_scheme - Optional: force a specific scheme to download this item. (Default: None)
+             base_dir - Optional: base directory where the downloaded files will be stored. (Default: '.')
+             no_subdir - Optional: If true, files are written directly into base_dir. (Default: False)
+             nrandom - Optional: if the DID addresses a dataset, nrandom files will be randomly chosen for download from the dataset
+             ignore_checksum - Optional: If true, skips the checksum validation between the downloaded file and the Rucio catalogue. (Default: False)
+             transfer_timeout - Optional: timeout time for the download protocols. (Default: None)
+             transfer_speed_timeout - Optional: minimum allowed transfer speed (in KBps). Ignored if transfer_timeout is set. Otherwise used to compute a default timeout (Default: 500)
+             check_local_with_filesize_only - Optional: If true, already downloaded files will not be validated by checksum.
+         :param num_threads: Suggestion of number of threads to use for the download. It will be lowered if it's too high.
+         :param trace_custom_fields: Custom key/value pairs to send with the traces.
+         :param traces_copy_out: reference to an external list, where the traces should be uploaded
+         :param deactivate_file_download_exceptions: Boolean, if file download exceptions shouldn't be raised
+         :param sort: Select the best replica using a replica sorting algorithm. Available algorithms:
+             ``geoip`` - based on the geographical distance between source and destination IP
+
+         :returns: a list of dictionaries with an entry for each file, containing the input options, the DID, and the clientState
+
+         :raises InputValidationError: if one of the input items is in the wrong format
+         :raises NoFilesDownloaded: if no files could be downloaded
+         :raises NotAllFilesDownloaded: if not all files could be downloaded
+         :raises RucioException: if something unexpected went wrong during the download
+         """
+         trace_custom_fields = trace_custom_fields or {}
+         logger = self.logger
+         trace_custom_fields['uuid'] = generate_uuid()
+
+         logger(logging.INFO, 'Processing %d item(s) for input' % len(items))
+         did_to_input_items, file_items_with_sources = self._resolve_and_merge_input_items(copy.deepcopy(items), sort=sort)
+         self.logger(logging.DEBUG, 'num_unmerged_items=%d; num_dids=%d; num_file_items=%d' % (len(items), len(did_to_input_items), len(file_items_with_sources)))
+
+         input_items = self._prepare_items_for_download(did_to_input_items, file_items_with_sources)
+
+         num_files_in = len(input_items)
+         output_items = self._download_multithreaded(input_items, num_threads, trace_custom_fields, traces_copy_out)
+         num_files_out = len(output_items)
+
+         if not deactivate_file_download_exceptions and num_files_in != num_files_out:
+             raise RucioException('%d items were in the input queue but only %d are in the output queue' % (num_files_in, num_files_out))
+
+         return self._check_output(output_items, deactivate_file_download_exceptions=deactivate_file_download_exceptions)
+
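An illustrative call with hypothetical names, showing a dataset DID with nrandom and a wildcarded file DID restricted by an RSE expression:

    items = [
        {'did': 'user.jdoe:dataset.name', 'nrandom': 2, 'base_dir': '/data'},
        {'did': 'user.jdoe:file.*', 'rse': 'tier=1'},
    ]
    results = download_client.download_dids(items, num_threads=4, sort='geoip')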
+     def download_from_metalink_file(
+         self,
+         item: dict[str, Any],
+         metalink_file_path: str,
+         num_threads: int = 2,
+         trace_custom_fields: Optional[dict[str, Any]] = None,
+         traces_copy_out: Optional[list[dict[str, Any]]] = None,
+         deactivate_file_download_exceptions: bool = False
+     ) -> list[dict[str, Any]]:
+         """
+         Download items using a given metalink file.
+
+         :param item: dictionary describing an item to download. Keys:
+             base_dir - Optional: base directory where the downloaded files will be stored. (Default: '.')
+             no_subdir - Optional: If true, files are written directly into base_dir. (Default: False)
+             ignore_checksum - Optional: If true, skips the checksum validation between the downloaded file and the Rucio catalogue. (Default: False)
+             transfer_timeout - Optional: timeout time for the download protocols. (Default: None)
+             check_local_with_filesize_only - Optional: If true, already downloaded files will not be validated by checksum.
+         :param metalink_file_path: path to the metalink file to read the sources from
+         :param num_threads: Suggestion of number of threads to use for the download. It will be lowered if it's too high.
+         :param trace_custom_fields: Custom key/value pairs to send with the traces.
+         :param traces_copy_out: reference to an external list, where the traces should be uploaded
+         :param deactivate_file_download_exceptions: Boolean, if file download exceptions shouldn't be raised
+
+         :returns: a list of dictionaries with an entry for each file, containing the input options, the DID, and the clientState
+
+         :raises InputValidationError: if one of the input items is in the wrong format
+         :raises NoFilesDownloaded: if no files could be downloaded
+         :raises NotAllFilesDownloaded: if not all files could be downloaded
+         :raises RucioException: if something unexpected went wrong during the download
+         """
+         trace_custom_fields = trace_custom_fields or {}
+         logger = self.logger
+
+         logger(logging.INFO, 'Getting sources from metalink file')
+         metalinks = parse_replicas_from_file(metalink_file_path)
+
+         trace_custom_fields['uuid'] = generate_uuid()
+
+         did_to_options = {}
+         for metalink in metalinks:
+             did = DID(metalink['did'])
+             did_to_options[did] = [item]
+             metalink['input_dids'] = {did: {}}
+
+         input_items = self._prepare_items_for_download(did_to_options, metalinks)
+
+         num_files_in = len(input_items)
+         output_items = self._download_multithreaded(input_items, num_threads, trace_custom_fields, traces_copy_out)
+         num_files_out = len(output_items)
+
+         if not deactivate_file_download_exceptions and num_files_in != num_files_out:
+             raise RucioException('%d items were in the input queue but only %d are in the output queue' % (num_files_in, num_files_out))
+
+         return self._check_output(output_items, deactivate_file_download_exceptions=deactivate_file_download_exceptions)
+
+     def _download_multithreaded(
+         self,
+         input_items: list[dict[str, Any]],
+         num_threads: int,
+         trace_custom_fields: Optional[dict[str, Any]] = None,
+         traces_copy_out: Optional[list[dict[str, Any]]] = None
+     ) -> list[dict[str, Any]]:
+         """
+         Starts an appropriate number of threads to download items from the input list.
+         (This function is meant to be used as class internal only)
+
+         :param input_items: list containing the input items to download
+         :param num_threads: suggestion of how many threads should be started
+         :param trace_custom_fields: Custom key/value pairs to send with the traces
+         :param traces_copy_out: reference to an external list, where the traces should be uploaded
+
+         :returns: list with output items as dictionaries
+         """
+         trace_custom_fields = trace_custom_fields or {}
+         logger = self.logger
+
+         num_files = len(input_items)
+         nlimit = 5
+         num_threads = max(1, num_threads)
+         num_threads = min(num_files, num_threads, nlimit)
+
+         input_queue = Queue()
+         output_queue = Queue()
+         input_queue.queue = deque(input_items)
+
+         if num_threads < 2:
+             logger(logging.INFO, 'Using main thread to download %d file(s)' % num_files)
+             self._download_worker(input_queue, output_queue, trace_custom_fields, traces_copy_out, '')
+             return list(output_queue.queue)
+
+         logger(logging.INFO, 'Using %d threads to download %d files' % (num_threads, num_files))
+         threads = []
+         for thread_num in range(0, num_threads):
+             log_prefix = 'Thread %s/%s: ' % (thread_num, num_threads)
+             kwargs = {'input_queue': input_queue,
+                       'output_queue': output_queue,
+                       'trace_custom_fields': trace_custom_fields,
+                       'traces_copy_out': traces_copy_out,
+                       'log_prefix': log_prefix}
+             try:
+                 thread = Thread(target=self._download_worker, kwargs=kwargs)
+                 thread.start()
+                 threads.append(thread)
+             except Exception as error:
+                 logger(logging.WARNING, 'Failed to start thread %d' % thread_num)
+                 logger(logging.DEBUG, error)
+
+         try:
+             logger(logging.DEBUG, 'Waiting for threads to finish')
+             for thread in threads:
+                 thread.join()
+         except KeyboardInterrupt:
+             logger(logging.WARNING, 'You pressed Ctrl+C! Exiting gracefully')
+             for thread in threads:
+                 thread.kill_received = True
+         return list(output_queue.queue)
+
+     def _download_worker(
+         self,
+         input_queue: Queue,
+         output_queue: Queue,
+         trace_custom_fields: dict[str, Any],
+         traces_copy_out: Optional[list[dict[str, Any]]],
+         log_prefix: str
+     ) -> None:
+         """
+         This function runs as long as there are items in the input queue,
+         downloads them and stores the output in the output queue.
+         (This function is meant to be used as class internal only)
+
+         :param input_queue: queue containing the input items to download
+         :param output_queue: queue where the output items will be stored
+         :param trace_custom_fields: Custom key/value pairs to send with the traces
+         :param traces_copy_out: reference to an external list, where the traces should be uploaded
+         :param log_prefix: string that will be put at the beginning of every log message
+         """
+         logger = self.logger
+
+         logger(logging.DEBUG, '%sStart processing queued downloads' % log_prefix)
+         while True:
+             try:
+                 item = input_queue.get_nowait()
+             except Empty:
+                 break
+             try:
+                 trace = copy.deepcopy(self.trace_tpl)
+                 trace.update(trace_custom_fields)
+                 download_result = self._download_item(item, trace, traces_copy_out, log_prefix)
+                 output_queue.put(download_result)
+             except KeyboardInterrupt:
+                 logger(logging.WARNING, 'You pressed Ctrl+C! Exiting gracefully')
+                 os.kill(os.getpgid(0), signal.SIGINT)  # os.getpgid() requires a pid; 0 means the current process
+                 break
+             except Exception as error:
+                 logger(logging.ERROR, '%sFailed to download item' % log_prefix)
+                 logger(logging.DEBUG, error)
+                 item["clientState"] = "FAILED"
+                 output_queue.put(item)
+
+     @staticmethod
+     def _compute_actual_transfer_timeout(item: dict[str, Any]) -> int:
+         """
+         Merge the two options related to timeout into the value which will be used for protocol download.
+
+         :param item: dictionary that describes the item to download
+
+         :return: timeout in seconds
+         """
+         default_transfer_timeout = 360
+         default_transfer_speed_timeout = 500  # KBps
+         # Static additive increment of the speed timeout, to cover the fixed cost of
+         # establishing connections and of downloading small files
+         transfer_speed_timeout_static_increment = 60
+
+         transfer_timeout: Optional[int] = item.get('merged_options', {}).get('transfer_timeout')
+         if transfer_timeout is not None:
+             return transfer_timeout
+
+         transfer_speed_timeout: Optional[int] = item.get('merged_options', {}).get('transfer_speed_timeout')
+         bytes_ = item.get('bytes')
+         if not bytes_ or transfer_speed_timeout is None:
+             return default_transfer_timeout
+
+         if not transfer_speed_timeout > 0:
+             transfer_speed_timeout = default_transfer_speed_timeout
+
+         # convert from KBytes/s to bytes/s
+         transfer_speed_timeout = transfer_speed_timeout * 1000
+         timeout = bytes_ // transfer_speed_timeout + transfer_speed_timeout_static_increment
+         return timeout
+
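To make the arithmetic concrete: with no explicit transfer_timeout, a 2 GB replica and the default speed timeout of 500 KBps give

    bytes_ = 2_000_000_000
    transfer_speed_timeout = 500 * 1000              # KBps converted to bytes/s
    timeout = bytes_ // transfer_speed_timeout + 60  # static increment
    assert timeout == 4060                           # seconds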
+     def _download_item(
+         self,
+         item: dict[str, Any],
+         trace: dict[str, Any],
+         traces_copy_out: Optional[list[dict[str, Any]]],
+         log_prefix: str = ''
+     ) -> dict[str, Any]:
+         """
+         Downloads the given item and sends traces for success/failure.
+         (This function is meant to be used as class internal only)
+
+         :param item: dictionary that describes the item to download
+         :param trace: dictionary representing a pattern of trace that will be sent
+         :param traces_copy_out: reference to an external list, where the traces should be uploaded
+         :param log_prefix: string that will be put at the beginning of every log message
+
+         :returns: dictionary with all attributes from the input item and a clientState attribute
+         """
+         logger = self.logger
+         pcache = Pcache() if self.check_pcache and len(item.get('archive_items', [])) == 0 else None
+         did_scope = item['scope']
+         did_name = item['name']
+         did_str = '%s:%s' % (did_scope, did_name)
+         logger(logging.INFO, '%sPreparing download of %s' % (log_prefix, did_str))
+
+         trace['scope'] = did_scope
+         trace['filename'] = did_name
+         trace.setdefault('datasetScope', item.get('dataset_scope', ''))
+         trace.setdefault('dataset', item.get('dataset_name', ''))
+         trace.setdefault('filesize', item.get('bytes'))
+         trace.setdefault('clientState', FileDownloadState.PROCESSING)
+         trace.setdefault('stateReason', 'UNKNOWN')
+
+         dest_file_paths = item['dest_file_paths']
+
+         # append the trace to the list reference, if the reference exists
+         if traces_copy_out is not None:
+             traces_copy_out.append(trace)
+
+         # if the file already exists, make sure it exists at all destination paths, set the state, send the trace, and return
+         for dest_file_path in dest_file_paths:
+             if os.path.isfile(dest_file_path):
+                 if item.get('merged_options', {}).get('check_local_with_filesize_only', False):
+                     local_filesize = os.stat(dest_file_path).st_size
+                     if item.get('bytes') != local_filesize:
+                         logger(logging.INFO, '%sFile with same name exists locally, but filesize mismatches: %s' % (log_prefix, did_str))
+                         logger(logging.DEBUG, '%slocal filesize: %d bytes, expected filesize: %d bytes' % (log_prefix, local_filesize, item.get('bytes')))
+                         continue
+                 elif not item.get('merged_options', {}).get('ignore_checksum', False):
+                     verified, _, _ = _verify_checksum(item, dest_file_path)
+                     if not verified:
+                         logger(logging.INFO, '%sFile with same name exists locally, but checksum mismatches: %s' % (log_prefix, did_str))
+                         continue
+
+                 logger(logging.INFO, '%sFile exists already locally: %s' % (log_prefix, did_str))
+                 for missing_file_path in dest_file_paths:
+                     if not os.path.isfile(missing_file_path):
+                         logger(logging.DEBUG, "copying '%s' to '%s'" % (dest_file_path, missing_file_path))
+                         shutil.copy2(dest_file_path, missing_file_path)
+                 item['clientState'] = FileDownloadState.ALREADY_DONE
+                 trace['transferStart'] = time.time()
+                 trace['transferEnd'] = time.time()
+                 trace['clientState'] = FileDownloadState.ALREADY_DONE
+                 self._send_trace(trace)
+                 return item
+
+         # check if the file has replicas
+         sources = item.get('sources')
+         if not sources or not len(sources):
+             logger(logging.WARNING, '%sNo available source found for file: %s' % (log_prefix, did_str))
+             item['clientState'] = FileDownloadState.FILE_NOT_FOUND
+             trace['clientState'] = FileDownloadState.FILE_NOT_FOUND
+             trace['stateReason'] = 'No available sources'
+             self._send_trace(trace)
+             return item
+
+         # checking pcache
+         storage_prefix = None
+         if pcache:
+
+             # checking only the first replica is enough
+             pfn = sources[0]['pfn']
+             rse_name = sources[0]['rse']
+
+             # protocols are needed to extract the deterministic part of the pfn
+             scheme = None
+             prots = self.client.get_protocols(rse_name)
+             for prot in prots:
+                 if prot['scheme'] in pfn and prot['prefix'] in pfn:
+                     scheme = prot['scheme']
+                     storage_prefix = prot['prefix']
+
+             # proceed with the actual check
+             logger(logging.INFO, 'Checking whether %s is in pcache' % dest_file_path)
+             pcache_state = None
+             hardlink_state = None
+             try:
+                 pcache_state, hardlink_state = pcache.check_and_link(src=pfn, storage_root=storage_prefix, dst=dest_file_path)
+             except Exception as e:
+                 logger(logging.WARNING, 'Pcache failure: %s' % str(e))
+
+             # if the file is found in pcache, send a trace and return
+             if pcache_state == 0 and hardlink_state == 1:
+                 logger(logging.INFO, 'File found in pcache.')
+                 item['clientState'] = FileDownloadState.FOUND_IN_PCACHE
+                 trace['transferStart'] = time.time()
+                 trace['transferEnd'] = time.time()
+                 trace['clientState'] = FileDownloadState.FOUND_IN_PCACHE
+                 self._send_trace(trace)
+                 return item
+             else:
+                 logger(logging.INFO, 'File not found in pcache.')
+
+         # try different PFNs until one succeeds
+         temp_file_path = item['temp_file_path']
+         success = False
+         i = 0
+         while not success and i < len(sources):
+             source = sources[i]
+             i += 1
+             pfn = source['pfn']
+             rse_name = source['rse']
+             scheme = pfn.split(':')[0]
+
+             try:
+                 rse = rsemgr.get_rse_info(rse_name, vo=self.client.vo)
+             except RucioException as error:
+                 logger(logging.WARNING, '%sCould not get info of RSE %s: %s' % (log_prefix, rse_name, error))
+                 trace['stateReason'] = str(error)
+                 continue
+
+             trace['remoteSite'] = rse_name
+             trace['clientState'] = FileDownloadState.DOWNLOAD_ATTEMPT
+             trace['protocol'] = scheme
+
+             transfer_timeout = self._compute_actual_transfer_timeout(item)
+             timeout_log_string = ""
+             if transfer_timeout:
+                 timeout_log_string = " and timeout of %ds" % transfer_timeout
+
+             logger(logging.INFO, '%sTrying to download with %s%s from %s: %s ' % (log_prefix, scheme, timeout_log_string, rse_name, did_str))
+
+             impl = item.get('impl')
+             if impl:
+                 logger(logging.INFO, '%sUsing implementation (impl): %s ' % (log_prefix, impl))
+
+             try:
+                 protocol = rsemgr.create_protocol(rse, operation='read', scheme=scheme, impl=impl, auth_token=self.auth_token, logger=logger)
+                 protocol.connect()
+             except Exception as error:
+                 logger(logging.WARNING, '%sFailed to create protocol for PFN: %s' % (log_prefix, pfn))
+                 logger(logging.DEBUG, 'scheme: %s, exception: %s' % (scheme, error))
+                 trace['stateReason'] = str(error)
+                 continue
+
+             logger(logging.INFO, '%sUsing PFN: %s' % (log_prefix, pfn))
+             attempt = 0
+             retries = 2
+             # retry a few times with the same PFN if the download fails
+             while not success and attempt < retries:
+                 attempt += 1
+                 item['attemptnr'] = attempt
+
+                 if os.path.isfile(temp_file_path):
+                     logger(logging.DEBUG, '%sDeleting existing temporary file: %s' % (log_prefix, temp_file_path))
+                     os.unlink(temp_file_path)
+
+                 start_time = time.time()
+
+                 try:
+                     protocol.get(pfn, temp_file_path, transfer_timeout=transfer_timeout)
+                     success = True
+                 except Exception as error:
+                     logger(logging.DEBUG, error)
+                     trace['clientState'] = FileDownloadState.FAILED
+                     trace['stateReason'] = str(error)
+
+                 end_time = time.time()
+
+                 if success and not item.get('merged_options', {}).get('ignore_checksum', False):
+                     verified, rucio_checksum, local_checksum = _verify_checksum(item, temp_file_path)
+                     if not verified:
+                         success = False
+                         os.unlink(temp_file_path)
+                         logger(logging.WARNING, '%sChecksum validation failed for file: %s' % (log_prefix, did_str))
+                         logger(logging.DEBUG, 'Local checksum: %s, Rucio checksum: %s' % (local_checksum, rucio_checksum))
+                         trace['clientState'] = FileDownloadState.FAIL_VALIDATE
+                         trace['stateReason'] = 'Checksum validation failed: Local checksum: %s, Rucio checksum: %s' % (local_checksum, rucio_checksum)
+                 if not success:
+                     logger(logging.WARNING, '%sDownload attempt failed. Try %s/%s' % (log_prefix, attempt, retries))
+                     self._send_trace(trace)
+
+             protocol.close()
+
+         if not success:
+             logger(logging.ERROR, '%sFailed to download file %s' % (log_prefix, did_str))
+             item['clientState'] = FileDownloadState.FAILED
+             return item
+
+         dest_file_path_iter = iter(dest_file_paths)
+         first_dest_file_path = next(dest_file_path_iter)
+         logger(logging.DEBUG, "renaming '%s' to '%s'" % (temp_file_path, first_dest_file_path))
+         os.rename(temp_file_path, first_dest_file_path)
+
+         # if the file was downloaded successfully, it can be linked into pcache
+         if pcache:
+             logger(logging.INFO, 'File %s is going to be registered into pcache.' % first_dest_file_path)
+             try:
+                 pcache_state, hardlink_state = pcache.check_and_link(src=pfn, storage_root=storage_prefix, local_src=first_dest_file_path)
+                 logger(logging.INFO, 'File %s is now registered into pcache.' % first_dest_file_path)
+             except Exception as e:
+                 logger(logging.WARNING, 'Failed to load file to pcache: %s' % str(e))
+
+         for cur_dest_file_path in dest_file_path_iter:
+             logger(logging.DEBUG, "copying '%s' to '%s'" % (first_dest_file_path, cur_dest_file_path))
+             shutil.copy2(first_dest_file_path, cur_dest_file_path)
+
+         trace['transferStart'] = start_time
+         trace['transferEnd'] = end_time
+         trace['clientState'] = FileDownloadState.DONE
+         trace['stateReason'] = 'OK'
+         item['clientState'] = FileDownloadState.DONE
+         self._send_trace(trace)
+
+         duration = round(end_time - start_time, 2)
+         size = item.get('bytes')
+         size_str = sizefmt(size, self.is_human_readable)
+         if size and duration:
+             rate = round((size / duration) * 1e-6, 2)
+             logger(logging.INFO, '%sFile %s successfully downloaded. %s in %s seconds = %s MBps' % (log_prefix, did_str, size_str, duration, rate))
+         else:
+             logger(logging.INFO, '%sFile %s successfully downloaded in %s seconds' % (log_prefix, did_str, duration))
+
+         file_items_in_archive = item.get('archive_items', [])
+         if len(file_items_in_archive) > 0:
+             logger(logging.INFO, '%sExtracting %d file(s) from %s' % (log_prefix, len(file_items_in_archive), did_name))
+
+             archive_file_path = first_dest_file_path
+             for file_item in file_items_in_archive:
+                 extraction_ok = False
+                 extract_file_name = file_item['name']
+                 dest_file_path_iter = iter(file_item['dest_file_paths'])
+                 first_dest_file_path = next(dest_file_path_iter)
+                 dest_dir = os.path.dirname(first_dest_file_path)
+                 logger(logging.DEBUG, '%sExtracting %s to %s' % (log_prefix, extract_file_name, dest_dir))
+                 for extraction_tool in self.extraction_tools:
+                     if extraction_tool.try_extraction(archive_file_path, extract_file_name, dest_dir):
+                         extraction_ok = True
+                         break
+
+                 if not extraction_ok:
+                     logger(logging.ERROR, 'Extraction of file %s from archive %s failed.' % (extract_file_name, did_name))
+                     continue
+
+                 first_dest_file_path = os.path.join(dest_dir, extract_file_name)
+                 for cur_dest_file_path in dest_file_path_iter:
+                     logger(logging.DEBUG, "copying '%s' to '%s'" % (first_dest_file_path, cur_dest_file_path))
+                     shutil.copy2(first_dest_file_path, cur_dest_file_path)
+
+             if not item.get('shall_keep_archive'):
+                 logger(logging.DEBUG, '%sDeleting archive %s' % (log_prefix, did_name))
+                 os.remove(archive_file_path)
+
+         return item
+
+     def download_aria2c(
+         self,
+         items: list[dict[str, Any]],
+         trace_custom_fields: Optional[dict[str, Any]] = None,
+         filters: Optional[dict[str, Any]] = None,
+         deactivate_file_download_exceptions: bool = False,
+         sort: Optional["SORTING_ALGORITHMS_LITERAL"] = None
+     ) -> list[dict[str, Any]]:
+         """
+         Uses aria2c to download the items with given DIDs. This function can also download datasets and wildcarded DIDs.
+         It can only download files that are available via https/davs.
+         Aria2c needs to be installed and X509_USER_PROXY needs to be set!
+
+         :param items: List of dictionaries. Each dictionary describes an item to download. Keys:
+             did - DID string of this file (e.g. 'scope:file.name'). Wildcards are not allowed
+             rse - Optional: rse name (e.g. 'CERN-PROD_DATADISK') or rse expression from where to download
+             base_dir - Optional: base directory where the downloaded files will be stored. (Default: '.')
+             no_subdir - Optional: If true, files are written directly into base_dir. (Default: False)
+             nrandom - Optional: if the DID addresses a dataset, nrandom files will be randomly chosen for download from the dataset
+             ignore_checksum - Optional: If true, skips the checksum validation between the downloaded file and the Rucio catalogue. (Default: False)
+             check_local_with_filesize_only - Optional: If true, already downloaded files will not be validated by checksum.
+         :param trace_custom_fields: Custom key/value pairs to send with the traces
+         :param filters: dictionary containing filter options
+         :param deactivate_file_download_exceptions: Boolean, if file download exceptions shouldn't be raised
+         :param sort: Select the best replica using a replica sorting algorithm. Available algorithms:
+             ``geoip`` - based on the geographical distance between source and destination IP
+
+         :returns: a list of dictionaries with an entry for each file, containing the input options, the DID, and the clientState
+
+         :raises InputValidationError: if one of the input items is in the wrong format
+         :raises NoFilesDownloaded: if no files could be downloaded
+         :raises NotAllFilesDownloaded: if not all files could be downloaded
+         :raises RucioException: if something went wrong during the download (e.g. aria2c could not be started)
+         """
+         trace_custom_fields = trace_custom_fields or {}
+         filters = filters or {}
+         logger = self.logger
+         trace_custom_fields['uuid'] = generate_uuid()
+
+         rpc_secret = '%x' % (secrets.randbits(64))
+         rpc_auth = 'token:%s' % rpc_secret
+         rpcproc, aria_rpc = self._start_aria2c_rpc(rpc_secret)
+
+         for item in items:
+             item['force_scheme'] = ['https', 'davs']
+             item['no_resolve_archives'] = True
+
+         logger(logging.INFO, 'Processing %d item(s) for input' % len(items))
+         did_to_input_items, file_items_with_sources = self._resolve_and_merge_input_items(copy.deepcopy(items), sort=sort)
+         self.logger(logging.DEBUG, 'num_unmerged_items=%d; num_dids=%d; num_file_items=%d' % (len(items), len(did_to_input_items), len(file_items_with_sources)))
+
+         input_items = self._prepare_items_for_download(did_to_input_items, file_items_with_sources)
+
+         output_items = []  # keep defined even if the download below raises
+         try:
+             output_items = self._download_items_aria2c(input_items, aria_rpc, rpc_auth, trace_custom_fields)
+         except Exception as error:
+             self.logger(logging.ERROR, 'Unknown exception during aria2c download')
+             self.logger(logging.DEBUG, error)
+         finally:
+             try:
+                 aria_rpc.aria2.forceShutdown(rpc_auth)
+             finally:
+                 rpcproc.terminate()
+
+         return self._check_output(output_items, deactivate_file_download_exceptions=deactivate_file_download_exceptions)
+
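A usage sketch under the stated prerequisites (aria2c installed, X509_USER_PROXY exported; the proxy path and DID are hypothetical):

    import os

    os.environ['X509_USER_PROXY'] = '/tmp/x509up_u1000'
    items = [{'did': 'user.jdoe:dataset.name', 'base_dir': '/data'}]
    results = download_client.download_aria2c(items, sort='geoip')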
+     def _start_aria2c_rpc(self, rpc_secret: str) -> tuple[subprocess.Popen, "RPCServerProxy"]:
+         """
+         Starts aria2c in RPC mode as a subprocess. Also creates
+         the RPC proxy instance.
+         (This function is meant to be used as class internal only)
+
+         :param rpc_secret: the secret for the RPC proxy
+
+         :returns: a tuple with the process and the rpc proxy objects
+
+         :raises RucioException: if the process or the proxy could not be created
+         """
+         logger = self.logger
+         from xmlrpc.client import ServerProxy as RPCServerProxy
+
+         cmd = 'aria2c '\
+               '--enable-rpc '\
+               '--certificate=$X509_USER_PROXY '\
+               '--private-key=$X509_USER_PROXY '\
+               '--ca-certificate=/etc/pki/tls/certs/CERN-bundle.pem '\
+               '--quiet=true '\
+               '--allow-overwrite=true '\
+               '--auto-file-renaming=false '\
+               '--stop-with-process=%d '\
+               '--rpc-secret=%s '\
+               '--rpc-listen-all=false '\
+               '--rpc-max-request-size=100M '\
+               '--connect-timeout=5 '\
+               '--rpc-listen-port=%d'
+
+         logger(logging.INFO, 'Starting aria2c rpc server...')
+
+         # try up to 3 random ports
+         for attempt in range(3):
+             port = random.randint(1024, 65534)  # noqa: S311
+             logger(logging.DEBUG, 'Trying to start rpc server on port: %d' % port)
+             try:
+                 to_exec = cmd % (os.getpid(), rpc_secret, port)
+                 logger(logging.DEBUG, to_exec)
+                 rpcproc = subprocess.Popen(
+                     to_exec,  # run the formatted command, not the unformatted template
+                     shell=True,
+                     stdin=subprocess.PIPE,
+                     stdout=subprocess.PIPE,
+                     stderr=subprocess.PIPE
+                 )
+             except Exception as error:
+                 raise RucioException('Failed to execute aria2c!', error)
+
+             # if the port is in use, aria2c should fail to start, so give it some time
+             time.sleep(2)
+
+             # did it fail?
+             if rpcproc.poll() is not None:
+                 (out, err) = rpcproc.communicate()
+                 logger(logging.DEBUG, 'Failed to start aria2c with port: %d' % port)
+                 logger(logging.DEBUG, 'aria2c output: %s' % out)
+             else:
+                 break
+
+         if rpcproc.poll() is not None:
+             raise RucioException('Failed to start aria2c rpc server!')
+
+         try:
+             aria_rpc = RPCServerProxy('http://localhost:%d/rpc' % port)
+         except Exception as error:
+             rpcproc.kill()
+             raise RucioException('Failed to initialise rpc proxy!', error)
+         return (rpcproc, aria_rpc)
+
+     def _download_items_aria2c(
+         self,
+         items: list[dict[str, Any]],
+         aria_rpc: Any,
+         rpc_auth: str,
+         trace_custom_fields: Optional[dict[str, Any]] = None
+     ) -> list[dict[str, Any]]:
+         """
+         Uses aria2c to download the given items. Aria2c needs to be started
+         as an RPC background process first and an RPC proxy is needed.
+         (This function is meant to be used as class internal only)
+
+         :param items: list of dictionaries containing one dict for each file to download
+         :param aria_rpc: RPC proxy to the aria2c process
+         :param rpc_auth: the rpc authentication token
+         :param trace_custom_fields: Custom key/value pairs to send with the traces
+
+         :returns: a list of dictionaries with an entry for each file, containing the input options, the DID, and the clientState
+         """
+         trace_custom_fields = trace_custom_fields or {}
+         logger = self.logger
+
+         gid_to_item = {}  # maps an aria2c download id (gid) to the download item
+         pfn_to_rse = {}
+         items_to_queue = [item for item in items]
+
+         # items get removed from gid_to_item when they are complete or failed
+         while len(gid_to_item) or len(items_to_queue):
+             num_queued = 0
+
+             # queue up to 100 files and then check aria2c's status
+             while (num_queued < 100) and len(items_to_queue):
+                 item = items_to_queue.pop()
+
+                 file_scope = item['scope']
+                 file_name = item['name']
+                 file_did_str = '%s:%s' % (file_scope, file_name)
+                 trace = {'scope': file_scope,
+                          'filename': file_name,
+                          'datasetScope': item.get('dataset_scope', ''),
+                          'dataset': item.get('dataset_name', ''),
+                          'protocol': 'https',
+                          'remoteSite': '',
+                          'filesize': item.get('bytes', None),
+                          'transferStart': time.time(),
+                          'transferEnd': time.time()}
+                 trace.update(self.trace_tpl)
+                 trace.update(trace_custom_fields)
+
+                 # get pfns from all replicas
+                 pfns = []
+                 for src in item['sources']:
+                     pfn = src['pfn']
+                     if pfn[0:4].lower() == 'davs':
+                         pfn = pfn.replace('davs', 'https', 1)
+                     pfns.append(pfn)
+                     pfn_to_rse[pfn] = src['rse']
+
+                 # does the file exist and are sources available?
+                 # workaround: only consider the first dest file path for the aria2c download
+                 dest_file_path = next(iter(item['dest_file_paths']))
+                 if os.path.isfile(dest_file_path):
+                     logger(logging.INFO, 'File exists already locally: %s' % file_did_str)
+                     item['clientState'] = FileDownloadState.ALREADY_DONE
+                     trace['clientState'] = FileDownloadState.ALREADY_DONE
+                     self._send_trace(trace)
+                 elif len(pfns) == 0:
+                     logger(logging.WARNING, 'No available source found for file: %s' % file_did_str)
+                     item['clientState'] = FileDownloadState.FILE_NOT_FOUND
+                     trace['clientState'] = FileDownloadState.FILE_NOT_FOUND
+                     self._send_trace(trace)
+                 else:
+                     item['trace'] = trace
+                     options = {'dir': os.path.dirname(dest_file_path),
+                                'out': os.path.basename(item['temp_file_path'])}
+                     gid = aria_rpc.aria2.addUri(rpc_auth, pfns, options)
+                     gid_to_item[gid] = item
+                     num_queued += 1
+                     logger(logging.DEBUG, 'Queued file: %s' % file_did_str)
+
+             # get some statistics
+             aria_stat = aria_rpc.aria2.getGlobalStat(rpc_auth)
+             num_active = int(aria_stat['numActive'])
+             num_waiting = int(aria_stat['numWaiting'])
+             num_stopped = int(aria_stat['numStoppedTotal'])
+
+             # save the start time if one of the active downloads has started
+             active = aria_rpc.aria2.tellActive(rpc_auth, ['gid', 'completedLength'])
+             for dlinfo in active:
+                 gid = dlinfo['gid']
+                 if int(dlinfo['completedLength']) > 0:
+                     gid_to_item[gid].setdefault('transferStart', time.time())
+
+             stopped = aria_rpc.aria2.tellStopped(rpc_auth, -1, num_stopped, ['gid', 'status', 'files'])
+             for dlinfo in stopped:
+                 gid = dlinfo['gid']
+                 item = gid_to_item[gid]
+
+                 file_scope = item['scope']
+                 file_name = item['name']
+                 file_did_str = '%s:%s' % (file_scope, file_name)
+                 temp_file_path = item['temp_file_path']
+                 # workaround: only consider the first dest file path for the aria2c download
+                 dest_file_path = next(iter(item['dest_file_paths']))
+
+                 # ensure we didn't miss the active state (e.g. a very fast download)
+                 start_time = item.setdefault('transferStart', time.time())
+                 end_time = item.setdefault('transferEnd', time.time())
+
+                 # get the used pfn for the traces
+                 trace = item['trace']
+                 for uri in dlinfo['files'][0]['uris']:
+                     if uri['status'].lower() == 'used':
+                         trace['remoteSite'] = pfn_to_rse.get(uri['uri'], '')
+
+                 trace['transferStart'] = start_time
+                 trace['transferEnd'] = end_time
+
+                 # ensure the file exists
+                 status = dlinfo.get('status', '').lower()
+                 if status == 'complete' and os.path.isfile(temp_file_path):
+                     # checksum check
+                     skip_check = item.get('ignore_checksum', False)
+                     rucio_checksum = 0 if skip_check else item.get('adler32')
+                     local_checksum = 0 if skip_check else adler32(temp_file_path)
+                     if str(rucio_checksum).lstrip('0') == str(local_checksum).lstrip('0'):
+                         item['clientState'] = FileDownloadState.DONE
+                         trace['clientState'] = FileDownloadState.DONE
+                         # remove the .part ending
+                         os.rename(temp_file_path, dest_file_path)
+
+                         # calculate the duration
+                         duration = round(end_time - start_time, 2)
+                         duration = max(duration, 0.01)  # protect against division by zero
+                         size = item.get('bytes', 0)
+                         rate = round((size / duration) * 1e-6, 2)
+                         size_str = sizefmt(size, self.is_human_readable)
+                         logger(logging.INFO, 'File %s successfully downloaded. %s in %s seconds = %s MBps' % (file_did_str,
+                                                                                                               size_str,
+                                                                                                               duration,
+                                                                                                               rate))
+                     else:
+                         os.unlink(temp_file_path)
+                         logger(logging.WARNING, 'Checksum validation failed for file: %s' % file_did_str)
+                         logger(logging.DEBUG, 'Local checksum: %s, Rucio checksum: %s' % (local_checksum, rucio_checksum))
+                         item['clientState'] = FileDownloadState.FAIL_VALIDATE
+                         trace['clientState'] = FileDownloadState.FAIL_VALIDATE
+                 else:
+                     logger(logging.ERROR, 'Failed to download file: %s' % file_did_str)
+                     logger(logging.DEBUG, 'Aria2c status: %s' % status)
+                     item['clientState'] = FileDownloadState.FAILED
+                     trace['clientState'] = FileDownloadState.DOWNLOAD_ATTEMPT
+
+                 self._send_trace(trace)
+                 del item['trace']
+
+                 aria_rpc.aria2.removeDownloadResult(rpc_auth, gid)
+                 del gid_to_item[gid]
+
+             if len(stopped) > 0:
+                 logger(logging.INFO, 'Active: %d, Waiting: %d, Stopped: %d' % (num_active, num_waiting, num_stopped))
+
+         return items
+
+     def _resolve_one_item_dids(self, item: dict[str, Any]) -> "Iterator[dict[str, Any]]":
+         """
+         Resolve scopes or wildcard DIDs to lists of full DID names.
+
+         :param item: one input item
+         """
+         dids = item.get('did')
+         filters = item.get('filters', {})
+         if filters:
+             filters = copy.copy(filters)
+
+         if dids is None:
+             self.logger(logging.DEBUG, 'Resolving DIDs by using filter options')
+             scope = filters.pop('scope')
+             for did in self.client.list_dids(scope, filters=filters, did_type='all', long=True):
+                 yield did
+             return
+
+         if not isinstance(dids, list):
+             dids = [dids]
+
+         for did_str in dids:
+             scope, did_name = self._split_did_str(did_str)
+             filters['name'] = did_name
+             any_did_resolved = False
+             for did in self.client.list_dids(scope, filters=filters, did_type='all', long=True):
+                 yield did
+                 any_did_resolved = True
+
+             # Maintain compatibility with existing code, which expects non-existing DIDs to be
+             # passed through in order to correctly set the trace state to FILE_NOT_FOUND
+             if not any_did_resolved and '*' not in did_name:
+                 yield {'scope': scope, 'name': did_name}
+
+     def _resolve_and_merge_input_items(
+         self,
+         input_items: list[dict[str, Any]],
+         sort: Optional["SORTING_ALGORITHMS_LITERAL"] = None
+     ) -> tuple[dict[str, Any], list[dict[str, Any]]]:
+         """
+         This function takes the input items given to download_dids etc.
+         and resolves the sources.
+
+         - It first performs a list_dids call to dereference any wildcards and
+           retrieve DID stats (size, length, type).
+         - Next, input items are grouped together by common list_replicas options.
+           For each group, a single list_replicas call is performed.
+         - The resolved file DIDs with sources are finally mapped back to the initial
+           input items to be able to correctly retrieve download options
+           (timeout, destination directories, etc.)
+
+         :param input_items: List of dictionaries. Each dictionary describes an input item
+
+         :returns: a tuple:
+             - a dictionary that maps the dereferenced (without wildcards) input DIDs to a list of input items
+             - and a list with a dictionary for each file DID which has to be downloaded
+
+         :raises InputValidationError: if one of the input items is in the wrong format
+         """
+         logger = self.logger
+
+         # check mandatory options before doing any server calls
+         resolve_archives = False
+         for item in input_items:
+             if item.get('resolve_archives') is not None:
+                 logger(logging.WARNING, 'resolve_archives option is deprecated and will be removed in a future release.')
+                 item.setdefault('no_resolve_archives', not item.pop('resolve_archives'))
+
+             # if any item needs to resolve archives
+             if not item.get('no_resolve_archives'):
+                 resolve_archives = True
+
+             if not item.get('did'):
+                 if not item.get('filters', {}).get('scope'):
+                     logger(logging.DEBUG, item)
+                     raise InputValidationError('Item without did and filter/scope')
+         if resolve_archives:
+             # perhaps we'll need an extraction tool, so check what is installed
+             self.extraction_tools = [tool for tool in self.extraction_tools if tool.is_useable()]
+             if len(self.extraction_tools) < 1:
+                 logger(logging.WARNING, 'Archive resolution is enabled but no extraction tool is available. '
+                                         'Sources whose protocol does not support extraction will not be considered for download.')
+
+         # if excluding tapes, we need to list them first
+         tape_rses = []
+         if self.is_tape_excluded:
+             try:
+                 tape_rses = [endp['rse'] for endp in self.client.list_rses(rse_expression='istape=true')]
+             except Exception:
+                 logger(logging.DEBUG, 'No tapes found.')
+
+         # matches each dereferenced DID back to a list of input items
+         did_to_input_items = {}
+
+         # resolve DIDs
+         for item in input_items:
+             resolved_dids = list(self._resolve_one_item_dids(item))
+             if not resolved_dids:
+                 logger(logging.WARNING, 'An item did not have any DIDs after resolving the input: %s.' % item.get('did', item))
+             item['dids'] = resolved_dids
+             for did in resolved_dids:
+                 did_to_input_items.setdefault(DID(did), []).append(item)
+
+                 if 'CONTAINER' in did.get('did_type', '').upper() or ('length' in did and not did['length']):
+                     did_with_size = self.client.get_did(scope=did['scope'], name=did['name'], dynamic_depth='FILE')
+                     did['length'] = did_with_size['length']
+                     did['bytes'] = did_with_size['bytes']
+
+         # group input items by common options to reduce the number of calls to list_replicas
+         distinct_keys = ['rse', 'force_scheme', 'no_resolve_archives']
+         item_groups = []
+         for item in input_items:
+             found_compatible_group = False
+             if not item.get('nrandom'):
+                 # Don't merge items if nrandom is set. Otherwise two items with the same nrandom will be merged into one
+                 # and we'll effectively download only half of the desired replicas for each item.
+                 for item_group in item_groups:
+                     if all(item.get(k) == item_group[0].get(k) for k in distinct_keys):
+                         item_group.append(item)
+                         found_compatible_group = True
1230
+ break
1231
+ if not found_compatible_group:
1232
+ item_groups.append([item])
1233
+
1234
+ # List replicas for dids
1235
+ merged_items_with_sources = []
1236
+ for item_group in item_groups:
1237
+ # Take configuration from the first item in the group; but dids from all items
1238
+ item = item_group[0]
1239
+ input_dids = {DID(did): did
1240
+ for item in item_group
1241
+ for did in item.get('dids')}
1242
+
1243
+ # since we're using metalink we need to explicitly give all schemes
1244
+ schemes = item.get('force_scheme')
1245
+ if schemes:
1246
+ schemes = schemes if isinstance(schemes, list) else [schemes]
1247
+ logger(logging.DEBUG, 'schemes: %s' % schemes)
1248
+
1249
+ # RSE expression, still with tape endpoints included
1250
+ rse_expression = item.get('rse')
1251
+ logger(logging.DEBUG, 'rse_expression: %s' % rse_expression)
1252
+
1253
+ # obtaining the choice of Implementation
1254
+ impl = item.get('impl')
1255
+ if impl:
1256
+ impl_split = impl.split('.')
1257
+ if len(impl_split) == 1:
1258
+ impl = 'rucio.rse.protocols.' + impl + '.Default'
1259
+ else:
1260
+ impl = 'rucio.rse.protocols.' + impl
1261
+ logger(logging.DEBUG, 'impl: %s' % impl)
1262
+
1263
+ # get PFNs of files and datasets
1264
+ logger(logging.DEBUG, 'num DIDs for list_replicas call: %d' % len(item['dids']))
1265
+
1266
+ nrandom = item.get('nrandom')
1267
+ if nrandom:
1268
+ logger(logging.INFO, 'Selecting %d random replicas from DID(s): %s' % (nrandom, [str(did) for did in input_dids]))
1269
+
1270
+ metalink_str = self.client.list_replicas([{'scope': did.scope, 'name': did.name} for did in input_dids],
1271
+ schemes=schemes,
1272
+ ignore_availability=False,
1273
+ rse_expression=rse_expression,
1274
+ client_location=self.client_location,
1275
+ sort=sort,
1276
+ resolve_archives=not item.get('no_resolve_archives'),
1277
+ resolve_parents=True,
1278
+ nrandom=nrandom,
1279
+ metalink=True)
1280
+ file_items = parse_replicas_from_string(metalink_str) # type: ignore
1281
+ for file in file_items:
1282
+ if impl:
1283
+ file['impl'] = impl
1284
+ elif not item.get('force_scheme'):
1285
+ file['impl'] = self.preferred_impl(file['sources'])
1286
+
1287
+ logger(logging.DEBUG, 'num resolved files: %s' % len(file_items))
1288
+
1289
+ if not nrandom or nrandom != len(file_items):
1290
+ # If list_replicas didn't resolve any file DIDs for any input did, we pass through the input DID.
1291
+ # This is done to keep compatibility with later code which generates "FILE_NOT_FOUND" traces
1292
+ # and output items.
1293
+ # In the special case of nrandom, when serverside filtering is applied, it's "normal" for some input
1294
+ # dids to be ignored as long as we got exactly nrandom file_items from the server.
1295
+ for input_did in input_dids:
1296
+ if not any([input_did == f['did'] or str(input_did) in f['parent_dids'] for f in file_items]):
1297
+ logger(logging.ERROR, 'DID does not exist: %s' % input_did)
1298
+ # TODO: store did directly as DIDType object
1299
+ file_items.append({'did': str(input_did), 'adler32': None, 'md5': None, 'sources': [], 'parent_dids': set(), 'impl': impl or None})
1300
+
1301
+ # filtering out tape sources
1302
+ if self.is_tape_excluded:
1303
+ for file_item in file_items:
1304
+ unfiltered_sources = copy.copy(file_item['sources'])
1305
+ for src in unfiltered_sources:
1306
+ if src['rse'] in tape_rses:
1307
+ file_item['sources'].remove(src)
1308
+ if unfiltered_sources and not file_item['sources']:
1309
+ logger(logging.WARNING, 'The requested DID {} only has replicas on tape. Direct download from tape is prohibited. '
1310
+ 'Please request a transfer to a non-tape endpoint.'.format(file_item['did']))
1311
+
1312
+ # Match the file did back to the dids which were provided to list_replicas.
1313
+ # Later, this will allow to match the file back to input_items via did_to_input_items
1314
+ for file_item in file_items:
1315
+ file_did = DID(file_item['did'])
1316
+
1317
+ file_input_dids = {DID(did) for did in file_item.get('parent_dids', [])}.intersection(input_dids)
1318
+ if file_did in input_dids:
1319
+ file_input_dids.add(file_did)
1320
+ file_item['input_dids'] = {did: input_dids[did] for did in file_input_dids}
1321
+ merged_items_with_sources.extend(file_items)
1322
+
1323
+ return did_to_input_items, merged_items_with_sources
1324
+
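+    # Sketch of the grouping step above (made-up items): items sharing the same
+    # 'rse', 'force_scheme' and 'no_resolve_archives' values end up in one group,
+    # and therefore in a single list_replicas call; items with nrandom set are
+    # never merged:
+    #
+    #     items = [{'did': 'scope:a', 'rse': 'SITE_A'},
+    #              {'did': 'scope:b', 'rse': 'SITE_A'},   # merged with the first item
+    #              {'did': 'scope:c', 'rse': 'SITE_B'}]   # separate group, separate call
+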
+    def _options_from_input_items(self, input_items: "Iterable[dict[str, Any]]") -> dict[str, Any]:
+        """
+        Best-effort generation of download options from multiple input items which resolve to the same file DID.
+        This is done to download each file DID only once, even if it is requested multiple times via overlapping
+        datasets and/or wildcard resolutions in distinct input items.
+
+        Some options can be easily merged. For example, multiple base_dir values are all collected into a set of
+        destinations. As a result, the file is downloaded once and copied to all desired destinations.
+        Other options are not necessarily compatible. For example, two items requesting two different values for
+        the download timeout. We do our best to merge the options in such cases.
+        """
+        options = {}
+        for item in input_items:
+            base_dir = item.get('base_dir', '.')
+            no_subdir = item.get('no_subdir', False)
+            new_transfer_timeout = item.get('transfer_timeout', None)
+            new_transfer_speed_timeout = item.get('transfer_speed_timeout', None)
+
+            options.setdefault('destinations', set()).add((base_dir, no_subdir))
+
+            # Merge some options.
+            # The other options of this DID will be inherited from the first item that contained the DID.
+            options['ignore_checksum'] = options.get('ignore_checksum') or item.get('ignore_checksum', False)
+            options['check_local_with_filesize_only'] = options.get('check_local_with_filesize_only') or item.get('check_local_with_filesize_only', False)
+
+            # if one item wants to resolve archives we enable it for all items
+            options['resolve_archives'] = (options.get('resolve_archives') or not item.get('no_resolve_archives'))
+
+            # keep the maximum of all requested transfer timeouts
+            cur_transfer_timeout = options.setdefault('transfer_timeout', None)
+            if cur_transfer_timeout is not None and new_transfer_timeout is not None:
+                options['transfer_timeout'] = max(int(cur_transfer_timeout), int(new_transfer_timeout))
+            elif new_transfer_timeout is not None:
+                options['transfer_timeout'] = int(new_transfer_timeout)
+
+            # keep the minimum of all requested transfer speed timeouts
+            cur_transfer_speed_timeout = options.setdefault('transfer_speed_timeout', None)
+            if cur_transfer_speed_timeout is not None and new_transfer_speed_timeout is not None:
+                options['transfer_speed_timeout'] = min(float(cur_transfer_speed_timeout), float(new_transfer_speed_timeout))
+            elif new_transfer_speed_timeout is not None:
+                options['transfer_speed_timeout'] = float(new_transfer_speed_timeout)
+        return options
+
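+    # Worked example of the merge rules above (hypothetical values): timeouts are
+    # merged pessimistically, i.e. the longest transfer_timeout and the strictest
+    # (smallest) transfer_speed_timeout win:
+    #
+    #     item_a = {'transfer_timeout': 120, 'transfer_speed_timeout': 500}
+    #     item_b = {'transfer_timeout': 300, 'transfer_speed_timeout': 100}
+    #     # -> merged: {'transfer_timeout': 300, 'transfer_speed_timeout': 100.0, ...}
+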
+    def _prepare_items_for_download(
+            self,
+            did_to_input_items: dict[str, Any],
+            file_items: list[dict[str, Any]]
+    ) -> list[dict[str, Any]]:
+        """
+        Optimises the number of files to download.
+        (This function is meant to be used as class internal only)
+
+        :param did_to_input_items: dictionary that maps resolved input DIDs to input items
+        :param file_items: list of dictionaries; each dictionary describes a file DID to download
+
+        :returns: list of dictionaries; each dictionary describes an element to download
+
+        :raises InputValidationError: if the given input is not valid or incomplete
+        """
+        logger = self.logger
+
+        # maps file item IDs (fiid) to the file item object
+        fiid_to_file_item = {}
+
+        # cea -> client_extract archives, to avoid confusion with archives that don't need explicit extraction.
+        # This dict will contain all IDs of cea's that definitely will be downloaded.
+        cea_id_pure_to_fiids = {}
+
+        # this dict will contain the IDs of cea's that have higher-prioritised non-cea sources
+        cea_id_mixed_to_fiids = {}
+
+        all_dest_file_paths = set()
+
+        # get replicas for every file of the given DIDs
+        for file_item in file_items:
+            file_did = DID(file_item['did'])
+            input_items = list(itertools.chain.from_iterable(did_to_input_items.get(did, []) for did in file_item['input_dids']))
+            options = self._options_from_input_items(input_items)
+
+            file_item['scope'] = file_did.scope
+            file_item['name'] = file_did.name
+
+            logger(logging.DEBUG, 'Queueing file: %s' % file_did)
+            logger(logging.DEBUG, 'real parents: %s' % [str(did) for did in file_item['input_dids'] if did != file_did])
+            logger(logging.DEBUG, 'options: %s' % options)
+
+            # prepare destination folders:
+            dest_file_paths = file_item.get('dest_file_paths', set())
+            for input_did in file_item['input_dids']:
+                for item in did_to_input_items[input_did]:
+                    base_dir = item.get('base_dir', '.')
+                    no_subdir = item.get('no_subdir', False)
+                    file_did_path = file_did.name
+                    if input_did != file_did:
+                        # if datasets were given: prepare the destination paths for each dataset
+                        if self.extract_scope_convention == 'belleii' and file_did_path.startswith('/'):
+                            file_did_path = file_did_path.split('/')[-1]
+                        path = os.path.join(self._prepare_dest_dir(base_dir, input_did.name, no_subdir), file_did_path)
+                    else:
+                        # if no datasets were given, only prepare the given destination paths
+                        if file_did_path.startswith('/'):
+                            file_did_path = file_did_path[1:]
+                        path = os.path.join(self._prepare_dest_dir(base_dir, file_did.scope, no_subdir), file_did_path)
+
+                    if path in all_dest_file_paths:
+                        raise RucioException("Multiple file items with same destination file path")
+
+                    all_dest_file_paths.add(path)
+                    dest_file_paths.add(path)
+
+                    # workaround: just take any given dataset for the traces and the output
+                    file_item.setdefault('dataset_scope', input_did.scope)
+                    file_item.setdefault('dataset_name', input_did.name)
+
+            if not options:
+                continue
+            resolve_archives = options.get('resolve_archives')
+            file_item['merged_options'] = options
+            file_item['dest_file_paths'] = list(dest_file_paths)
+            file_item['temp_file_path'] = '%s.part' % file_item['dest_file_paths'][0]
+
+            # The file DID string is not a unique key for this dict, because multiple calls of list_replicas
+            # could result in the same DID multiple times. So we're using the id of the dictionary objects.
+            fiid = id(file_item)
+            fiid_to_file_item[fiid] = file_item
+
+            if resolve_archives:
+                min_cea_priority = None
+                num_non_cea_sources = 0
+                cea_ids = []
+                sources = []
+                # go through the sources and check how many (non-)cea sources there are,
+                # index cea sources, or remove cea sources if there is no extraction tool
+                for source in file_item['sources']:
+                    is_cea = source.get('client_extract', False)
+                    if is_cea and (len(self.extraction_tools) > 0):
+                        priority = int(source['priority'])
+                        if min_cea_priority is None or priority < min_cea_priority:
+                            min_cea_priority = priority
+
+                        # Workaround: since we don't have the archive DID, use the part behind the last slash of the PFN.
+                        # This doesn't respect the scope of the archive DID!!!
+                        # And we trust that client_extract==True sources don't have any parameters at the end of the PFN.
+                        cea_id = source['pfn'].split('/')
+                        cea_id = cea_id[-1] if len(cea_id[-1]) > 0 else cea_id[-2]
+                        cea_ids.append(cea_id)
+
+                        sources.append(source)
+                    elif not is_cea:
+                        num_non_cea_sources += 1
+                        sources.append(source)
+                    else:
+                        # no extraction tool
+                        logger(logging.DEBUG, 'client_extract=True; ignoring source: %s' % source['pfn'])
+
+                logger(logging.DEBUG, 'Prepared sources: num_sources=%d/%d; num_non_cea_sources=%d; num_cea_ids=%d'
+                                      % (len(sources), len(file_item['sources']), num_non_cea_sources, len(cea_ids)))
+
+                file_item['sources'] = sources
+
+                # if there are no cea sources we are done for this item
+                if min_cea_priority is None:
+                    continue
+                # Decide if the file item belongs to the pure or the mixed map:
+                # if no non-archive source exists, or the highest-priority source is an archive source, we put it in the pure map
+                elif num_non_cea_sources == 0 or min_cea_priority == 1:
+                    logger(logging.DEBUG, 'Adding fiid to cea pure map: '
+                                          'num_non_cea_sources=%d; min_cea_priority=%d; num_cea_sources=%d'
+                                          % (num_non_cea_sources, min_cea_priority, len(cea_ids)))
+                    for cea_id in cea_ids:
+                        cea_id_pure_to_fiids.setdefault(cea_id, set()).add(fiid)
+                        file_item.setdefault('cea_ids_pure', set()).add(cea_id)
+                # if there are both non-archive sources and archive sources we put it in the mixed map
+                elif len(cea_ids) > 0:
+                    logger(logging.DEBUG, 'Adding fiid to cea mixed map: '
+                                          'num_non_cea_sources=%d; min_cea_priority=%d; num_cea_sources=%d'
+                                          % (num_non_cea_sources, min_cea_priority, len(cea_ids)))
+                    for cea_id in cea_ids:
+                        cea_id_mixed_to_fiids.setdefault(cea_id, set()).add(fiid)
+                        file_item.setdefault('cea_ids_mixed', set()).add(cea_id)
+
+        # Put all archives from the mixed list into the pure list if they meet
+        # certain conditions, e.g., an archive that is already in the pure list.
+        for cea_id_mixed in list(cea_id_mixed_to_fiids.keys()):
+            fiids_mixed = cea_id_mixed_to_fiids[cea_id_mixed]
+            if cea_id_mixed in cea_id_pure_to_fiids:
+                # the file from the mixed list is already in a pure list
+                logger(logging.DEBUG, 'Mixed ID is already in cea pure map: '
+                                      'cea_id_mixed=%s; num_fiids_mixed=%d; num_cea_pure_fiids=%d'
+                                      % (cea_id_mixed, len(fiids_mixed), len(cea_id_pure_to_fiids[cea_id_mixed])))
+            elif len(fiids_mixed) >= self.use_cea_threshold:
+                # more than use_cea_threshold files are in a common archive
+                logger(logging.DEBUG, 'Number of needed files in cea reached threshold: '
+                                      'cea_id_mixed=%s; num_fiids_mixed=%d; threshold=%d'
+                                      % (cea_id_mixed, len(fiids_mixed), self.use_cea_threshold))
+            else:
+                # don't move from the mixed list to the pure list
+                continue
+
+            # first add cea_id to the pure map so it can be removed from the mixed map later
+            cea_id_pure_to_fiids.setdefault(cea_id_mixed, set()).update(fiids_mixed)
+
+            # now update all file_item mixed/pure maps
+            for fiid_mixed in list(fiids_mixed):
+                file_item = fiid_to_file_item[fiid_mixed]
+                # add the cea id to the file_item pure map
+                file_item.setdefault('cea_ids_pure', set()).add(cea_id_mixed)
+
+                # remove the file item mixed map and
+                # remove references from all other mixed archives to file_item
+                for cea_id_mixed2 in file_item.pop('cea_ids_mixed'):
+                    cea_id_mixed_to_fiids[cea_id_mixed2].remove(fiid_mixed)
+
+            # finally remove cea_id from the mixed map
+            cea_id_mixed_to_fiids.pop(cea_id_mixed)
+
+        for file_item in file_items:
+            cea_ids_pure = file_item.get('cea_ids_pure', set())
+            cea_ids_mixed = file_item.get('cea_ids_mixed', set())
+
+            if len(cea_ids_pure) > 0:
+                logger(logging.DEBUG, 'Removing all non-cea sources of file %s' % file_item['did'])
+                file_item['sources'] = [s for s in file_item['sources'] if s.get('client_extract', False)]
+            elif len(cea_ids_mixed) > 0:
+                logger(logging.DEBUG, 'Removing all cea sources of file %s' % file_item['did'])
+                file_item['sources'] = [s for s in file_item['sources'] if not s.get('client_extract', False)]
+
+        # Reduce the number of archives to download by removing
+        # all redundant pure archives (= all files can be extracted from other archives).
+        for cea_id_pure in list(cea_id_pure_to_fiids.keys()):
+            # if all files of this archive are available in more than one archive, the archive is redundant
+            if all(len(fiid_to_file_item[fiid_pure]['cea_ids_pure']) > 1 for fiid_pure in cea_id_pure_to_fiids[cea_id_pure]):
+                for fiid_pure in cea_id_pure_to_fiids[cea_id_pure]:
+                    fiid_to_file_item[fiid_pure]['cea_ids_pure'].discard(cea_id_pure)
+                logger(logging.DEBUG, 'Removing redundant archive %s' % cea_id_pure)
+                cea_id_pure_to_fiids.pop(cea_id_pure)
+
+        # Remove all archives of a file except a single one, so
+        # that each file is assigned to exactly one pure archive.
+        for cea_id_pure in cea_id_pure_to_fiids:
+            for fiid_pure in cea_id_pure_to_fiids[cea_id_pure]:
+                cea_ids_pure = fiid_to_file_item[fiid_pure]['cea_ids_pure']
+                for cea_id_pure_other in list(cea_ids_pure):
+                    if cea_id_pure != cea_id_pure_other:
+                        cea_id_pure_to_fiids[cea_id_pure_other].discard(fiid_pure)
+                        cea_ids_pure.discard(cea_id_pure_other)
+
+        download_packs = []
+        cea_id_to_pack = {}
+        for file_item in file_items:
+            cea_ids = file_item.get('cea_ids_pure', set())
+            if len(cea_ids) > 0:
+                cea_id = next(iter(cea_ids))
+                pack = cea_id_to_pack.get(cea_id)
+                if pack is None:
+                    scope = file_item['scope']
+                    first_dest = next(iter(file_item['merged_options']['destinations']))
+                    dest_path = os.path.join(self._prepare_dest_dir(first_dest[0], scope, first_dest[1]), cea_id)
+                    pack = {'scope': scope,
+                            'name': cea_id,
+                            'dest_file_paths': [dest_path],
+                            'temp_file_path': '%s.part' % dest_path,
+                            'sources': file_item['sources'],
+                            'merged_options': {'ignore_checksum': True},  # we currently don't have checksums for the archive
+                            'archive_items': []
+                            }
+                    cea_id_to_pack[cea_id] = pack
+                    download_packs.append(pack)
+                file_item.pop('sources')
+                pack['archive_items'].append(file_item)
+            else:
+                download_packs.append(file_item)
+        return download_packs
+
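+    # Rough sketch of the archive ("cea") packing above, with made-up values: if
+    # several requested files can only (or preferably) be read from the archive
+    # 'logs.tar', a single pack is downloaded once and the files are extracted
+    # from it, instead of fetching each file separately:
+    #
+    #     pack = {'name': 'logs.tar',
+    #             'merged_options': {'ignore_checksum': True},
+    #             'archive_items': [file_a, file_b, file_c], ...}
+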
+    def _split_did_str(self, did_str: str) -> tuple[str, str]:
+        """
+        Splits a given DID string (e.g. 'scope1:name.file') into its scope and name parts.
+        (This function is meant to be used as class internal only)
+
+        :param did_str: the DID string that will be split
+
+        :returns: the scope and name parts of the given DID
+
+        :raises InputValidationError: if the given DID string is not valid
+        """
+        did = did_str.split(':')
+        if len(did) == 2:
+            did_scope = did[0]
+            did_name = did[1]
+        elif len(did) == 1:
+            if self.extract_scope_convention == 'belleii':
+                scopes = [scope for scope in self.client.list_scopes()]
+                did_scope, did_name = extract_scope(did[0], scopes)
+            else:
+                did = did_str.split('.')
+                did_scope = did[0]
+                if did_scope == 'user' or did_scope == 'group':
+                    did_scope = '%s.%s' % (did[0], did[1])
+                did_name = did_str
+        else:
+            raise InputValidationError('%s is not a valid DID. Too many colons.' % did_str)
+
+        if did_name.endswith('/'):
+            did_name = did_name[:-1]
+
+        return did_scope, did_name
+
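+    # Illustrative inputs and outputs for _split_did_str (hypothetical names):
+    #
+    #     'scope1:name.file'     -> ('scope1', 'name.file')
+    #     'user.jdoe.test.file'  -> ('user.jdoe', 'user.jdoe.test.file')  # scope derived from the name
+    #     'a:b:c'                -> raises InputValidationError (too many colons)
+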
+    def _prepare_dest_dir(
+            self,
+            base_dir: str,
+            dest_dir_name: str,
+            no_subdir: Optional[bool]
+    ) -> str:
+        """
+        Builds the final destination path for a file and creates the
+        destination directory if it doesn't exist.
+        (This function is meant to be used as class internal only)
+
+        :param base_dir: base directory part
+        :param dest_dir_name: name of the destination directory
+        :param no_subdir: if no subdirectory should be created
+
+        :returns: the absolute path of the destination directory
+        """
+        # append dest_dir_name if a subdirectory should be used
+        if dest_dir_name.startswith('/'):
+            dest_dir_name = dest_dir_name[1:]
+        dest_dir_path = os.path.join(os.path.abspath(base_dir), '' if no_subdir else dest_dir_name)
+
+        if not os.path.isdir(dest_dir_path):
+            os.makedirs(dest_dir_path)
+
+        return dest_dir_path
+
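+    # Illustrative behaviour of _prepare_dest_dir (hypothetical paths):
+    #
+    #     _prepare_dest_dir('/data', 'user.jdoe', no_subdir=False)  # -> '/data/user.jdoe' (created if missing)
+    #     _prepare_dest_dir('/data', 'user.jdoe', no_subdir=True)   # -> '/data/' (no subdirectory is appended)
+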
+    def _check_output(
+            self,
+            output_items: list[dict[str, Any]],
+            deactivate_file_download_exceptions: bool = False
+    ) -> list[dict[str, Any]]:
+        """
+        Checks if all files were successfully downloaded.
+        (This function is meant to be used as class internal only)
+
+        :param output_items: list of dictionaries describing the downloaded files
+        :param deactivate_file_download_exceptions: boolean; if file download exceptions shouldn't be raised
+
+        :returns: output_items list
+
+        :raises NoFilesDownloaded: if none of the files were successfully downloaded
+        :raises NotAllFilesDownloaded: if at least one file failed to download
+        """
+        success_states = [FileDownloadState.ALREADY_DONE, FileDownloadState.DONE, FileDownloadState.FOUND_IN_PCACHE]
+        num_successful = 0
+        num_failed = 0
+        for item in output_items:
+            client_state = item.get('clientState', FileDownloadState.FAILED)
+            if client_state in success_states:
+                num_successful += 1
+            else:
+                num_failed += 1
+
+        if not deactivate_file_download_exceptions and num_successful == 0:
+            raise NoFilesDownloaded()
+        elif not deactivate_file_download_exceptions and num_failed > 0:
+            raise NotAllFilesDownloaded()
+
+        return output_items
+
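+    # Decision table for _check_output (with deactivate_file_download_exceptions=False;
+    # with True, the items are returned unconditionally):
+    #
+    #     all items successful       -> output_items returned
+    #     some successful, some not  -> NotAllFilesDownloaded raised
+    #     none successful            -> NoFilesDownloaded raised
+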
1691
+ def _send_trace(self, trace: dict[str, Any]) -> None:
1692
+ """
1693
+ Checks if sending trace is allowed and send the trace.
1694
+
1695
+ :param trace: the trace
1696
+ """
1697
+ if self.tracing:
1698
+ send_trace(trace, self.client.trace_host, self.client.user_agent)
1699
+
1700
+ def preferred_impl(self, sources: list[dict[str, Any]]) -> Optional[str]:
1701
+ """
1702
+ Finds the optimum protocol impl preferred by the client and
1703
+ supported by the remote RSE.
1704
+
1705
+ :param sources: List of sources for a given DID
1706
+
1707
+ :raises RucioException(msg): general exception with msg for more details.
1708
+ """
1709
+
1710
+ preferred_protocols = []
1711
+ checked_rses = []
1712
+ supported_impl = None
1713
+
1714
+ try:
1715
+ preferred_impls = config_get('download', 'preferred_impl')
1716
+ except Exception as error:
1717
+ self.logger(logging.INFO, 'No preferred protocol impl in rucio.cfg: %s' % (error))
1718
+ return supported_impl
1719
+ else:
1720
+ preferred_impls = list(preferred_impls.split(', '))
1721
+ i = 0
1722
+ while i < len(preferred_impls):
1723
+ impl = preferred_impls[i]
1724
+ impl_split = impl.split('.')
1725
+ if len(impl_split) == 1:
1726
+ preferred_impls[i] = 'rucio.rse.protocols.' + impl + '.Default'
1727
+ else:
1728
+ preferred_impls[i] = 'rucio.rse.protocols.' + impl
1729
+ i += 1
1730
+
1731
+ for source in sources:
1732
+ if source['rse'] in checked_rses:
1733
+ continue
1734
+ try:
1735
+ rse_settings = rsemgr.get_rse_info(source['rse'], vo=self.client.vo)
1736
+ checked_rses.append(str(source['rse']))
1737
+ except RucioException as error:
1738
+ self.logger(logging.DEBUG, 'Could not get info of RSE %s: %s' % (source['source'], error))
1739
+ continue
1740
+
1741
+ preferred_protocols = [protocol for protocol in reversed(rse_settings['protocols']) if protocol['impl'] in preferred_impls]
1742
+
1743
+ if len(preferred_protocols) == 0:
1744
+ continue
1745
+
1746
+ for protocol in preferred_protocols:
1747
+ if not protocol['domains']['wan'].get("read"):
1748
+ self.logger(logging.WARNING, 'Unsuitable protocol "%s": "WAN Read" operation is not supported' % (protocol['impl']))
1749
+ continue
1750
+ try:
1751
+ supported_protocol = rsemgr.create_protocol(rse_settings, 'read', impl=protocol['impl'], auth_token=self.auth_token, logger=self.logger)
1752
+ supported_protocol.connect()
1753
+ except Exception as error:
1754
+ self.logger(logging.WARNING, 'Failed to create protocol "%s", exception: %s' % (protocol['impl'], error))
1755
+ pass
1756
+ else:
1757
+ self.logger(logging.INFO, 'Preferred protocol impl supported locally and remotely: %s' % (protocol['impl']))
1758
+ supported_impl = protocol['impl']
1759
+ break
1760
+
1761
+ return supported_impl
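+
+    # The preferred implementations above come from the client's rucio.cfg.
+    # A minimal, hypothetical configuration (note the comma+space separator
+    # expected by the split(', ') above) could look like:
+    #
+    #     [download]
+    #     preferred_impl = xrootd, gfal
+    #
+    # Short names are expanded to e.g. 'rucio.rse.protocols.xrootd.Default'.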
+
+
+def _verify_checksum(
+        item: dict[str, Any],
+        path: str
+) -> tuple[bool, Optional[str], Optional[str]]:
+    """
+    Verifies the file at the given path against the checksums stored in the item,
+    preferring PREFERRED_CHECKSUM and falling back to any other globally
+    supported checksum.
+
+    :returns: a tuple of (match, Rucio checksum, locally computed checksum)
+    """
+    rucio_checksum = item.get(PREFERRED_CHECKSUM)
+    local_checksum = None
+    checksum_algo = CHECKSUM_ALGO_DICT.get(PREFERRED_CHECKSUM)
+
+    if rucio_checksum and checksum_algo:
+        local_checksum = checksum_algo(path)
+        return rucio_checksum == local_checksum, rucio_checksum, local_checksum
+
+    # fall back to any other globally supported checksum
+    for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
+        rucio_checksum = item.get(checksum_name)
+        checksum_algo = CHECKSUM_ALGO_DICT.get(checksum_name)
+        if rucio_checksum and checksum_algo:
+            local_checksum = checksum_algo(path)
+            return rucio_checksum == local_checksum, rucio_checksum, local_checksum
+
+    return False, None, None
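+
+
+# Illustrative call of _verify_checksum (hypothetical path and metadata); with
+# PREFERRED_CHECKSUM typically being 'adler32', an item carrying an 'adler32'
+# value would be verified roughly like this:
+#
+#     ok, expected, actual = _verify_checksum({'adler32': '01234567'}, '/tmp/file')
+#     # ok is True only if the locally computed adler32 equals '01234567'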