data-transfer-cli 0.3.6__tar.gz → 0.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: data-transfer-cli
- Version: 0.3.6
+ Version: 0.3.7
  Summary: HiDALGO Data Transfer CLI provides commands to transfer data between different data providers and consumers using NIFI pipelines
  License: APL-2.0
  Author: Jesús Gorroñogoitia
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
- Requires-Dist: hid_data_transfer_lib (>=0.3.4)
+ Requires-Dist: hid_data_transfer_lib (>=0.3.7)
  Requires-Dist: paramiko (>=3.3.1)
  Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
  Requires-Dist: requests (>=2.31.0)
@@ -209,6 +209,9 @@ options:
  [Optional] Password for HPC secret key
  -2fa, --two-factor-authentication
  [Optional] HPC requires 2FA authentication
+ -acct, --accounting [Optional] Enable returning accounting information of data transfer
+ -ct CONCURRENT_TASKS, --concurrent-tasks CONCURRENT_TASKS
+ [Optional] set the number of concurrent tasks for parallel data transfer
  ```
  
  A common command flow (e.g. transfer data from hdfs to hpc) would be like this:
@@ -216,6 +219,15 @@ A common command flow (e.g. transfer data from hdfs to hpc) would be like this:
  - execute *hdfs2hpc* CLI command to transfer data from an hdfs location (e.g. /users/yosu/data/genome-tags.csv) to a remote HPC (e.g. LUMI, at $HOME/data folder)
  - check status of the *hdfs2hpc* transfer (and possible warnings/errors) with the *check-status* CLI command
  
+ If the accounting report is enabled, the output of the command will include some transfer statistics:
+ ```
+ Data transfer report:
+ Transfer time: 21 s
+ Transfer size: 12.86 MB
+ Transfer rate: 0.61 MB/s
+ Number of transferred files: 1
+ ```
+
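As a side note, the figures in the sample report above are tied together by simple arithmetic (size in MB = total bytes / 1024², rate = size / time), which is the same computation the new accounting code further down in this diff performs. A minimal sketch with hypothetical stand-in values chosen to reproduce the sample numbers:

```python
# Hypothetical stand-in values; only the arithmetic mirrors the accounting code.
flowfiles_sizes = {"genome-tags.csv": 13_484_688}  # bytes (made up for illustration)
pipeline_timespan = 21                              # seconds

transfer_size = sum(flowfiles_sizes.values()) / (1024 * 1024)   # ~12.86 MB
transfer_rate = transfer_size / pipeline_timespan               # ~0.61 MB/s

print("Data transfer report:")
print(f"Transfer time: {pipeline_timespan} s")
print(f"Transfer size: {transfer_size:.2f} MB")
print(f"Transfer rate: {transfer_rate:.2f} MB/s")
print(f"Number of transferred files: {len(flowfiles_sizes)}")
```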
  ## Support for HPC clusters that require a 2FA token
  The Data Transfer CLI tool's commands support transferring data to/from HPC clusters that require a 2FA token. These commands offer an optional flag *_2fa*. If set by the user, the command prompts the user (in the standard input) for the token when required.
  
@@ -248,6 +260,7 @@ Note: Hidalgo2 HPDA configuration is included in the Data Transfer CLI tool impl
  
  Then, when you launch a Data Transfer CLI tool command, any parameter not included in the command line will be retrieved from the config file if the corresponding host entry is included. After that, if the command line is complete (i.e. all required parameters are provided), the command will be executed; otherwise, the corresponding error will be triggered.
  
- 
+ ## Data transfer optimization
+ You can improve the data transfer rate by setting the optional parameter *-ct|--concurrent-tasks* (*integer*) to the number of concurrent tasks that will be used in the NIFI pipeline (default is 1). The maximum number of tasks that improves the transfer throughput depends on the physical resources of the NIFI server (consult its administrator). The parallel transfer is currently supported to/from HPC and HDFS data servers, but not to/from CKAN (under development).
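As an illustration of the two options described above, the following stand-alone argparse sketch mirrors the `add_argument` calls added to each transfer sub-command later in this diff; the program name and the example argument values are hypothetical, not the package's actual CLI entry point.

```python
import argparse

# Hypothetical stand-alone parser mirroring the new per-command options.
parser = argparse.ArgumentParser(prog="hdfs2hpc-example")
parser.add_argument(
    "-acct", "--accounting",
    required=False, action="store_true", default=False,
    help="[Optional] Enable returning accounting information of data transfer",
)
parser.add_argument(
    "-ct", "--concurrent-tasks",
    required=False, type=int, default=1,
    help="[Optional] set the number of concurrent tasks for parallel data transfer",
)

args = parser.parse_args(["-acct", "-ct", "4"])
print(args.accounting)        # True -> a data transfer report is printed at the end
print(args.concurrent_tasks)  # 4    -> request 4 concurrent tasks in the NIFI pipeline
```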
 
 
@@ -190,6 +190,9 @@ options:
  [Optional] Password for HPC secret key
  -2fa, --two-factor-authentication
  [Optional] HPC requires 2FA authentication
+ -acct, --accounting [Optional] Enable returning accounting information of data transfer
+ -ct CONCURRENT_TASKS, --concurrent-tasks CONCURRENT_TASKS
+ [Optional] set the number of concurrent tasks for parallel data transfer
  ```
  
  A common command flow (e.g. transfer data from hdfs to hpc) would be like this:
@@ -197,6 +200,15 @@ A common command flow (e.g. transfer data from hdfs to hpc) would be like this:
  - execute *hdfs2hpc* CLI command to transfer data from an hdfs location (e.g. /users/yosu/data/genome-tags.csv) to a remote HPC (e.g. LUMI, at $HOME/data folder)
  - check status of the *hdfs2hpc* transfer (and possible warnings/errors) with the *check-status* CLI command
  
+ If the accounting report is enabled, the output of the command will include some transfer statistics:
+ ```
+ Data transfer report:
+ Transfer time: 21 s
+ Transfer size: 12.86 MB
+ Transfer rate: 0.61 MB/s
+ Number of transferred files: 1
+ ```
+
  ## Support for HPC clusters that require a 2FA token
  The Data Transfer CLI tool's commands support transferring data to/from HPC clusters that require a 2FA token. These commands offer an optional flag *_2fa*. If set by the user, the command prompts the user (in the standard input) for the token when required.
  
@@ -229,5 +241,6 @@ Note: Hidalgo2 HPDA configuration is included in the Data Transfer CLI tool impl
  
  Then, when you launch a Data Transfer CLI tool command, any parameter not included in the command line will be retrieved from the config file if the corresponding host entry is included. After that, if the command line is complete (i.e. all required parameters are provided), the command will be executed; otherwise, the corresponding error will be triggered.
  
- 
+ ## Data transfer optimization
+ You can improve the data transfer rate by setting the optional parameter *-ct|--concurrent-tasks* (*integer*) to the number of concurrent tasks that will be used in the NIFI pipeline (default is 1). The maximum number of tasks that improves the transfer throughput depends on the physical resources of the NIFI server (consult its administrator). The parallel transfer is currently supported to/from HPC and HDFS data servers, but not to/from CKAN (under development).
  
@@ -122,11 +122,28 @@ class ThreadRaisingExceptions(threading.Thread):
  def __init__(self, *args, **kwargs):
  self._exception = None
  self._process_group_id = None
+ self.accounting = kwargs['args'][0].accounting \
+ if 'args' in kwargs and kwargs['args'] else False
  super().__init__(*args, **kwargs)
  
  def run(self):
  try:
- self._process_group_id = self._target(*self._args, **self._kwargs)
+ self._process_group_id, accounting_info = \
+ self._target(*self._args, **self._kwargs)
+ if self.accounting: # Report accounting information
+ transfer_time = accounting_info.pipeline_timespan
+ number_transfer_files = len(
+ accounting_info.flowfiles_sizes)
+ transfer_size = sum(
+ accounting_info.flowfiles_sizes.values())/(1024*1024)
+ transfer_rate = transfer_size / transfer_time
+
+ msg = "Data transfer report:\n"
+ msg += f"Transfer time: {transfer_time} s\n"
+ msg += f"Transfer size: {transfer_size:.2f} MB\n"
+ msg += f"Transfer rate: {transfer_rate:.2f} MB/s\n"
+ msg += f"Number of transferred files: {number_transfer_files}\n"
+ print(msg)
  except HidDataTransferException as e:
  self._exception = e
  raise e
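The `run()` change above assumes that every wrapped target now returns a `(process_group_id, accounting_info)` pair, and `__init__` expects the first element of the `args` tuple to carry an `accounting` attribute (presumably the parsed CLI namespace). A minimal sketch of that contract using only hypothetical stand-ins, nothing imported from the package:

```python
import argparse

def fake_transfer(cli_args: argparse.Namespace) -> tuple[str, object]:
    # Hypothetical stand-in for a transfer command: it must now return a
    # (process_group_id, accounting_info) pair rather than a bare id.
    return "pg-1234", None

# The parsed namespace carries the `accounting` flag that __init__ above reads
# via kwargs['args'][0].accounting when the thread is created, e.g.
#   ThreadRaisingExceptions(target=fake_transfer, args=(cli_args,))
cli_args = argparse.Namespace(accounting=True, concurrent_tasks=4)

process_group_id, accounting_info = fake_transfer(cli_args)
print(process_group_id, cli_args.accounting)  # pg-1234 True
```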
@@ -20,7 +20,7 @@ and run processors in a process group.
  
  from hid_data_transfer_lib.exceptions.hid_dt_exceptions import HidDataTransferException
  from hid_data_transfer_lib.conf.hid_dt_configuration import HidDataTransferConfiguration
- from hid_data_transfer_lib.hid_dt_lib import HIDDataTransfer
+ from hid_data_transfer_lib.hid_dt_lib import HIDDataTransfer, AccountingInfo
  
  
  class DataTransferProxy:
@@ -82,7 +82,7 @@ class DataTransferProxy:
  
  # MAIN CLI commands
  
- def hdfs2hpc(self, args) -> str:
+ def hdfs2hpc(self, args) -> tuple[str, AccountingInfo]:
  """transfer data from HDFS to hpc using SFTP"""
  self.__logger.info(
  "executing hdfs2hpc command with args: %s", self.format_args_to_string(args)
@@ -100,6 +100,7 @@ class DataTransferProxy:
  data_target=args.data_target,
  kerberos_principal=args.kerberos_principal,
  kerberos_password=args.kerberos_password,
+ concurrent_tasks=args.concurrent_tasks,
  )
  return self.dt_client.hdfs2hpc(
  hpc_host=args.hpc_host,
@@ -112,12 +113,13 @@ class DataTransferProxy:
  data_target=args.data_target,
  kerberos_principal=args.kerberos_principal,
  kerberos_password=args.kerberos_password,
+ concurrent_tasks=args.concurrent_tasks,
  )
  
  except Exception as ex:
  raise HidDataTransferException(ex) from ex
  
- def hpc2hdfs(self, args) -> str:
+ def hpc2hdfs(self, args) -> tuple[str, AccountingInfo]:
  """transfer data from HPC to hdfs using SFTP"""
  self.__logger.info(
  "executing hpc2hdfs command with args: %s", self.format_args_to_string(args)
@@ -135,6 +137,7 @@ class DataTransferProxy:
  data_target=args.data_target,
  kerberos_principal=args.kerberos_principal,
  kerberos_password=args.kerberos_password,
+ concurrent_tasks=args.concurrent_tasks,
  )
  return self.dt_client.hpc2hdfs(
  hpc_host=args.hpc_host,
@@ -147,12 +150,13 @@ class DataTransferProxy:
  data_target=args.data_target,
  kerberos_principal=args.kerberos_principal,
  kerberos_password=args.kerberos_password,
+ concurrent_tasks=args.concurrent_tasks,
  )
  
  except Exception as ex:
  raise HidDataTransferException(ex) from ex
  
- def hdfs2ckan(self, args) -> str:
+ def hdfs2ckan(self, args) -> tuple[str, AccountingInfo]:
  """transfer data from HDFS to CKAN using SFTP"""
  self.__logger.info(
  "executing hpc2ckan command with args: %s", self.format_args_to_string(args)
@@ -167,12 +171,13 @@ class DataTransferProxy:
  data_source=args.data_source,
  kerberos_principal=args.kerberos_principal,
  kerberos_password=args.kerberos_password,
+ concurrent_tasks=args.concurrent_tasks,
  )
  
  except Exception as ex:
  raise HidDataTransferException(ex) from ex
  
- def ckan2hdfs(self, args) -> str:
+ def ckan2hdfs(self, args) -> tuple[str, AccountingInfo]:
  """transfer data from CKAN to HPC using SFTP"""
  self.__logger.info(
  "executing ckan2hpc command with args: %s", self.format_args_to_string(args)
@@ -188,12 +193,13 @@ class DataTransferProxy:
  data_target=args.data_target,
  kerberos_principal=args.kerberos_principal,
  kerberos_password=args.kerberos_password,
+ concurrent_tasks=args.concurrent_tasks,
  )
  
  except Exception as ex:
  raise HidDataTransferException(ex) from ex
  
- def hpc2ckan(self, args) -> str:
+ def hpc2ckan(self, args) -> tuple[str, AccountingInfo]:
  """transfer data from hpc to CKAN using SFTP"""
  self.__logger.info(
  "executing hpc2ckan command with args: %s", self.format_args_to_string(args)
@@ -208,12 +214,14 @@ class DataTransferProxy:
  ckan_api_key=args.ckan_api_key,
  ckan_organization=args.ckan_organization,
  ckan_dataset=args.ckan_dataset,
+ ckan_resource=args.ckan_resource,
  hpc_host=args.hpc_host,
  hpc_port=args.hpc_port,
  hpc_username=args.hpc_username,
  hpc_secret_key_path=args.hpc_secret_key,
  hpc_secret_key_password=args.hpc_secret_key_password,
  data_source=args.data_source,
+ concurrent_tasks=args.concurrent_tasks,
  )
  return self.dt_client.hpc2ckan(
  ckan_host=args.ckan_host
@@ -221,6 +229,7 @@ class DataTransferProxy:
  ckan_api_key=args.ckan_api_key,
  ckan_organization=args.ckan_organization,
  ckan_dataset=args.ckan_dataset,
+ ckan_resource=args.ckan_resource,
  hpc_host=args.hpc_host,
  hpc_port=args.hpc_port,
  hpc_username=args.hpc_username,
@@ -228,12 +237,13 @@ class DataTransferProxy:
  hpc_secret_key_path=args.hpc_secret_key,
  hpc_secret_key_password=args.hpc_secret_key_password,
  data_source=args.data_source,
+ concurrent_tasks=args.concurrent_tasks,
  )
  
  except Exception as ex:
  raise HidDataTransferException(ex) from ex
  
- def ckan2hpc(self, args) -> str:
+ def ckan2hpc(self, args) -> tuple[str, AccountingInfo]:
  """transfer data from CKAN to hpc using SFTP"""
  self.__logger.info(
  "executing ckan2hpc command with args: %s", self.format_args_to_string(args)
@@ -255,6 +265,7 @@ class DataTransferProxy:
  hpc_secret_key_path=args.hpc_secret_key,
  hpc_secret_key_password=args.hpc_secret_key_password,
  data_target=args.data_target,
+ concurrent_tasks=args.concurrent_tasks,
  )
  return self.dt_client.ckan2hpc(
  ckan_host=args.ckan_host
@@ -270,12 +281,13 @@ class DataTransferProxy:
  hpc_secret_key_path=args.hpc_secret_key,
  hpc_secret_key_password=args.hpc_secret_key_password,
  data_target=args.data_target,
+ concurrent_tasks=args.concurrent_tasks,
  )
  
  except Exception as ex:
  raise HidDataTransferException(ex) from ex
  
- def local2ckan(self, args) -> str:
+ def local2ckan(self, args) -> tuple[str, AccountingInfo]:
  """transfer data from local filesystem to CKAN using SFTP"""
  self.__logger.info(
  "executing local2ckan command with args: %s",
@@ -296,7 +308,7 @@ class DataTransferProxy:
  except Exception as ex:
  raise HidDataTransferException(ex) from ex
  
- def ckan2local(self, args) -> str:
+ def ckan2local(self, args) -> tuple[str, AccountingInfo]:
  """transfer data from CKAN to the local filesystem using SFTP"""
  self.__logger.info(
  "executing ckan2local command with args: %s",
@@ -224,6 +224,17 @@ class CLIParser(argparse.ArgumentParser):
  required=False, action="store_true", default=False,
  help="[Optional] HPC requires 2FA authentication"
  )
+ hdfs2hpc_parser.add_argument(
+ "-acct", "--accounting",
+ required=False, action="store_true", default=False,
+ help="[Optional] Enable returning accounting information of data transfer"
+ )
+ hdfs2hpc_parser.add_argument(
+ "-ct", "--concurrent-tasks",
+ required=False, type=int, default=1,
+ help="[Optional] set the number of concurrent tasks"
+ " for parallel data transfer"
+ )
  hdfs2hpc_parser.set_defaults(func=target.hdfs2hpc)
  
  # hpc2hdfs
@@ -244,6 +255,17 @@ class CLIParser(argparse.ArgumentParser):
  required=False, action="store_true", default=False,
  help="[Optional] HPC requires 2FA authentication"
  )
+ hpc2hdfs_parser.add_argument(
+ "-acct", "--accounting",
+ required=False, action="store_true", default=False,
+ help="[Optional] Enable returning accounting information of data transfer"
+ )
+ hpc2hdfs_parser.add_argument(
+ "-ct", "--concurrent-tasks",
+ required=False, type=int, default=1,
+ help="[Optional] set the number of concurrent tasks"
+ " for parallel data transfer"
+ )
  hpc2hdfs_parser.set_defaults(func=target.hpc2hdfs)
  
  # ckan2hdfs
@@ -264,6 +286,17 @@ class CLIParser(argparse.ArgumentParser):
  "-t", "--data-target", required=False, help="[Optional] target HDFS folder"
  )
  ckan2hdfs_parser = self.add_default_kerberos_arguments(ckan2hdfs_parser)
+ ckan2hdfs_parser.add_argument(
+ "-acct", "--accounting",
+ required=False, action="store_true", default=False,
+ help="[Optional] Enable returning accounting information of data transfer"
+ )
+ ckan2hdfs_parser.add_argument(
+ "-ct", "--concurrent-tasks",
+ required=False, type=int, default=1,
+ help="[Optional] set the number of concurrent tasks"
+ " for parallel data transfer"
+ )
  ckan2hdfs_parser.set_defaults(func=target.ckan2hdfs)
  
  # hdfs2ckan
@@ -278,6 +311,17 @@ class CLIParser(argparse.ArgumentParser):
  required=True,
  help="File path to HDFS file or directory to transfer",
  )
+ hdfs2ckan_parser.add_argument(
+ "-acct", "--accounting",
+ required=False, action="store_true", default=False,
+ help="[Optional] Enable returning accounting information of data transfer"
+ )
+ hdfs2ckan_parser.add_argument(
+ "-ct", "--concurrent-tasks",
+ required=False, type=int, default=1,
+ help="[Optional] set the number of concurrent tasks"
+ " for parallel data transfer"
+ )
  hdfs2ckan_parser.set_defaults(func=target.hdfs2ckan)
  
  # ckan2hpc
@@ -303,6 +347,17 @@ class CLIParser(argparse.ArgumentParser):
  required=False, action="store_true", default=False,
  help="[Optional] HPC requires 2FA authentication"
  )
+ ckan2hpc_parser.add_argument(
+ "-acct", "--accounting",
+ required=False, action="store_true", default=False,
+ help="[Optional] Enable returning accounting information of data transfer"
+ )
+ ckan2hpc_parser.add_argument(
+ "-ct", "--concurrent-tasks",
+ required=False, type=int, default=1,
+ help="[Optional] set the number of concurrent tasks"
+ " for parallel data transfer"
+ )
  ckan2hpc_parser.set_defaults(func=target.ckan2hpc)
  
  # hpc2ckan
@@ -310,6 +365,13 @@ class CLIParser(argparse.ArgumentParser):
  "hpc2ckan", help="transfer data from HPC to a target CKAN"
  )
  hpc2ckan_parser = self.add_default_ckan_arguments(hpc2ckan_parser)
+ hpc2ckan_parser.add_argument(
+ "-r",
+ "--ckan-resource",
+ required=False,
+ help="[Optional] CKAN resource to create from transferred sources. \
+ If omitted, target resource name will adopt the source file or folder name",
+ )
  hpc2ckan_parser = self.add_default_hpc_arguments(hpc2ckan_parser)
  hpc2ckan_parser.add_argument(
  "-2fa", "--two-factor-authentication",
@@ -322,6 +384,17 @@ class CLIParser(argparse.ArgumentParser):
  required=True,
  help="File path to HPC file or directory to transfer",
  )
+ hpc2ckan_parser.add_argument(
+ "-acct", "--accounting",
+ required=False, action="store_true", default=False,
+ help="[Optional] Enable returning accounting information of data transfer"
+ )
+ hpc2ckan_parser.add_argument(
+ "-ct", "--concurrent-tasks",
+ required=False, type=int, default=1,
+ help="[Optional] set the number of concurrent tasks"
+ " for parallel data transfer"
+ )
  hpc2ckan_parser.set_defaults(func=target.hpc2ckan)
  
  # local2ckan
@@ -336,13 +409,17 @@ class CLIParser(argparse.ArgumentParser):
  help="[Optional] CKAN resource to create from transferred sources. \
  If omitted, target resource name will adopt the source file or folder name",
  )
-
  local2ckan_parser.add_argument(
  "-s",
  "--data-source",
  required=True,
  help="File path to local file or directory to transfer",
  )
+ local2ckan_parser.add_argument(
+ "-acct", "--accounting",
+ required=False, action="store_true", default=False,
+ help="[Optional] Enable returning accounting information of data transfer"
+ )
  local2ckan_parser.set_defaults(func=target.local2ckan)
  
  # ckan2local
@@ -357,7 +434,6 @@ class CLIParser(argparse.ArgumentParser):
  help="[Optional] CKAN resource to transfer. \
  If omitted, all resources in the dataset will be transferred",
  )
-
  ckan2local_parser.add_argument(
  "-t",
  "--data-target",
@@ -365,6 +441,11 @@ class CLIParser(argparse.ArgumentParser):
  help="Local directory where to transfer the data. \
  If omitted, data will be transferred to the current directory",
  )
+ ckan2local_parser.add_argument(
+ "-acct", "--accounting",
+ required=False, action="store_true", default=False,
+ help="[Optional] Enable returning accounting information of data transfer"
+ )
  ckan2local_parser.set_defaults(func=target.ckan2local)
  
  return self.parse_args(args)
@@ -1,6 +1,6 @@
  [project]
  name = "data-transfer-cli"
- version = "0.3.6"
+ version = "0.3.7"
  description = "HiDALGO Data Transfer CLI provides commands to transfer data between different data providers and consumers using NIFI pipelines"
  authors = [
  { name = "Jesús Gorroñogoitia", email = "jesus.gorronogoitia@eviden.com" },
@@ -11,9 +11,9 @@ requires-python = ">=3.11, <4.0"
  dependencies = [
  "requests>=2.31.0",
  "paramiko>=3.3.1",
- "hid_data_transfer_lib>=0.3.4",
- #"hid_data_transfer_lib @ file:///home/yosu/Projects/Hidalgo2/git/hid-data-management/data-transfer/nifi/hid_data_transfer_lib/dist/hid_data_transfer_lib-0.3.4-py3-none-any.whl",
+ "hid_data_transfer_lib>=0.3.7",
  "pyyaml (>=6.0.2,<7.0.0)",
+ #"hid-data-transfer-lib @ file:///home/yosu/Projects/Hidalgo2/git/hid-data-management/data-transfer/nifi/hid_data_transfer_lib/dist/hid_data_transfer_lib-0.3.7-py3-none-any.whl",
  ]
  
  [tool.poetry]