data-transfer-cli 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_transfer_cli-0.3.2/PKG-INFO +235 -0
- data_transfer_cli-0.3.2/README.md +215 -0
- data_transfer_cli-0.3.2/pyproject.toml +45 -0
- data_transfer_cli-0.3.2/src/.env +1 -0
- data_transfer_cli-0.3.2/src/__init__.py +0 -0
- data_transfer_cli-0.3.2/src/conf/cli.cfg +8 -0
- data_transfer_cli-0.3.2/src/data_transfer_cli.py +171 -0
- data_transfer_cli-0.3.2/src/data_transfer_proxy.py +331 -0
- data_transfer_cli-0.3.2/src/dtcli.cfg +16 -0
- data_transfer_cli-0.3.2/src/parser/__init__.py +0 -0
- data_transfer_cli-0.3.2/src/parser/cli_parser.py +370 -0

data_transfer_cli-0.3.2/PKG-INFO
@@ -0,0 +1,235 @@

Metadata-Version: 2.3
Name: data-transfer-cli
Version: 0.3.2
Summary: HiDALGO Data Transfer CLI provides commands to transfer data between different data providers and consumers using NIFI pipelines
License: APL-2.0
Author: Jesús Gorroñogoitia
Author-email: jesus.gorronogoitia@eviden.com
Requires-Python: >=3.11, <4.0
Classifier: License :: Other/Proprietary License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Dist: hid_data_transfer_lib (>=0.3.2)
Requires-Dist: paramiko (>=3.3.1)
Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
Requires-Dist: requests (>=2.31.0)
Description-Content-Type: text/markdown

# Hidalgo2 Data Transfer Tool
This repository contains the implementation of the Hidalgo2 data transfer tool. It uses [Apache NIFI](https://nifi.apache.org/) to transfer data from different data sources to specified targets.

## Features
This tool plans to support the following features:
- transfer datasets from Cloud Providers to HDFS
- transfer datasets from Cloud Providers to CKAN
- transfer datasets from/to Hadoop HDFS to/from HPC
- transfer datasets from/to Hadoop HDFS to/from CKAN
- transfer datasets from/to CKAN to/from HPC
- transfer datasets from/to a local filesystem to/from CKAN

## Prototype
The current prototype supports the following features:
- transfer datasets from/to Hadoop HDFS to/from HPC
- transfer datasets from/to Hadoop HDFS to/from CKAN
- transfer datasets from/to CKAN to/from HPC
- transfer datasets from/to a local filesystem to/from CKAN

## Implementation
The current implementation is based on Python. It is implemented as a CLI that executes a transfer command by creating a NIFI process group out of the workflow definition registered in the NIFI registry. It uses the parameters given in the CLI command invocation to populate a NIFI parameter context that is associated with the created process group. Then, the process group processors are executed once (or until the incoming flowfile queue is empty), one after another, following the group sequence flow, until the flow is completed. To check the status of the transfer command, the CLI offers a check-status command. The Data Transfer CLI tool sends requests to NIFI through its REST API.
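
For illustration, the kind of NIFI REST API requests involved can be reproduced manually with curl (a sketch only: the CLI performs the equivalent calls internally through *hid_data_transfer_lib*, and the host and process group id below are hypothetical placeholders):

```
# obtain a NIFI access token with a NIFI user account
TOKEN=$(curl -k -X POST "https://<nifi_host>:8443/nifi-api/access/token" \
  -d "username=<nifi_login>" -d "password=<nifi_password>")

# query the status of a process group created for a transfer command
curl -k -H "Authorization: Bearer $TOKEN" \
  "https://<nifi_host>:8443/nifi-api/flow/process-groups/<process_group_id>/status"
```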

## Requirements
To use the Data Transfer CLI tool, the following are required:
- **Python3** execution environment
- **Poetry** python package management tool
- **NIFI** instance, with either a NIFI service or a KEYCLOAK account, plus a NIFI server account (for keys transfer)
- **HDFS** instance, with a user Kerberos token (i.e. an authenticated Kerberos principal) if required
- **CKAN** instance, with a user API key

Python3 and Poetry should be installed on the computer where the Data Transfer CLI tool will be used.
To install Poetry, follow [these instructions](https://python-poetry.org/docs/#installing-with-the-official-installer).

For a quick download, setup, configuration and execution of the DTCLI go to section [Quick Deployment, setup, configuration and execution](#quick-deployment-setup-configuration-and-execution).

## CLI configuration
### Configuration file
Before using the Data Transfer CLI tool, you should configure it to point at the target NIFI. The configuration file is located at *src/dtcli.cfg*.

```
[Nifi]
nifi_endpoint=http://localhost:8443
nifi_upload_folder=/opt/nifi/data/upload
nifi_download_folder=/opt/nifi/data/download
nifi_secure_connection=True

[Keycloak]
keycloak_endpoint=https://idm.hidalgo2.eu
keycloak_client_id=nifi
keycloak_client_secret=Tkt8BQmTfUkSceknml6HDSbmGyNRik9V
```

Under the NIFI section,
- we define the URL of the NIFI service (*nifi_endpoint*),
- we specify a folder (*nifi_upload_folder*) on the NIFI server where files are uploaded,
- and another folder (*nifi_download_folder*) from which files are downloaded. These folders must be accessible by the NIFI service (ask the NIFI administrator for details).
- Additionally, you can set whether the NIFI server listens on a secure HTTPS connection (*nifi_secure_connection*=True) or on non-secure HTTP (*nifi_secure_connection*=False).

Under the Keycloak section, you can configure the Keycloak instance integrated with NIFI, specifying:
- The Keycloak service endpoint (*keycloak_endpoint*)
- The NIFI client id in Keycloak (*keycloak_client_id*)
- The NIFI client secret in Keycloak (*keycloak_client_secret*)

### NIFI and Keycloak credentials in environment variables
We must also specify a user account (username, private_key) that is allowed to upload/download files to/from the NIFI server (needed to upload temporary HPC keys and to support local file transfer). This account is provided by the Hidalgo2 infrastructure provider and is specific to each user. It is set up in the following environment variables:
- NIFI_SERVER_USERNAME: `export NIFI_SERVER_USERNAME=<nifi_server_username>`
- NIFI_SERVER_PRIVATE_KEY: `export NIFI_SERVER_PRIVATE_KEY=<path_to_private_key>`

Additionally, a user account granted access to the NIFI service must be specified, either:

#### A) NIFI User Account
The NIFI account must be configured in the following environment variables:
- NIFI_LOGIN: `export NIFI_LOGIN=<nifi_login>`
- NIFI_PASSWORD: `export NIFI_PASSWORD=<nifi_password>`

This NIFI account is provided by the NIFI administrator; or:

#### B) Keycloak Account with access to NIFI
The Keycloak account must be configured in the following environment variables:
- KEYCLOAK_LOGIN: `export KEYCLOAK_LOGIN=<keycloak_login>`
- KEYCLOAK_PASSWORD: `export KEYCLOAK_PASSWORD=<keycloak_password>`

This Keycloak account is provided by the Keycloak administrator.
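
As a quick sanity check of the Keycloak settings and account, a standard Keycloak password-grant token request against the configured endpoint can be issued manually (a sketch only: the realm name is a hypothetical placeholder, older Keycloak versions prefix the path with */auth*, and the Data Transfer CLI tool performs this exchange for you):

```
curl -X POST "https://idm.hidalgo2.eu/realms/<realm>/protocol/openid-connect/token" \
  -d "grant_type=password" \
  -d "client_id=nifi" \
  -d "client_secret=<keycloak_client_secret>" \
  -d "username=$KEYCLOAK_LOGIN" \
  -d "password=$KEYCLOAK_PASSWORD"
```

A JSON response containing an *access_token* indicates that the client and the account are correctly configured.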

## Quick Deployment, setup, configuration and execution
### From GitLab repository
1. Clone the Data Transfer CLI repository.
2. Set up the hid_data_transfer_lib project.
   Go to folder *hid-data-management/data-transfer/nifi/hid_data_transfer_lib*.
   On the prompt, run: `poetry install && poetry build`
   Note: this is only required while this tool is under development, as data-transfer-cli references the local hid_data_transfer_lib and not the one published on PyPI.
3. Set up the data-transfer-cli project with poetry.
   Go to folder *hid-data-management/data-transfer/nifi/data-transfer-cli*.
   On the prompt, run `./setup.sh`
4. Configure your NIFI and Keycloak services by modifying the default DT CLI configuration (attached to the HiDALGO2 NIFI and KEYCLOAK) located at *src/dtcli.cfg*.
5. Edit *setenv.sh*. Provide your accounts for KEYCLOAK and the NIFI server. Contact the HiDALGO2 administrator to request them.
```
export NIFI_SERVER_USERNAME="<username>"
export NIFI_SERVER_PRIVATE_KEY="<relative_path_to_ssh_private_key>"
export KEYCLOAK_LOGIN="<username>"
export KEYCLOAK_PASSWORD="<password>"
```
6. Run the Data Transfer CLI tool. In this example, we ask it for help: `dtcli -h`. A consolidated example session is shown below.
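
Putting the steps together, a typical first session looks like this (a sketch: the repository URL is a placeholder and the paths follow the folder layout described above):

```
git clone <gitlab_repository_url>
cd hid-data-management/data-transfer/nifi/hid_data_transfer_lib
poetry install && poetry build
cd ../data-transfer-cli
./setup.sh
# edit src/dtcli.cfg and setenv.sh as described above, then
source setenv.sh
./dtcli -h
```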

### From PyPI installation
To be done

## Usage
The Data Transfer CLI tool can be executed by invoking the command `dtcli`. Add this command's location to your path: either the *data_transfer_cli* folder (when cloned from GitLab) or its installation location when installed with pip from PyPI:

`./dtcli command <arguments>`

To get help, execute:

`./dtcli -h`

obtaining:

```
usage: ['-h'] [-h]
             {check-status,hdfs2hpc,hpc2hdfs,ckan2hdfs,hdfs2ckan,ckan2hpc,hpc2ckan,local2ckan,ckan2local}
             ...

positional arguments:
  {check-status,hdfs2hpc,hpc2hdfs,ckan2hdfs,hdfs2ckan,ckan2hpc,hpc2ckan,local2ckan,ckan2local}
                        supported commands to transfer data
    check-status        check the status of a command
    hdfs2hpc            transfer data from HDFS to target HPC
    hpc2hdfs            transfer data from HPC to target HDFS
    ckan2hdfs           transfer data from CKAN to target HDFS
    hdfs2ckan           transfer data from HDFS to a target CKAN
    ckan2hpc            transfer data from CKAN to target HPC
    hpc2ckan            transfer data from HPC to a target CKAN
    local2ckan          transfer data from a local filesystem to a target CKAN
    ckan2local          transfer data from CKAN to a local filesystem

options:
  -h, --help            show this help message and exit
```

To get help for a particular command:

`./dtcli hdfs2hpc -h`

obtaining:

```
usage: ['hdfs2hpc', '-h'] hdfs2hpc [-h] -s DATA_SOURCE [-t DATA_TARGET] [-kpr KERBEROS_PRINCIPAL] [-kp KERBEROS_PASSWORD] -H HPC_HOST [-z HPC_PORT] -u HPC_USERNAME [-p HPC_PASSWORD] [-k HPC_SECRET_KEY] [-P HPC_SECRET_KEY_PASSWORD]

options:
  -h, --help            show this help message and exit
  -s DATA_SOURCE, --data-source DATA_SOURCE
                        HDFS file path
  -t DATA_TARGET, --data-target DATA_TARGET
                        [Optional] HPC folder
  -kpr KERBEROS_PRINCIPAL, --kerberos-principal KERBEROS_PRINCIPAL
                        [Optional] Kerberos principal (mandatory for a Kerberized HDFS)
  -kp KERBEROS_PASSWORD, --kerberos-password KERBEROS_PASSWORD
                        [Optional] Kerberos principal password (mandatory for a Kerberized HDFS)
  -H HPC_HOST, --hpc-host HPC_HOST
                        Target HPC ssh host
  -z HPC_PORT, --hpc-port HPC_PORT
                        [Optional] Target HPC ssh port
  -u HPC_USERNAME, --hpc-username HPC_USERNAME
                        Username for HPC account
  -p HPC_PASSWORD, --hpc-password HPC_PASSWORD
                        [Optional] Password for HPC account. Either password or secret key is required
  -k HPC_SECRET_KEY, --hpc-secret-key HPC_SECRET_KEY
                        [Optional] Path to HPC secret key. Either password or secret key is required
  -P HPC_SECRET_KEY_PASSWORD, --hpc-secret-key-password HPC_SECRET_KEY_PASSWORD
                        [Optional] Password for HPC secret key
  -2fa, --two-factor-authentication
                        [Optional] HPC requires 2FA authentication
```

A common command flow (e.g. transferring data from HDFS to HPC) looks like this (a concrete example invocation is shown below):

- execute the *hdfs2hpc* CLI command to transfer data from an HDFS location (e.g. /users/yosu/data/genome-tags.csv) to a remote HPC (e.g. LUMI, at the $HOME/data folder)
- check the status of the *hdfs2hpc* transfer (and possible warnings/errors) with the *check-status* CLI command
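
For instance (a sketch with hypothetical host, account and key values; *check-status* expects the command id reported by the transfer command, see `./dtcli check-status -h` for its exact arguments):

```
# transfer a file from HDFS to a folder on the HPC cluster
./dtcli hdfs2hpc -s /users/yosu/data/genome-tags.csv -t <hpc_target_folder> \
  -H <hpc_frontend_host> -u <hpc_username> -k ~/.ssh/<secret_key>

# later, check how the transfer went, using the command id printed by hdfs2hpc
./dtcli check-status <command_id>
```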

## Support for HPC clusters that require a 2FA token
The Data Transfer CLI tool's commands support transferring data to/from HPC clusters that require a 2FA token. These commands offer an optional flag *-2fa* (*--two-factor-authentication*). If set by the user, the command prompts the user (on standard input) for the token when it is required.
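
For example (a sketch; host and account values are placeholders), the same transfer against a 2FA-protected cluster only adds the flag, and the token is typed in when prompted:

```
./dtcli hdfs2hpc -s /users/yosu/data/genome-tags.csv -t <hpc_target_folder> \
  -H <hpc_frontend_host> -u <hpc_username> -k ~/.ssh/<secret_key> -2fa
```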

## Predefined profiles for data hosts
To avoid feeding the Data Transfer CLI tool with many inputs describing the hosts of the source and target data providers/consumers, the user can define them in the `~/dtcli/config` YAML file, as shown in the following YAML code snippet:
```
# Meluxina
login.lxp.lu:
  username: u102309
  port: 8822
  secret-key: ~/.ssh/<secret_key>
  secret-key-password: <password>

# CKAN
ckan.hidalgo2.eu:
  api-key: <api-key>
  organization: atos
  dataset: test-dataset
```

where details for the Meluxina HPC and CKAN are given. For an HPC cluster, provide the HPC host as the key, followed by a colon, and below it, indented, any of the hpc parameters described by the Data Transfer CLI tool help, without the *hpc_* prefix. For instance, if the Data Transfer CLI tool help mentions:
```
  -u HPC_USERNAME, --hpc-username HPC_USERNAME
                        Username for HPC account
```
that is, *--hpc-username* as parameter, use *username* as the nested property in the HPC profile's description in the YAML config file, as shown in the example above. Proceed similarly for the other HPC parameters, such as *port*, *password*, *secret-key*, etc.
The same procedure can be adopted to describe the CKAN host's parameters.

Note: the Hidalgo2 HPDA configuration is included in the Data Transfer CLI tool implementation and does not need to be added to this config file.

Then, when you launch a Data Transfer CLI tool command, any parameter not given on the command line is retrieved from the config file if the corresponding host entry exists. If the command line is then complete (i.e. all required parameters are provided), the command is executed; otherwise the corresponding error is reported. An example is shown below.
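
With the profile above in place (a sketch; the data source path and target folder are placeholders), the hdfs2hpc invocation shortens to the host and the data locations, while *username*, *port*, *secret-key* and *secret-key-password* are taken from the `login.lxp.lu` entry:

```
./dtcli hdfs2hpc -s /users/yosu/data/genome-tags.csv -t <hpc_target_folder> -H login.lxp.lu
```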

data_transfer_cli-0.3.2/README.md
@@ -0,0 +1,215 @@

(The content of README.md is identical to the project description embedded in PKG-INFO above, starting at "# Hidalgo2 Data Transfer Tool".)

data_transfer_cli-0.3.2/pyproject.toml
@@ -0,0 +1,45 @@

[project]
name = "data-transfer-cli"
version = "0.3.2"
description = "HiDALGO Data Transfer CLI provides commands to transfer data between different data providers and consumers using NIFI pipelines"
authors = [
    { name = "Jesús Gorroñogoitia", email = "jesus.gorronogoitia@eviden.com" },
]
license = "APL-2.0"
readme = "README.md"
requires-python = ">=3.11, <4.0"
dependencies = [
    "requests>=2.31.0",
    "paramiko>=3.3.1",
    "hid_data_transfer_lib>=0.3.2",
    #"hid_data_transfer_lib @ file:///home/yosu/Projects/Hidalgo2/git/hid-data-management/data-transfer/nifi/hid_data_transfer_lib/dist/hid_data_transfer_lib-0.3.2-py3-none-any.whl",
    "pyyaml (>=6.0.2,<7.0.0)"
]

[tool.poetry]
packages = [{ include = "src" }, { include = "parser", from = "src" }]

[tool.poetry.dependencies]
# Comment out when using the version installed in Pypi
# hid_data_transfer_lib = { path = "../hid_data_transfer_lib/dist/hid_data_transfer_lib-0.3.2-py3-none-any.whl", develop = true }

[tool.poetry.group.dev.dependencies]
flake8 = "^6.1.0"
black = "^23.9.1"
pytest = "^7.4.2"
pytest-cov = "^5.0.0"
pytest-env = "^1.1.3"

[tool.poetry.group.types.dependencies]
types-paramiko = "^3.4.0.20240311"
types-requests = "^2.31.0.20240311"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[project.scripts]
cli = 'data_transfer_cli:main'

[tool.pytest.ini_options]
pythonpath = ["src"]

data_transfer_cli-0.3.2/src/.env
@@ -0,0 +1 @@

PYTHONPATH=./src

data_transfer_cli-0.3.2/src/__init__.py
File without changes

data_transfer_cli-0.3.2/src/data_transfer_cli.py
@@ -0,0 +1,171 @@

'''
Copyright 2024 Eviden
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an 'AS IS' BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

CLI tool for data transfer based on Apache NIFI
Initial PoC features
- hdfs2hpc: transfer data from hdfs to target hpc, using sftp processor:
    - inputs:
        - hpc_host: hpc frontend hostname
        - hpc_username: user account name
        - hpc_secret_key_path: user's secret key location
        - data-source: HDFS file path
        - data-target: HPC remote folder

- hpc2hdfs: transfer data file from hpc folder to target hdfs folder,
  using sftp processor:
    - inputs:
        - hpc_host: hpc frontend hostname
        - hpc_username: user account name
        - hpc_secret_key_path: user's secret key location
        - data-source: HPC file path
        - data-target: HDFS remote folder

- ckan2hpc: transfer data from ckan to target hpc,
  using ckan and sftp processors:
    - inputs:
        - ckan_host: CKAN host endpoint
        - ckan_api_key: CKAN API key
        - ckan_organization: CKAN organization
        - ckan_dataset: CKAN dataset
        - ckan_resource: CKAN resource
        - hpc_host: hpc frontend hostname
        - hpc_username: user account name
        - hpc_secret_key_path: user's secret key location
        - data-target: HPC remote folder

- hpc2ckan: transfer data from hpc to target ckan,
  using ckan and sftp processors:
    - inputs:
        - ckan_host: CKAN host endpoint
        - ckan_api_key: CKAN API key
        - ckan_organization: CKAN organization
        - ckan_dataset: CKAN dataset
        - hpc_host: hpc frontend hostname
        - hpc_username: user account name
        - hpc_secret_key_path: user's secret key location
        - data_source: HPC file path

- local2ckan: transfer data from a local filesystem to a target ckan
    - inputs:
        - ckan_host: CKAN host endpoint
        - ckan_api_key: CKAN API key
        - ckan_organization: CKAN organization
        - ckan_dataset: CKAN dataset
        - ckan_resource: CKAN resource to receive the data
        - data_source: local file path to the data to transfer

- ckan2local: transfer data from ckan to a local filesystem
    - inputs:
        - ckan_host: CKAN host endpoint
        - ckan_api_key: CKAN API key
        - ckan_organization: CKAN organization
        - ckan_dataset: CKAN dataset
        - ckan_resource: CKAN resource to transfer
        - data_target: local target directory where to transfer the resource

- check-status: check the execution state of a command
    - inputs:
        - command_id: uuid of command executed
          (uuid is reported after command execution)


This CLI uses a NIFI account to get an access token.
It uses the NIFI REST API to send requests.
It uses a predefined and installed process group HDFS2HPC template
with an associated parameter context.
'''

import sys
import os
import threading
import traceback
import warnings

from hid_data_transfer_lib.exceptions.hid_dt_exceptions import HidDataTransferException
from hid_data_transfer_lib.conf.hid_dt_configuration import HidDataTransferConfiguration

from src.data_transfer_proxy import DataTransferProxy
from src.parser.cli_parser import CLIParser


warnings.filterwarnings("ignore")


# Get CLI configuration
os.environ["HID_DT_CONFIG_FILE"] = \
    str(os.path.dirname(os.path.realpath(__file__))) + "/dtcli.cfg"
config = HidDataTransferConfiguration()

# Data Transfer proxy to the library
dt_proxy = DataTransferProxy(config, True)


class ThreadRaisingExceptions(threading.Thread):
    """Thread class that raises exceptions in the main thread
    when the thread finishes with an exception"""

    def __init__(self, *args, **kwargs):
        self._exception = None
        self._process_group_id = None
        super().__init__(*args, **kwargs)

    def run(self):
        try:
            self._process_group_id = self._target(*self._args, **self._kwargs)
        except HidDataTransferException as e:
            self._exception = e
            raise e

    def join(self, *args, **kwargs):
        super().join(*args, **kwargs)
        if self._exception:
            raise self._exception
        return self._process_group_id


def main(args=None):
    """Main entry point for the Data Transfer CLI"""
    if not args:
        args = sys.argv[1:]
    # Parse arguments
    cli_parser = CLIParser(args)

    try:
        if len(args) == 0:
            cli_parser.print_help()
            sys.exit(1)

        # Read user's config file to complete missing arguments with default ones
        args = cli_parser.fill_missing_args_from_config(args)
        args = cli_parser.parse_arguments(args, dt_proxy)

        # executes associated command in data_transfer_cli module
        thread = ThreadRaisingExceptions(target=args.func, args=(args,))
        thread.start()
        thread.join()
    except HidDataTransferException as e:
        if e.process_group_id():
            sys.stderr.write(
                (
                    f"Got error {e} when executing process group "
                    f"with id {e.process_group_id()}"
                )
            )
        else:
            traceback.print_exc(file=sys.stderr)
        raise e


if __name__ == "__main__":
    main()
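
Given the `main(args=None)` entry point above, the module can also be run directly during development (a sketch; it assumes the repository root is the current working directory so that the `src` package is importable):

```
python -m src.data_transfer_cli hdfs2hpc -h
```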