data-factory-utils 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Init file."""
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
"""Class to help with AWS environment inference and configuration."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import secrets
|
|
6
|
+
import string
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Self
|
|
9
|
+
|
|
10
|
+
import boto3
|
|
11
|
+
import botocore.exceptions
|
|
12
|
+
from cloudpathlib import S3Path
|
|
13
|
+
from mypy_boto3_s3.type_defs import BucketTypeDef
|
|
14
|
+
from mypy_boto3_sts.type_defs import CredentialsTypeDef
|
|
15
|
+
|
|
16
|
+
# NOTE(review): this configures the root logger (level INFO) as a side effect
# of importing the module, which also affects the importing application.
# Confirm that this is intended for a library package.
logging.basicConfig(level=logging.INFO)
|
|
17
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Lowercase ASCII letters; the character pool for the random suffix that
# Environment.refresh_credentials appends to the role session name.
alphabet = string.ascii_lowercase
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _raise_missing_token() -> None:
    """Log and raise the error for a missing web identity token path.

    Raises
    ------
    RuntimeError
        Always; AWS_WEB_IDENTITY_TOKEN_FILE is not set in the environment.

    """
    msg = "AWS_WEB_IDENTITY_TOKEN_FILE environment variable not set."
    # logger.error rather than logger.exception: no exception is being handled
    # at this point, so logger.exception would attach a meaningless
    # "NoneType: None" traceback to the log record.
    logger.error(msg)
    raise RuntimeError(msg)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _raise_missing_role_arn() -> None:
    """Raise the error for a missing AWS_ROLE_ARN environment variable.

    Raises
    ------
    MissingEnvVarRoleArnError
        Always.

    """
    raise MissingEnvVarRoleArnError()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class MissingEnvVarRoleArnError(Exception):
    """Exception for a missing role arn env var.

    The explanatory text is available both as the ``message`` attribute and
    as ``str(exc)``.
    """

    def __init__(self) -> None:
        """Build the error with its fixed explanatory message."""
        self.message = "Add AWS_ROLE_ARN to environment variables."
        # Hand the message to Exception so that str(exc), %-style log
        # formatting and tracebacks show it instead of an empty string.
        super().__init__(self.message)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class Environment:
|
|
42
|
+
"""AWS environment inference and configuration.
|
|
43
|
+
|
|
44
|
+
This class helps determine the environment (prod, preprod, test, dev),
|
|
45
|
+
manage AWS credentials (via web identity or default),
|
|
46
|
+
and construct environment-specific S3 bucket URLs.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
_instance: Self | None = None
|
|
50
|
+
|
|
51
|
+
def __new__(cls, *_args: object, **_kwargs: object) -> Self:
|
|
52
|
+
"""Singleton instantiation."""
|
|
53
|
+
if cls._instance is None:
|
|
54
|
+
cls._instance = super().__new__(cls)
|
|
55
|
+
return cls._instance
|
|
56
|
+
|
|
57
|
+
def __init__(
|
|
58
|
+
self,
|
|
59
|
+
job_name: str | None = "",
|
|
60
|
+
bucket_prefix: str | None = "emds",
|
|
61
|
+
*,
|
|
62
|
+
use_web_identity: bool,
|
|
63
|
+
) -> None:
|
|
64
|
+
"""Initialize the environment context.
|
|
65
|
+
|
|
66
|
+
Parameters
|
|
67
|
+
----------
|
|
68
|
+
job_name : str, optional
|
|
69
|
+
Name used when assuming a role session, by default empty string.
|
|
70
|
+
bucket_prefix : str, optional
|
|
71
|
+
Base prefix used to identify datahub buckets, by default "emds".
|
|
72
|
+
use_web_identity : bool, optional
|
|
73
|
+
Whether to try using web identity credentials first.
|
|
74
|
+
|
|
75
|
+
"""
|
|
76
|
+
if hasattr(self, "_initialized") and self._initialized:
|
|
77
|
+
return
|
|
78
|
+
|
|
79
|
+
self.job_name = job_name
|
|
80
|
+
self.bucket_prefix = bucket_prefix
|
|
81
|
+
self.use_web_identity = use_web_identity
|
|
82
|
+
|
|
83
|
+
self.session = self._init_session()
|
|
84
|
+
self.alias = self._fetch_account_alias()
|
|
85
|
+
self.account_no = self._get_account_number()
|
|
86
|
+
self.bucket_list = self._list_buckets()
|
|
87
|
+
|
|
88
|
+
self._initialized: bool = True
|
|
89
|
+
|
|
90
|
+
def _init_session(self) -> boto3.Session:
|
|
91
|
+
"""Initialise a boto3 session, optionally using web identity credentials.
|
|
92
|
+
|
|
93
|
+
Returns
|
|
94
|
+
-------
|
|
95
|
+
boto3.Session
|
|
96
|
+
Configured boto3 session.
|
|
97
|
+
|
|
98
|
+
"""
|
|
99
|
+
if self.use_web_identity:
|
|
100
|
+
try:
|
|
101
|
+
token_path = os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE")
|
|
102
|
+
if token_path is None:
|
|
103
|
+
_raise_missing_token()
|
|
104
|
+
else:
|
|
105
|
+
path = Path(token_path)
|
|
106
|
+
with path.open() as f:
|
|
107
|
+
web_identity_token = f.read()
|
|
108
|
+
|
|
109
|
+
role_arn = os.environ.get("AWS_ROLE_ARN")
|
|
110
|
+
if role_arn is None:
|
|
111
|
+
_raise_missing_role_arn()
|
|
112
|
+
else:
|
|
113
|
+
role_arn_arg = role_arn
|
|
114
|
+
|
|
115
|
+
sts_client = boto3.client("sts")
|
|
116
|
+
response = sts_client.assume_role_with_web_identity(
|
|
117
|
+
RoleArn=role_arn_arg,
|
|
118
|
+
RoleSessionName=f"session-{self.job_name}",
|
|
119
|
+
WebIdentityToken=web_identity_token,
|
|
120
|
+
DurationSeconds=900,
|
|
121
|
+
)
|
|
122
|
+
return boto3.session.Session(
|
|
123
|
+
aws_access_key_id=response["Credentials"]["AccessKeyId"],
|
|
124
|
+
aws_secret_access_key=response["Credentials"]["SecretAccessKey"],
|
|
125
|
+
aws_session_token=response["Credentials"]["SessionToken"],
|
|
126
|
+
)
|
|
127
|
+
except Exception as e:
|
|
128
|
+
logger.warning("Web identity failed: %s. Falling back to default session.", e)
|
|
129
|
+
raise
|
|
130
|
+
|
|
131
|
+
return boto3.session.Session()
|
|
132
|
+
|
|
133
|
+
def _fetch_account_alias(self) -> str:
|
|
134
|
+
"""Fetch the AWS account alias.
|
|
135
|
+
|
|
136
|
+
Returns
|
|
137
|
+
-------
|
|
138
|
+
str
|
|
139
|
+
Account alias or default.
|
|
140
|
+
|
|
141
|
+
Notes
|
|
142
|
+
-----
|
|
143
|
+
Falls back to 'preproduction' alias if none found.
|
|
144
|
+
|
|
145
|
+
"""
|
|
146
|
+
try:
|
|
147
|
+
aliases = boto3.client("iam").list_account_aliases().get("AccountAliases", [])
|
|
148
|
+
return aliases[0] if aliases else "electronic-monitoring-data-preproduction"
|
|
149
|
+
except botocore.exceptions.ClientError:
|
|
150
|
+
logger.warning("Failed to fetch account alias, assuming preproduction.")
|
|
151
|
+
return "electronic-monitoring-data-preproduction"
|
|
152
|
+
|
|
153
|
+
def _get_account_number(self) -> str:
|
|
154
|
+
"""Return the AWS account number."""
|
|
155
|
+
try:
|
|
156
|
+
return boto3.client("sts").get_caller_identity()["Account"]
|
|
157
|
+
except botocore.exceptions.NoCredentialsError:
|
|
158
|
+
msg = "AWS credentials not found."
|
|
159
|
+
logger.exception(msg)
|
|
160
|
+
raise RuntimeError(msg) from None
|
|
161
|
+
|
|
162
|
+
def _list_buckets(self) -> list[BucketTypeDef]:
|
|
163
|
+
"""List all available S3 buckets."""
|
|
164
|
+
try:
|
|
165
|
+
return boto3.client("s3").list_buckets()["Buckets"]
|
|
166
|
+
except Exception as e:
|
|
167
|
+
logger.warning("Could not list buckets: %s", e)
|
|
168
|
+
raise
|
|
169
|
+
|
|
170
|
+
@property
|
|
171
|
+
def account_number(self) -> str:
|
|
172
|
+
"""Return the AWS account number."""
|
|
173
|
+
return self.account_no
|
|
174
|
+
|
|
175
|
+
@property
|
|
176
|
+
def environment_name(self) -> str:
|
|
177
|
+
"""Infer environment name from account alias.
|
|
178
|
+
|
|
179
|
+
Returns
|
|
180
|
+
-------
|
|
181
|
+
str
|
|
182
|
+
One of: prod, dev, preprod, test, or fallback to raw alias suffix.
|
|
183
|
+
|
|
184
|
+
"""
|
|
185
|
+
full_env_name = self.alias.split("-")[-1]
|
|
186
|
+
return {
|
|
187
|
+
"production": "prod",
|
|
188
|
+
"development": "dev",
|
|
189
|
+
"preproduction": "preprod",
|
|
190
|
+
"test": "test",
|
|
191
|
+
}.get(full_env_name, full_env_name)
|
|
192
|
+
|
|
193
|
+
@property
|
|
194
|
+
def is_prod(self) -> bool:
|
|
195
|
+
"""Check if the environment is production."""
|
|
196
|
+
return self.environment_name == "prod"
|
|
197
|
+
|
|
198
|
+
def get_full_bucket_url(
|
|
199
|
+
self,
|
|
200
|
+
bucket_prefix: str | None = None,
|
|
201
|
+
*,
|
|
202
|
+
full_prefix: bool,
|
|
203
|
+
) -> S3Path | None:
|
|
204
|
+
"""Get S3Path to bucket matching environment and prefix.
|
|
205
|
+
|
|
206
|
+
Parameters
|
|
207
|
+
----------
|
|
208
|
+
bucket_prefix : str, optional
|
|
209
|
+
Prefix to search for (overrides default prefix).
|
|
210
|
+
full_prefix : bool, optional
|
|
211
|
+
Whether to match full bucket name exactly.
|
|
212
|
+
|
|
213
|
+
Returns
|
|
214
|
+
-------
|
|
215
|
+
Optional[S3Path]
|
|
216
|
+
S3Path to the matched bucket or None if not found.
|
|
217
|
+
|
|
218
|
+
"""
|
|
219
|
+
search_prefix = bucket_prefix or self.bucket_prefix
|
|
220
|
+
expected_name = f"{self.bucket_prefix}-{self.environment_name}-{search_prefix}"
|
|
221
|
+
|
|
222
|
+
for bucket in self.bucket_list:
|
|
223
|
+
bucket_name = bucket["Name"]
|
|
224
|
+
if full_prefix:
|
|
225
|
+
if expected_name == "-".join(bucket_name.split("-")[:-1]) or expected_name == bucket_name:
|
|
226
|
+
return S3Path(f"s3://{bucket_name}")
|
|
227
|
+
elif expected_name in bucket_name:
|
|
228
|
+
return S3Path(f"s3://{bucket_name}")
|
|
229
|
+
return None
|
|
230
|
+
|
|
231
|
+
def get_api_invoke_url(self, api_name: str, region: str) -> str:
|
|
232
|
+
"""Get API invoke url from env."""
|
|
233
|
+
client = boto3.client("apigateway", region)
|
|
234
|
+
rest_api_response = client.get_rest_apis()
|
|
235
|
+
matches = [it for it in rest_api_response["items"] if it["name"] == api_name]
|
|
236
|
+
if len(matches) > 1:
|
|
237
|
+
raise ValueError
|
|
238
|
+
api_details = matches[0]
|
|
239
|
+
return f"https://{api_details['id']}.execute-api.{region}.amazonaws.com/"
|
|
240
|
+
|
|
241
|
+
def refresh_credentials(self) -> CredentialsTypeDef:
|
|
242
|
+
"""Refresh credentials via STS.
|
|
243
|
+
|
|
244
|
+
Returns
|
|
245
|
+
-------
|
|
246
|
+
dict
|
|
247
|
+
New credentials.
|
|
248
|
+
|
|
249
|
+
"""
|
|
250
|
+
try:
|
|
251
|
+
token_path = os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE")
|
|
252
|
+
if token_path is None:
|
|
253
|
+
_raise_missing_token()
|
|
254
|
+
else:
|
|
255
|
+
path = Path(token_path)
|
|
256
|
+
with path.open() as f:
|
|
257
|
+
web_identity_token = f.read()
|
|
258
|
+
|
|
259
|
+
role_arn = os.environ.get("AWS_ROLE_ARN")
|
|
260
|
+
if role_arn is None:
|
|
261
|
+
_raise_missing_role_arn()
|
|
262
|
+
else:
|
|
263
|
+
roel_arn_arg = role_arn
|
|
264
|
+
|
|
265
|
+
sts_client = boto3.client("sts")
|
|
266
|
+
rand_suffix = "".join(secrets.choice(alphabet) for _ in range(10))
|
|
267
|
+
response_assume_role = sts_client.assume_role_with_web_identity(
|
|
268
|
+
RoleArn=roel_arn_arg,
|
|
269
|
+
RoleSessionName=f"session-{self.job_name}-{rand_suffix}",
|
|
270
|
+
WebIdentityToken=web_identity_token,
|
|
271
|
+
DurationSeconds=900,
|
|
272
|
+
)
|
|
273
|
+
return response_assume_role["Credentials"]
|
|
274
|
+
|
|
275
|
+
except Exception:
|
|
276
|
+
logger.exception("Web identity failed. Falling back to get_session_token.")
|
|
277
|
+
|
|
278
|
+
sts_client = boto3.client("sts")
|
|
279
|
+
response_session_token = sts_client.get_session_token(DurationSeconds=900)
|
|
280
|
+
return response_session_token["Credentials"]
|
|
281
|
+
|
|
282
|
+
def export_dbt_variables(self, *, actions: bool = False, airflow: bool = False) -> None:
|
|
283
|
+
"""Export dbt variables for the environment."""
|
|
284
|
+
s3_data_bucket_name = self.get_full_bucket_url("cadt", full_prefix=True)
|
|
285
|
+
dbt_test_profile_workgroup = f"{self.account_number}-default"
|
|
286
|
+
dbt_suffix = "" if self.is_prod else f"_{self.environment_name}_dbt"
|
|
287
|
+
h3_lambda_arn = f"arn:aws:lambda:eu-west-2:{self.account_no}:function:h3-udf"
|
|
288
|
+
|
|
289
|
+
if actions:
|
|
290
|
+
export_suffix = f'echo "DBT_SUFFIX={dbt_suffix}" \
|
|
291
|
+
>> $GITHUB_ENV\n'
|
|
292
|
+
export_bucket = f'echo "S3_DATA_BUCKET_NAME={s3_data_bucket_name}" \
|
|
293
|
+
>> $GITHUB_ENV\n'
|
|
294
|
+
export_dbt_profile = f'echo \
|
|
295
|
+
"DBT_TEST_PROFILE_WORKGROUP={dbt_test_profile_workgroup}"\
|
|
296
|
+
>> $GITHUB_ENV\n'
|
|
297
|
+
export_dbt_profile_location = ""
|
|
298
|
+
export_h3_lambda_arn = f"""echo \
|
|
299
|
+
export H3_LAMBDA_ARN='{h3_lambda_arn}'
|
|
300
|
+
>> $GITHUB_ENV\n
|
|
301
|
+
"""
|
|
302
|
+
export_dbt_suffix = f'echo "DBT_SUFFIX={dbt_suffix}" \
|
|
303
|
+
>> $GITHUB_ENV\n'
|
|
304
|
+
else:
|
|
305
|
+
export_suffix = f"export DBT_SUFFIX='{dbt_suffix}'\n"
|
|
306
|
+
export_bucket = f"export S3_DATA_BUCKET_NAME='{s3_data_bucket_name}'\n"
|
|
307
|
+
export_dbt_profile = f"""
|
|
308
|
+
export DBT_TEST_PROFILE_WORKGROUP='{dbt_test_profile_workgroup}'\n
|
|
309
|
+
"""
|
|
310
|
+
export_dbt_profile_location = 'export DBT_PROFILES_DIR="../.dbt/"\n'
|
|
311
|
+
export_h3_lambda_arn = f"export H3_LAMBDA_ARN='{h3_lambda_arn}'"
|
|
312
|
+
export_dbt_suffix = f"export DBT_SUFFIX='{dbt_suffix}'\n"
|
|
313
|
+
|
|
314
|
+
with Path("set_env.sh").open("w") as f:
|
|
315
|
+
f.write(export_suffix)
|
|
316
|
+
f.write(export_bucket)
|
|
317
|
+
f.write(export_dbt_profile)
|
|
318
|
+
if not airflow:
|
|
319
|
+
f.write(export_dbt_profile_location)
|
|
320
|
+
f.write(export_h3_lambda_arn)
|
|
321
|
+
f.write(export_dbt_suffix)
|
|
322
|
+
|
|
323
|
+
@classmethod
|
|
324
|
+
def clear(cls) -> None:
|
|
325
|
+
"""Reset the singleton instance.
|
|
326
|
+
|
|
327
|
+
Use this to force restart of the class. Mainly for testing.
|
|
328
|
+
"""
|
|
329
|
+
cls._instance = None
|
|
330
|
+
|
|
331
|
+
@classmethod
|
|
332
|
+
def instance(cls) -> "Environment | None":
|
|
333
|
+
"""Return the current singleton instance, if any. Mainly for testing."""
|
|
334
|
+
return cls._instance
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: data-factory-utils
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Utility functions for interacting with data factories.
|
|
5
|
+
Requires-Dist: boto3>=1.42.8
|
|
6
|
+
Requires-Dist: boto3-stubs>=1.42.89
|
|
7
|
+
Requires-Dist: botocore>=1.42.7
|
|
8
|
+
Requires-Dist: cloudpathlib>=0.23.0
|
|
9
|
+
Requires-Dist: mypy-boto3>=1.42.3
|
|
10
|
+
Requires-Dist: mypy-boto3-s3>=1.42.85
|
|
11
|
+
Requires-Dist: mypy-boto3-sts>=1.42.3
|
|
12
|
+
Requires-Python: >=3.12
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# data-factory-utils
|
|
16
|
+
A package of miscellaneous utilities for data factories.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
This is a published package. Install using your favourite installation method.
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
uv add data-factory-utils
|
|
23
|
+
pip install data-factory-utils
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
### Environment functions
|
|
28
|
+
This set of functions reads from your data factory dynamically. It should infer the environment you are in as well.
|
|
29
|
+
|
|
30
|
+
The class is a singleton: no matter how many times you instantiate it, the first instance and the values it fetched are re-used. To create it:
|
|
31
|
+
```python
|
|
32
|
+
from data_factory_utils.environment import Environment
|
|
33
|
+
env = Environment(use_web_identity=False)
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
To return information about the environment (if we are in development with account number 0101010101):
|
|
37
|
+
```python
|
|
38
|
+
env.account_no
|
|
39
|
+
# 0101010101
|
|
40
|
+
env.environment_name
|
|
41
|
+
# dev
|
|
42
|
+
env.is_prod
|
|
43
|
+
# False
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
To get an S3 bucket name (outputted as `cloudpathlib`'s `S3Path`) (let us imagine here that the name is `emds-dev-random-name-202512161154001309058001`):
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
s3_random_name_bucket = env.get_full_bucket_url("random-name", full_prefix=True)
|
|
50
|
+
print(str(s3_random_name_bucket.bucket))
|
|
51
|
+
# emds-dev-random-name-202512161154001309058001
|
|
52
|
+
```
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
data_factory_utils/__init__.py,sha256=2DpcGqE-C0G785KJpkdZBg_em4jPuutiiIXgIVtLDvc,17
|
|
2
|
+
data_factory_utils/environment.py,sha256=hvj25smOkmy3bgfdTqNtBs_KVhVQ4kqwaCDMw7AfFkQ,11639
|
|
3
|
+
data_factory_utils-1.0.0.dist-info/WHEEL,sha256=s_zqWxHFEH8b58BCtf46hFCqPaISurdB9R1XJ8za6XI,80
|
|
4
|
+
data_factory_utils-1.0.0.dist-info/METADATA,sha256=Ezvz6ewe99Vi-1h5Jjx6Q6X0RqS4tWDSsS_vFAFmH-k,1534
|
|
5
|
+
data_factory_utils-1.0.0.dist-info/RECORD,,
|