data-factory-utils 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ Metadata-Version: 2.3
2
+ Name: data-factory-utils
3
+ Version: 1.0.0
4
+ Summary: Utility functions for interacting with data factories.
5
+ Requires-Dist: boto3>=1.42.8
6
+ Requires-Dist: boto3-stubs>=1.42.89
7
+ Requires-Dist: botocore>=1.42.7
8
+ Requires-Dist: cloudpathlib>=0.23.0
9
+ Requires-Dist: mypy-boto3>=1.42.3
10
+ Requires-Dist: mypy-boto3-s3>=1.42.85
11
+ Requires-Dist: mypy-boto3-sts>=1.42.3
12
+ Requires-Python: >=3.12
13
+ Description-Content-Type: text/markdown
14
+
15
+ # data-factory-utils
16
+ A package for random utils for data factories.
17
+
18
+ ## Installation
19
+ This is a published package. Install using your favourite installation method.
20
+
21
+ ```bash
22
+ uv add data-factory-utils
23
+ pip install data-factory-utils
24
+ ```
25
+
26
+ ## Usage
27
+ ### Environment functions
28
+ This set of functions reads from your data factory dynamically. It should infer the environment you are in as well.
29
+
30
+ `Environment` is a singleton: no matter how many times you instantiate the class, it re-uses the values gathered the first time. To create it:
31
+ ```python
32
+ from data_factory_utils.environment import Environment
33
+ env = Environment(use_web_identity=False)
34
+ ```
35
+
36
+ To return information about the environment (if we are in development with account number 0101010101):
37
+ ```python
38
+ env.account_no
39
+ # 0101010101
40
+ env.environment_name
41
+ # dev
42
+ env.is_prod
43
+ # False
44
+ ```
45
+
46
+ To get an S3 bucket (returned as a `cloudpathlib` `S3Path`), assuming here that the bucket is named `emds-dev-random-name-202512161154001309058001`:
47
+
48
+ ```python
49
+ s3_random_name_bucket = env.get_full_bucket_url("random-name", full_prefix=True)
50
+ print(str(s3_random_name_bucket.bucket))
51
+ # emds-dev-random-name-202512161154001309058001
52
+ ```
@@ -0,0 +1,38 @@
1
+ # data-factory-utils
2
+ A package for random utils for data factories.
3
+
4
+ ## Installation
5
+ This is a published package. Install using your favourite installation method.
6
+
7
+ ```bash
8
+ uv add data-factory-utils
9
+ pip install data-factory-utils
10
+ ```
11
+
12
+ ## Usage
13
+ ### Environment functions
14
+ This set of functions reads from your data factory dynamically. It should infer the environment you are in as well.
15
+
16
+ `Environment` is a singleton: no matter how many times you instantiate the class, it re-uses the values gathered the first time. To create it:
17
+ ```python
18
+ from data_factory_utils.environment import Environment
19
+ env = Environment(use_web_identity=False)
20
+ ```
21
+
22
+ To return information about the environment (if we are in development with account number 0101010101):
23
+ ```python
24
+ env.account_no
25
+ # 0101010101
26
+ env.environment_name
27
+ # dev
28
+ env.is_prod
29
+ # False
30
+ ```
31
+
32
+ To get an S3 bucket (returned as a `cloudpathlib` `S3Path`), assuming here that the bucket is named `emds-dev-random-name-202512161154001309058001`:
33
+
34
+ ```python
35
+ s3_random_name_bucket = env.get_full_bucket_url("random-name", full_prefix=True)
36
+ print(str(s3_random_name_bucket.bucket))
37
+ # emds-dev-random-name-202512161154001309058001
38
+ ```
@@ -0,0 +1,85 @@
1
+ [project]
2
+ name = "data-factory-utils"
3
+ version = "1.0.0"
4
+ description = "Utility functions for interacting with data factories."
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "boto3>=1.42.8",
9
+ "boto3-stubs>=1.42.89",
10
+ "botocore>=1.42.7",
11
+ "cloudpathlib>=0.23.0",
12
+ "mypy-boto3>=1.42.3",
13
+ "mypy-boto3-s3>=1.42.85",
14
+ "mypy-boto3-sts>=1.42.3",
15
+ ]
16
+
17
+ [build-system]
18
+ requires = ["uv_build>=0.9.17,<0.10.0"]
19
+ build-backend = "uv_build"
20
+
21
+ [dependency-groups]
22
+ dev = [
23
+ "mypy>=1.20.1",
24
+ "prek>=0.2.21",
25
+ "ruff>=0.14.8",
26
+ "toml-cli>=0.8.2",
27
+ "ty>=0.0.1a33",
28
+ ]
29
+ test = [
30
+ "moto>=5.1.18",
31
+ "pytest>=9.0.2",
32
+ ]
33
+
34
+
35
+ [tool.ruff]
36
+ line-length = 120
37
+
38
+ [tool.bandit]
39
+ exclude_dirs = ["/tests", "/.venv"]
40
+
41
+ [tool.mypy]
42
+ strict = true
43
+ namespace_packages = false
44
+ disallow_untyped_defs = true
45
+ follow_untyped_imports = true
46
+ exclude = ["tests"]
47
+
48
+ [tool.ruff.lint]
49
+ select = ["ALL"]
50
+ # Remove warnings
51
+ ignore = ["D203", "D213", "COM812"]
52
+
53
+ [tool.ruff.lint.per-file-ignores]
54
+ "tests/**.py" = ["S101"]
55
+
56
+ [tool.semantic_release]
57
+ commit_message = "{version}\n\nAutomatically generated by python-semantic-release"
58
+ commit_parser = "conventional"
59
+ logging_use_named_masks = false
60
+ major_on_zero = false
61
+ allow_zero_version = true
62
+ no_git_verify = false
63
+ tag_format = "{version}"
64
+
65
+ [tool.semantic_release.branches.main]
66
+ match = "main"
67
+ prerelease_token = "rc"
68
+ prerelease = false
69
+
70
+ [tool.semantic_release.branches.other]
71
+ match = ".*"
72
+ prerelease_token = "rc"
73
+ prerelease = true
74
+
75
+ [tool.semantic_release.commit_parser_options]
76
+ minor_tags = ["feat"]
77
+ patch_tags = ["fix", "perf"]
78
+ other_allowed_tags = ["build", "chore", "ci", "docs", "style", "refactor", "test"]
79
+ allowed_tags = ["feat", "fix", "perf", "build", "chore", "ci", "docs", "style", "refactor", "test"]
80
+ default_bump_level = 0
81
+ parse_squash_commits = true
82
+ ignore_merge_commits = true
83
+
84
+ [tool.bandit.assert_used]
85
+ skips = ['*_test.py', '*/test_*.py']
@@ -0,0 +1 @@
1
+ """Init file."""
@@ -0,0 +1,334 @@
1
+ """Class to help with AWS environment inference and configuration."""
2
+
3
+ import logging
4
+ import os
5
+ import secrets
6
+ import string
7
+ from pathlib import Path
8
+ from typing import Self
9
+
10
+ import boto3
11
+ import botocore.exceptions
12
+ from cloudpathlib import S3Path
13
+ from mypy_boto3_s3.type_defs import BucketTypeDef
14
+ from mypy_boto3_sts.type_defs import CredentialsTypeDef
15
+
16
# NOTE(review): this configures the root logger as a side effect of importing
# the module, so it also affects applications that import this package.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every helper and method in this file.
logger = logging.getLogger(__name__)

# Characters used to build the random suffix of STS role session names.
alphabet = string.ascii_lowercase
20
+
21
+
22
def _raise_missing_token() -> None:
    """Log and raise the error for an unset web identity token variable.

    Raises
    ------
    RuntimeError
        Always; ``AWS_WEB_IDENTITY_TOKEN_FILE`` is not set.

    """
    msg = "AWS_WEB_IDENTITY_TOKEN_FILE environment variable not set."
    # No exception is being handled at this point, so log with ``error``:
    # ``exception`` would attach an empty "NoneType: None" traceback.
    logger.error(msg)
    raise RuntimeError(msg)
26
+
27
+
28
def _raise_missing_role_arn() -> None:
    """Signal that the role ARN environment variable is absent.

    Raises
    ------
    MissingEnvVarRoleArnError
        Always.

    """
    error = MissingEnvVarRoleArnError()
    raise error
30
+
31
+
32
class MissingEnvVarRoleArnError(Exception):
    """Exception for a missing role arn env var.

    Attributes
    ----------
    message : str
        Human-readable explanation; also used as the exception text.

    """

    def __init__(self) -> None:
        """Build the error with its fixed message."""
        self.message = "Add AWS_ROLE_ARN to environment variables."
        # Forward the message so str(exc), log output and tracebacks are not blank.
        super().__init__(self.message)
39
+
40
+
41
class Environment:
    """AWS environment inference and configuration.

    This class helps determine the environment (prod, preprod, test, dev),
    manage AWS credentials (via web identity or default),
    and construct environment-specific S3 bucket URLs.

    The class is a singleton: every instantiation returns the same object and
    only the first one performs the AWS lookups. Use ``clear`` to discard the
    cached instance.
    """

    _instance: Self | None = None

    def __new__(cls, *_args: object, **_kwargs: object) -> Self:
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
        self,
        job_name: str | None = "",
        bucket_prefix: str | None = "emds",
        *,
        use_web_identity: bool = False,
    ) -> None:
        """Initialize the environment context.

        Parameters
        ----------
        job_name : str, optional
            Name used when assuming a role session, by default empty string.
        bucket_prefix : str, optional
            Base prefix used to identify datahub buckets, by default "emds".
        use_web_identity : bool, optional
            Whether to build the session from web identity credentials,
            by default False (plain default boto3 session).

        """
        # __init__ runs on every ``Environment(...)`` call; skip the AWS
        # round-trips once the singleton has been populated.
        if getattr(self, "_initialized", False):
            return

        self.job_name = job_name
        self.bucket_prefix = bucket_prefix
        self.use_web_identity = use_web_identity

        self.session = self._init_session()
        self.alias = self._fetch_account_alias()
        self.account_no = self._get_account_number()
        self.bucket_list = self._list_buckets()

        self._initialized: bool = True

    def _assume_role_with_web_identity(self, session_name: str) -> CredentialsTypeDef:
        """Exchange the web identity token for temporary role credentials.

        Parameters
        ----------
        session_name : str
            Value passed to STS as ``RoleSessionName``.

        Returns
        -------
        CredentialsTypeDef
            The ``Credentials`` entry of the STS response.

        Raises
        ------
        RuntimeError
            If ``AWS_WEB_IDENTITY_TOKEN_FILE`` is not set.
        MissingEnvVarRoleArnError
            If ``AWS_ROLE_ARN`` is not set.

        """
        token_path = os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE")
        if token_path is None:
            _raise_missing_token()
        else:
            with Path(token_path).open() as f:
                web_identity_token = f.read()

        role_arn = os.environ.get("AWS_ROLE_ARN")
        if role_arn is None:
            _raise_missing_role_arn()
        else:
            role_arn_arg = role_arn

        response = boto3.client("sts").assume_role_with_web_identity(
            RoleArn=role_arn_arg,
            RoleSessionName=session_name,
            WebIdentityToken=web_identity_token,
            DurationSeconds=900,
        )
        return response["Credentials"]

    def _init_session(self) -> boto3.Session:
        """Initialise a boto3 session, optionally using web identity credentials.

        Returns
        -------
        boto3.Session
            Configured boto3 session.

        Raises
        ------
        Exception
            Any failure while obtaining web identity credentials is logged
            and re-raised; there is no fallback to the default session.

        """
        if not self.use_web_identity:
            return boto3.session.Session()

        try:
            credentials = self._assume_role_with_web_identity(f"session-{self.job_name}")
        except Exception as e:
            logger.warning("Web identity failed: %s. Not falling back to the default session.", e)
            raise

        return boto3.session.Session(
            aws_access_key_id=credentials["AccessKeyId"],
            aws_secret_access_key=credentials["SecretAccessKey"],
            aws_session_token=credentials["SessionToken"],
        )

    def _fetch_account_alias(self) -> str:
        """Fetch the AWS account alias.

        Returns
        -------
        str
            Account alias or default.

        Notes
        -----
        Falls back to 'preproduction' alias if none found.

        """
        try:
            # Use the session built in __init__ so that web identity
            # credentials, when requested, are actually the ones used.
            aliases = self.session.client("iam").list_account_aliases().get("AccountAliases", [])
        except botocore.exceptions.ClientError:
            logger.warning("Failed to fetch account alias, assuming preproduction.")
            return "electronic-monitoring-data-preproduction"
        return aliases[0] if aliases else "electronic-monitoring-data-preproduction"

    def _get_account_number(self) -> str:
        """Return the AWS account number.

        Raises
        ------
        RuntimeError
            If no AWS credentials can be found.

        """
        try:
            return self.session.client("sts").get_caller_identity()["Account"]
        except botocore.exceptions.NoCredentialsError:
            msg = "AWS credentials not found."
            logger.exception(msg)
            raise RuntimeError(msg) from None

    def _list_buckets(self) -> list[BucketTypeDef]:
        """List all available S3 buckets; failures are logged and re-raised."""
        try:
            return self.session.client("s3").list_buckets()["Buckets"]
        except Exception as e:
            logger.warning("Could not list buckets: %s", e)
            raise

    @property
    def account_number(self) -> str:
        """Return the AWS account number (alias of ``account_no``)."""
        return self.account_no

    @property
    def environment_name(self) -> str:
        """Infer environment name from account alias.

        Returns
        -------
        str
            One of: prod, dev, preprod, test, or fallback to raw alias suffix.

        """
        full_env_name = self.alias.split("-")[-1]
        return {
            "production": "prod",
            "development": "dev",
            "preproduction": "preprod",
            "test": "test",
        }.get(full_env_name, full_env_name)

    @property
    def is_prod(self) -> bool:
        """Check if the environment is production."""
        return self.environment_name == "prod"

    def get_full_bucket_url(
        self,
        bucket_prefix: str | None = None,
        *,
        full_prefix: bool,
    ) -> S3Path | None:
        """Get S3Path to bucket matching environment and prefix.

        The name searched for is
        ``<self.bucket_prefix>-<environment_name>-<bucket_prefix>``; the first
        bucket in ``bucket_list`` that matches is returned.

        Parameters
        ----------
        bucket_prefix : str, optional
            Prefix to search for (overrides default prefix).
        full_prefix : bool
            Required keyword. If True, the bucket name, or the bucket name
            without its last "-"-separated segment, must equal the searched
            name. If False, the searched name only has to occur in the
            bucket name.

        Returns
        -------
        Optional[S3Path]
            S3Path to the matched bucket or None if not found.

        """
        search_prefix = bucket_prefix or self.bucket_prefix
        expected_name = f"{self.bucket_prefix}-{self.environment_name}-{search_prefix}"

        for bucket in self.bucket_list:
            bucket_name = bucket["Name"]
            if full_prefix:
                # Accept the exact name, or the name minus its final segment.
                matched = expected_name in (bucket_name, bucket_name.rpartition("-")[0])
            else:
                matched = expected_name in bucket_name
            if matched:
                return S3Path(f"s3://{bucket_name}")
        return None

    def get_api_invoke_url(self, api_name: str, region: str) -> str:
        """Get API invoke url from env.

        Parameters
        ----------
        api_name : str
            Exact name of the REST API to look up.
        region : str
            AWS region queried and used in the returned URL.

        Returns
        -------
        str
            ``https://<api id>.execute-api.<region>.amazonaws.com/``.

        Raises
        ------
        ValueError
            If zero or more than one REST API carries ``api_name``.

        """
        # The default credential chain is kept here: a web identity session
        # is requested for 900 seconds only, and this method may be called
        # long after the instance was created.
        client = boto3.client("apigateway", region_name=region)
        rest_api_response = client.get_rest_apis()
        matches = [it for it in rest_api_response["items"] if it["name"] == api_name]
        if len(matches) != 1:
            msg = f"Expected exactly one REST API named {api_name!r} in {region}, found {len(matches)}."
            raise ValueError(msg)
        api_details = matches[0]
        return f"https://{api_details['id']}.execute-api.{region}.amazonaws.com/"

    def refresh_credentials(self) -> CredentialsTypeDef:
        """Refresh credentials via STS.

        Web identity is tried first; on any failure the error is logged and
        ``get_session_token`` is used instead.

        Returns
        -------
        dict
            New credentials.

        """
        # A random suffix gives every refresh its own role session name.
        rand_suffix = "".join(secrets.choice(alphabet) for _ in range(10))
        try:
            return self._assume_role_with_web_identity(f"session-{self.job_name}-{rand_suffix}")
        except Exception:
            logger.exception("Web identity failed. Falling back to get_session_token.")

        response_session_token = boto3.client("sts").get_session_token(DurationSeconds=900)
        return response_session_token["Credentials"]

    def export_dbt_variables(self, *, actions: bool = False, airflow: bool = False) -> None:
        """Export dbt variables for the environment.

        Writes ``set_env.sh`` in the current working directory, one variable
        per line.

        Parameters
        ----------
        actions : bool, optional
            If True, emit ``echo "NAME=value" >> $GITHUB_ENV`` lines instead
            of shell ``export`` statements, by default False.
        airflow : bool, optional
            If True, omit ``DBT_PROFILES_DIR`` from the ``export`` form,
            by default False.

        """
        # NOTE(review): when no "cadt" bucket is found the value below is the
        # string "None" - confirm whether that should be an error instead.
        s3_data_bucket_name = self.get_full_bucket_url("cadt", full_prefix=True)
        dbt_suffix = "" if self.is_prod else f"_{self.environment_name}_dbt"
        dbt_test_profile_workgroup = f"{self.account_number}-default"
        h3_lambda_arn = f"arn:aws:lambda:eu-west-2:{self.account_no}:function:h3-udf"

        if actions:
            # GITHUB_ENV expects plain NAME=value lines, one per variable.
            lines = [
                f'echo "DBT_SUFFIX={dbt_suffix}" >> $GITHUB_ENV\n',
                f'echo "S3_DATA_BUCKET_NAME={s3_data_bucket_name}" >> $GITHUB_ENV\n',
                f'echo "DBT_TEST_PROFILE_WORKGROUP={dbt_test_profile_workgroup}" >> $GITHUB_ENV\n',
                f'echo "H3_LAMBDA_ARN={h3_lambda_arn}" >> $GITHUB_ENV\n',
            ]
        else:
            lines = [
                f"export DBT_SUFFIX='{dbt_suffix}'\n",
                f"export S3_DATA_BUCKET_NAME='{s3_data_bucket_name}'\n",
                f"export DBT_TEST_PROFILE_WORKGROUP='{dbt_test_profile_workgroup}'\n",
            ]
            if not airflow:
                lines.append('export DBT_PROFILES_DIR="../.dbt/"\n')
            # Every statement ends with a newline so that consecutive exports
            # can never run together on one line.
            lines.append(f"export H3_LAMBDA_ARN='{h3_lambda_arn}'\n")

        with Path("set_env.sh").open("w") as f:
            f.writelines(lines)

    @classmethod
    def clear(cls) -> None:
        """Reset the singleton instance.

        Use this to force restart of the class. Mainly for testing.
        """
        cls._instance = None

    @classmethod
    def instance(cls) -> "Environment | None":
        """Return the current singleton instance, if any. Mainly for testing."""
        return cls._instance