abi_ds_utils 1.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ from abi_ds_utils.airflow import write_to_xcom
2
+ from abi_ds_utils.aws import get_parameter, get_secret
3
+ from abi_ds_utils.spark import getSpark, getGlue
4
+
5
# snake_case aliases for the camelCase session factories imported from
# abi_ds_utils.spark; both names refer to the very same function objects.
get_spark, get_glue = getSpark, getGlue

# Names exported by `from abi_ds_utils import *`.
__all__ = [
    "get_glue",
    "get_secret",
    "get_spark",
    "get_parameter",
    "write_to_xcom",
]
@@ -0,0 +1,29 @@
1
+ import json
2
+ from typing import Dict
3
+ from pathlib import Path
4
+
5
+
6
def write_to_xcom(dict_value: Dict, xcom_path: str = '/airflow/xcom/return.json') -> Dict:
    """Merge `dict_value` into the XCom return file so the next task can read it.

    Whatever JSON object is already stored in the file is loaded first and
    updated with `dict_value` (keys in `dict_value` win); the file is then
    overwritten with the merged result. Missing parent directories are created.

    ref: https://airflow.apache.org/docs/apache-airflow-providers-cncf-kubernetes/stable/operators.html#how-does-xcom-work

    :param dict_value: JSON-serialisable mapping to add to the XCom payload
    :param xcom_path: Location of the XCom file (string or path-like); defaults
        to the path used before this parameter existed
    :return: The merged dictionary that was written to the file
    :raises json.JSONDecodeError: If the existing file has non-blank content
        that is not valid JSON
    """
    file_save = Path(xcom_path)
    file_save.parent.mkdir(parents=True, exist_ok=True)

    # Start from the data already in the file, if there is some
    dict_restored = {}
    if file_save.is_file():
        content = file_save.read_text(encoding='utf-8')
        # A blank file (e.g. one that was only touched) carries no data and
        # must not make json parsing fail.
        if content.strip():
            dict_restored = json.loads(content)

    # Update dict with new data
    dict_restored.update(dict_value)

    # Overwrite file with updated dict
    with file_save.open('w', encoding='utf-8') as fout:
        json.dump(dict_restored, fout)
    return dict_restored
abi_ds_utils/aws.py ADDED
@@ -0,0 +1,51 @@
1
+ import json
2
+ import base64
3
+ from typing import Dict, Optional
4
+ import boto3
5
+ from botocore.exceptions import ClientError
6
+
7
+
8
def get_secret(secret_name: str, region_name: str) -> Dict:
    """Fetch a secret from AWS Secrets Manager and parse its payload as JSON.

    See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html

    :param secret_name: Identifier of the secret, passed as `SecretId`
    :param region_name: AWS region the Secrets Manager client is created for
    :return: Result of `json.loads` on the secret payload
        (presumably a JSON object, as the annotation says - not enforced here)
    :raises ClientError: Propagated unchanged when the `GetSecretValue` call fails
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    # No local handling: a ClientError raised here reaches the caller as-is.
    response = secrets_client.get_secret_value(SecretId=secret_name)

    # Depending on whether the secret is a string or binary, one of these
    # two fields is populated; the binary variant is base64-decoded first.
    if 'SecretString' in response:
        payload = response['SecretString']
    else:
        payload = base64.b64decode(response['SecretBinary'])
    return json.loads(payload)
33
+
34
+
35
def get_parameter(name: str, region: Optional[str] = None) -> Optional[str]:
    """Read a (decrypted) parameter value from AWS SSM Parameter Store.

    :param name: Name of the parameter
    :param region: AWS region for the SSM client; when empty or None no region
        is passed and boto3's own default region resolution applies
    :return: The parameter's `Value`, or None if the response lacks one
    :raises ClientError: Propagated unchanged when the `GetParameter` call fails
    """
    # Only forward a region to boto3 when the caller actually supplied one.
    client_kwargs = {"region_name": region} if region else {}
    ssm_client = boto3.client("ssm", **client_kwargs)

    # A ClientError raised here reaches the caller as-is.
    response = ssm_client.get_parameter(Name=name, WithDecryption=True)
    return response.get('Parameter', {}).get('Value')
abi_ds_utils/spark.py ADDED
@@ -0,0 +1,62 @@
1
+ import os
2
+ from pyspark.sql import SparkSession
3
+
4
+
5
def getSpark(driver_memory: str = "21g") -> SparkSession:
    """Return a local-mode SparkSession set up for Delta Lake tables on S3.

    :param driver_memory: Value for `spark.driver.memory` (e.g. "21g")
    :return: The session produced by `getOrCreate()` on the configured builder
    """
    options = {
        # General
        "spark.driver.maxResultSize": 0,

        # Driver memory is whatever the caller asked for (fixed default, not
        # derived from the machine's free memory)
        "spark.driver.memory": driver_memory,
        "spark.dynamicAllocation.enabled": "true",

        # PyArrow for dtypes conversions
        "spark.sql.execution.arrow.pyspark.enabled": "true",

        # Spark 3.5.x requires a Delta 3.x line; Delta 2.2 only supports Spark 3.3.x.
        'spark.jars.packages': 'org.apache.hadoop:hadoop-aws:3.3.2,io.delta:delta-spark_2.12:3.3.2',

        # Delta Lake setup
        "spark.hadoop.fs.s3a.connection.maximum": 128,
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.delta.logStore.class": "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore",

        # S3A credentials come from the default AWS provider chain
        "spark.hadoop.fs.s3a.aws.credentials.provider":
            "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    }

    builder = SparkSession.builder.master('local[*]')
    for key, value in options.items():
        builder = builder.config(key, value)
    return builder.getOrCreate()
34
+
35
+
36
def getGlue(driver_memory: str = "21g") -> SparkSession:
    """Return a Hive-enabled SparkSession that uses the AWS Glue Data Catalog.

    The region and Glue catalog id are read from the `AWS_DEFAULT_REGION` and
    `GLUE_CATALOG_ID` environment variables. A variable that is unset (or
    empty) is skipped, so the corresponding setting keeps its default instead
    of receiving a missing value.

    :param driver_memory: Value for `spark.driver.memory` (e.g. "21g")
    :return: The session produced by `getOrCreate()` on the configured builder
    """
    builder = (
        SparkSession.builder
        # General
        .config("spark.driver.maxResultSize", 0)

        # Driver memory is whatever the caller asked for (fixed default, not
        # derived from the machine's free memory)
        .config("spark.driver.memory", driver_memory)
        .config("spark.dynamicAllocation.enabled", "true")

        # PyArrow for dtypes conversions
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")

        # Delta Lake setup
        .config("spark.hadoop.fs.s3a.connection.maximum", 128)
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")

        # Glue setup
        .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
        .enableHiveSupport()
    )

    # os.environ.get() yields None for an unset variable, which is not a
    # usable config value; only forward settings that are actually present.
    env_options = {
        "aws.region": os.environ.get('AWS_DEFAULT_REGION'),
        "hive.metastore.glue.catalogid": os.environ.get('GLUE_CATALOG_ID'),
    }
    for key, value in env_options.items():
        if value:
            builder = builder.config(key, value)

    return builder.getOrCreate()
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: abi_ds_utils
3
+ Version: 1.2.4
4
+ Summary: Utility modules for working with spark, containers, aws and more.
5
+ Author: Martin Matousek, Ioannis Chios
6
+ License: Private
7
+ Requires-Python: >=3.8
8
+ Requires-Dist: boto3<2.0.0,>=1.21.14
9
+ Requires-Dist: pyspark==3.5.8
10
+ Requires-Dist: pyarrow<13.0.0,>=7.0.0; python_version < "3.12"
11
+ Requires-Dist: pyarrow<18.0.0,>=15.0.2; python_version >= "3.12"
@@ -0,0 +1,8 @@
1
+ abi_ds_utils/__init__.py,sha256=uXMLKzJ-mkzoylygp1eS00zt8OV7jeTeab1EhlaPdMc,299
2
+ abi_ds_utils/airflow.py,sha256=8cxCGZmDl2thQO_eUGl1XB8FTaolgWv3x4s6Xou0Qxw,823
3
+ abi_ds_utils/aws.py,sha256=P8e0lgcmy1GDQWzyxeQYo2b5mE84bSzrAGIwodC4coo,1737
4
+ abi_ds_utils/spark.py,sha256=iwk68vJ8uV_Fyi8Q7DoFNT4yay3fVxa59f3GnExxl2s,2512
5
+ abi_ds_utils-1.2.4.dist-info/METADATA,sha256=rVogXdRb7LkFqG0jd0xuCf9ZaDm-lD4UZYHgMgXrunU,405
6
+ abi_ds_utils-1.2.4.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ abi_ds_utils-1.2.4.dist-info/top_level.txt,sha256=SsHKDlGfPNApOYPYVuSKuXkHYHD697V8A4X865QjwVo,13
8
+ abi_ds_utils-1.2.4.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ abi_ds_utils