abi_ds_utils 1.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abi_ds_utils/__init__.py +14 -0
- abi_ds_utils/airflow.py +29 -0
- abi_ds_utils/aws.py +51 -0
- abi_ds_utils/spark.py +62 -0
- abi_ds_utils-1.2.4.dist-info/METADATA +11 -0
- abi_ds_utils-1.2.4.dist-info/RECORD +8 -0
- abi_ds_utils-1.2.4.dist-info/WHEEL +5 -0
- abi_ds_utils-1.2.4.dist-info/top_level.txt +1 -0
abi_ds_utils/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from abi_ds_utils.airflow import write_to_xcom
|
|
2
|
+
from abi_ds_utils.aws import get_parameter, get_secret
|
|
3
|
+
from abi_ds_utils.spark import getSpark, getGlue
|
|
4
|
+
|
|
5
|
+
# snake_case aliases for the camelCase session factories imported above
get_spark = getSpark
get_glue = getGlue

# Names exported by `from abi_ds_utils import *`
__all__ = [
    "get_glue",
    "get_secret",
    "get_spark",
    "get_parameter",
    "write_to_xcom",
]
|
abi_ds_utils/airflow.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Dict
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def write_to_xcom(dict_value: Dict, xcom_path: str = '/airflow/xcom/return.json') -> Dict:
    """Merge `dict_value` into the XCom return file and return the merged dict.

    If the file at `xcom_path` already exists, its JSON content is loaded
    first and then updated with `dict_value` (values from `dict_value` win on
    key collisions). The merged dict is written back, overwriting the file.

    ref: https://airflow.apache.org/docs/apache-airflow-providers-cncf-kubernetes/stable/operators.html#how-does-xcom-work

    :param dict_value: JSON-serializable mapping to add to the XCom payload
    :param xcom_path: Location of the XCom JSON file; defaults to
        `/airflow/xcom/return.json`
    :return: The full dict that was written to the file
    """
    file_save = Path(xcom_path)
    file_save.parent.mkdir(parents=True, exist_ok=True)

    # Start from whatever an earlier call stored, if anything
    dict_restored = {}
    if file_save.is_file():
        with file_save.open('r') as fin:
            stored = fin.read()
        # An existing but empty file (e.g. left by an interrupted write) is
        # treated as "no data yet" instead of failing JSON parsing
        if stored.strip():
            dict_restored = json.loads(stored)

    # Update dict with new data
    dict_restored.update(dict_value)

    # Overwrite file with updated dict
    with file_save.open('w') as fout:
        json.dump(dict_restored, fout)
    return dict_restored
|
abi_ds_utils/aws.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import base64
|
|
3
|
+
from typing import Dict, Optional
|
|
4
|
+
import boto3
|
|
5
|
+
from botocore.exceptions import ClientError
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_secret(secret_name: str, region_name: str) -> Dict:
    """Fetch a secret from AWS Secrets Manager and parse it as JSON.

    :param secret_name: Identifier passed as `SecretId` to `get_secret_value`
    :param region_name: AWS region used to create the Secrets Manager client
    :return: The secret payload decoded with `json.loads`
    :raises botocore.exceptions.ClientError: Propagated unchanged when the
        `get_secret_value` call fails
    """
    # Create a Secrets Manager client
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    # No try/except: a ClientError from the API call reaches the caller as-is.
    response = client.get_secret_value(SecretId=secret_name)

    # Depending on whether the secret is a string or binary, one of these fields will be populated.
    if 'SecretString' in response:
        secret = response['SecretString']
    else:
        # NOTE(review): confirm the SDK does not already return decoded bytes
        # here, in which case this extra base64 decode would be wrong.
        secret = base64.b64decode(response['SecretBinary'])
    return json.loads(secret)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_parameter(name: str, region: Optional[str] = None) -> Optional[str]:
    """Retrieve secret from Parameter Store.

    :param name: Name of the parameter
    :param region: AWS region; when omitted or empty, no region is passed and
        boto3 resolves it itself (e.g. from env AWS_DEFAULT_REGION)
    :return: Value of parameter, or None when the response carries no
        `Parameter`/`Value` entry
    :raises botocore.exceptions.ClientError: Propagated unchanged when the
        `get_parameter` call fails
    """
    # A falsy region (None or "") is normalised to None, which is the same as
    # not passing `region_name` at all.
    client = boto3.client("ssm", region_name=region or None)

    # No try/except: a ClientError from the API call reaches the caller as-is.
    parameter = client.get_parameter(Name=name, WithDecryption=True)
    return parameter.get('Parameter', {}).get('Value')
|
abi_ds_utils/spark.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pyspark.sql import SparkSession
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def getSpark(driver_memory: str = "21g") -> SparkSession:
    """Build (or reuse) a `local[*]` SparkSession with Delta Lake and S3A options.

    :param driver_memory: Value assigned to `spark.driver.memory`
    :return: The session returned by the builder's `getOrCreate()`
    """
    settings = {
        # General
        "spark.driver.maxResultSize": 0,
        "spark.driver.memory": driver_memory,
        "spark.dynamicAllocation.enabled": "true",

        # PyArrow for dtypes conversions
        "spark.sql.execution.arrow.pyspark.enabled": "true",

        # Spark 3.5.x requires a Delta 3.x line; Delta 2.2 only supports Spark 3.3.x.
        "spark.jars.packages": 'org.apache.hadoop:hadoop-aws:3.3.2,io.delta:delta-spark_2.12:3.3.2',

        # Delta Lake setup
        "spark.hadoop.fs.s3a.connection.maximum": 128,
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.delta.logStore.class": "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore",

        # S3A credentials lookup
        "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    }

    builder = SparkSession.builder.master('local[*]')
    for option, value in settings.items():
        builder = builder.config(option, value)
    return builder.getOrCreate()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def getGlue(driver_memory: str = "21g") -> SparkSession:
    """Build (or reuse) a Hive-enabled SparkSession wired to the AWS Glue catalog.

    The Glue region and catalog id are read from the environment variables
    `AWS_DEFAULT_REGION` and `GLUE_CATALOG_ID`; each one is only applied when
    the variable is defined.

    :param driver_memory: Value assigned to `spark.driver.memory`
    :return: The session returned by the builder's `getOrCreate()`
    """
    builder = (
        SparkSession.builder
        # General
        .config("spark.driver.maxResultSize", 0)
        .config("spark.driver.memory", driver_memory)
        .config("spark.dynamicAllocation.enabled", "true")

        # PyArrow for dtypes conversions
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")

        # Delta Lake setup
        .config("spark.hadoop.fs.s3a.connection.maximum", 128)
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")

        # Glue setup
        .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
    )

    # `os.environ.get` yields None for an unset variable. Skip those options
    # instead of handing None to `config`.
    # NOTE(review): presumably a None option does not mean "use the default"
    # in the deployed PySpark version - confirm.
    env_settings = {
        "aws.region": os.environ.get('AWS_DEFAULT_REGION'),
        "hive.metastore.glue.catalogid": os.environ.get('GLUE_CATALOG_ID'),
    }
    for option, value in env_settings.items():
        if value is not None:
            builder = builder.config(option, value)

    return builder.enableHiveSupport().getOrCreate()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: abi_ds_utils
|
|
3
|
+
Version: 1.2.4
|
|
4
|
+
Summary: Utility modules for working with spark, containers, aws and more.
|
|
5
|
+
Author: Martin Matousek, Ioannis Chios
|
|
6
|
+
License: Private
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Requires-Dist: boto3<2.0.0,>=1.21.14
|
|
9
|
+
Requires-Dist: pyspark==3.5.8
|
|
10
|
+
Requires-Dist: pyarrow<13.0.0,>=7.0.0; python_version < "3.12"
|
|
11
|
+
Requires-Dist: pyarrow<18.0.0,>=15.0.2; python_version >= "3.12"
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
abi_ds_utils/__init__.py,sha256=uXMLKzJ-mkzoylygp1eS00zt8OV7jeTeab1EhlaPdMc,299
|
|
2
|
+
abi_ds_utils/airflow.py,sha256=8cxCGZmDl2thQO_eUGl1XB8FTaolgWv3x4s6Xou0Qxw,823
|
|
3
|
+
abi_ds_utils/aws.py,sha256=P8e0lgcmy1GDQWzyxeQYo2b5mE84bSzrAGIwodC4coo,1737
|
|
4
|
+
abi_ds_utils/spark.py,sha256=iwk68vJ8uV_Fyi8Q7DoFNT4yay3fVxa59f3GnExxl2s,2512
|
|
5
|
+
abi_ds_utils-1.2.4.dist-info/METADATA,sha256=rVogXdRb7LkFqG0jd0xuCf9ZaDm-lD4UZYHgMgXrunU,405
|
|
6
|
+
abi_ds_utils-1.2.4.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
abi_ds_utils-1.2.4.dist-info/top_level.txt,sha256=SsHKDlGfPNApOYPYVuSKuXkHYHD697V8A4X865QjwVo,13
|
|
8
|
+
abi_ds_utils-1.2.4.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
abi_ds_utils
|