abi_ds_utils 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abi_ds_utils-1.2.4/PKG-INFO +11 -0
- abi_ds_utils-1.2.4/README.md +68 -0
- abi_ds_utils-1.2.4/abi_ds_utils/__init__.py +14 -0
- abi_ds_utils-1.2.4/abi_ds_utils/airflow.py +29 -0
- abi_ds_utils-1.2.4/abi_ds_utils/aws.py +51 -0
- abi_ds_utils-1.2.4/abi_ds_utils/spark.py +62 -0
- abi_ds_utils-1.2.4/abi_ds_utils.egg-info/PKG-INFO +11 -0
- abi_ds_utils-1.2.4/abi_ds_utils.egg-info/SOURCES.txt +11 -0
- abi_ds_utils-1.2.4/abi_ds_utils.egg-info/dependency_links.txt +1 -0
- abi_ds_utils-1.2.4/abi_ds_utils.egg-info/requires.txt +8 -0
- abi_ds_utils-1.2.4/abi_ds_utils.egg-info/top_level.txt +1 -0
- abi_ds_utils-1.2.4/pyproject.toml +25 -0
- abi_ds_utils-1.2.4/setup.cfg +4 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: abi_ds_utils
|
|
3
|
+
Version: 1.2.4
|
|
4
|
+
Summary: Utility modules for working with spark, containers, aws and more.
|
|
5
|
+
Author: Martin Matousek, Ioannis Chios
|
|
6
|
+
License: Private
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Requires-Dist: boto3<2.0.0,>=1.21.14
|
|
9
|
+
Requires-Dist: pyspark==3.5.8
|
|
10
|
+
Requires-Dist: pyarrow<13.0.0,>=7.0.0; python_version < "3.12"
|
|
11
|
+
Requires-Dist: pyarrow<18.0.0,>=15.0.2; python_version >= "3.12"
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# DS Utility Modules
|
|
2
|
+
|
|
3
|
+
Utility modules for working with Airflow, AWS, Spark, and hopefully more.
|
|
4
|
+
|
|
5
|
+
## Spark
|
|
6
|
+
Every DevHub container has the ability to run a single-node Spark instance.
|
|
7
|
+
It is not ideal, but can be handy when trying to run your pipeline on subsets,
|
|
8
|
+
or is necessary if you want to save something in Delta format. For these purposes
|
|
9
|
+
you can use the `get_spark` (or `getSpark` for backwards compatibility) function.
|
|
10
|
+
It returns a `SparkSession` configured to work inside DevHub (S3 access, Delta Lake support, etc.).
|
|
11
|
+
|
|
12
|
+
## Airflow
|
|
13
|
+
`write_to_xcom` can be used to pass data between two tasks.
|
|
14
|
+
|
|
15
|
+
## AWS
|
|
16
|
+
Utility functions to retrieve parameters from the SSM Parameter Store or secrets from AWS Secrets Manager.
|
|
17
|
+
|
|
18
|
+
# Change Log
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## 1.2.4 - 2026-04-01
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Changed
|
|
27
|
+
|
|
28
|
+
- PySpark bump to 3.5.8
|
|
29
|
+
- Delta Lake Maven package bump from `delta-core_2.12:2.2.0` to `delta-spark_2.12:3.3.2` for Spark 3.5.x compatibility
|
|
30
|
+
- `pyarrow` constrained to `<13.0.0` on Python < 3.12 to stay within Spark 3.5's documented support range (`>=15.0.2,<18.0.0` on Python 3.12+)
|
|
31
|
+
- Fixed the S3A credentials provider config key to use the `spark.hadoop.*` namespace
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 1.2.1 - 2024-01-30
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Added
|
|
40
|
+
|
|
41
|
+
- `getGlue` and `get_glue` to create the spark session with Glue support
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## 1.1.0 - 2023-06-20
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Changed
|
|
50
|
+
|
|
51
|
+
- Scala version changed from 2.13 to 2.12
|
|
52
|
+
- `get_spark` actually works now
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## 1.0.1 - 2023-06-19
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Added
|
|
61
|
+
|
|
62
|
+
- `__init__.py` with `get_spark` alias
|
|
63
|
+
|
|
64
|
+
## Changed
|
|
65
|
+
|
|
66
|
+
- PySpark bump to 3.3.2
|
|
67
|
+
- Delta bump to 2.13
|
|
68
|
+
- Removed `hurry.filesize` and `psutils` from deps
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from abi_ds_utils.airflow import write_to_xcom
|
|
2
|
+
from abi_ds_utils.aws import get_parameter, get_secret
|
|
3
|
+
from abi_ds_utils.spark import getSpark, getGlue
|
|
4
|
+
|
|
5
|
+
# snake_case aliases for the camelCase session factories imported above.
get_spark, get_glue = getSpark, getGlue

# Names exported by `from abi_ds_utils import *`.
__all__ = ["get_glue", "get_secret", "get_spark", "get_parameter", "write_to_xcom"]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Dict
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def write_to_xcom(
    dict_value: Dict,
    xcom_path: Path = Path('/airflow/xcom/return.json'),
) -> Dict:
    """Merge `dict_value` into the XCom return file and return the merged dict.

    If the file already exists, its JSON content is loaded first and updated
    with `dict_value` (keys in `dict_value` win), so several calls within one
    task accumulate into a single JSON object. The parent directory is created
    when missing.

    ref: https://airflow.apache.org/docs/apache-airflow-providers-cncf-kubernetes/stable/operators.html#how-does-xcom-work

    :param dict_value: JSON-serialisable mapping to add to the XCom payload
    :param xcom_path: Location of the XCom file (a `Path` or a string);
        defaults to the fixed path `/airflow/xcom/return.json`
    :return: The merged dictionary that was written to the file
    :raises json.JSONDecodeError: If the existing file is not valid JSON
    :raises TypeError: If the merged data cannot be serialised to JSON
    """
    file_save = Path(xcom_path)
    file_save.parent.mkdir(parents=True, exist_ok=True)

    # Start from whatever an earlier call already stored, if anything.
    if file_save.is_file():
        # Explicit UTF-8 so reading does not depend on the container's locale.
        with file_save.open('r', encoding='utf-8') as fin:
            dict_restored = json.load(fin)
    else:
        dict_restored = {}

    # New values override previously stored ones with the same key.
    dict_restored.update(dict_value)

    # Overwrite the file with the merged payload.
    with file_save.open('w', encoding='utf-8') as fout:
        json.dump(dict_restored, fout)
    return dict_restored
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import base64
|
|
3
|
+
from typing import Dict, Optional
|
|
4
|
+
import boto3
|
|
5
|
+
from botocore.exceptions import ClientError
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_secret(secret_name: str, region_name: str) -> Dict:
    """Fetch a secret from AWS Secrets Manager and parse it as JSON.

    See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html

    :param secret_name: Name or ARN of the secret (passed as `SecretId`)
    :param region_name: AWS region of the Secrets Manager endpoint
    :return: The secret payload decoded with `json.loads`
    :raises botocore.exceptions.ClientError: Propagated unchanged when the
        `GetSecretValue` call fails
    :raises json.JSONDecodeError: If the secret payload is not valid JSON
    """
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    # Any ClientError raised here is deliberately left to propagate to the caller.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    # Depending on whether the secret is a string or binary, one of these
    # fields will be populated.
    if 'SecretString' in get_secret_value_response:
        secret = get_secret_value_response['SecretString']
    else:
        # NOTE(review): boto3 may already hand back SecretBinary as decoded
        # bytes - confirm that this additional base64 decode is intended.
        secret = base64.b64decode(get_secret_value_response['SecretBinary'])
    return json.loads(secret)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_parameter(name: str, region: Optional[str] = None) -> Optional[str]:
    """Retrieve a (decrypted) parameter value from the SSM Parameter Store.

    :param name: Name of the parameter
    :param region: AWS region; when omitted or empty, boto3's default region
        resolution applies (e.g. env AWS_DEFAULT_REGION)
    :return: Value of the parameter, or None if the response carries no value
    :raises botocore.exceptions.ClientError: Propagated unchanged when the
        `GetParameter` call fails
    """
    # `region_name=None` is boto3's default, so a missing or empty region
    # behaves exactly like not passing the argument at all.
    client = boto3.client("ssm", region_name=region or None)

    # Any ClientError raised here is deliberately left to propagate to the caller.
    parameter = client.get_parameter(Name=name, WithDecryption=True)
    return parameter.get('Parameter', {}).get('Value')
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pyspark.sql import SparkSession
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def getSpark(driver_memory: str = "21g") -> SparkSession:
    """Return a local `SparkSession` set up for Delta Lake and S3A access.

    The session runs with master `local[*]`; an already running session is
    reused by `getOrCreate()`.

    :param driver_memory: Value for `spark.driver.memory` (e.g. "21g")
    :return: The created or existing `SparkSession`
    """
    # Options are applied in insertion order on top of the local master.
    options = {
        # General
        "spark.driver.maxResultSize": 0,
        # Driver heap size is supplied by the caller.
        "spark.driver.memory": driver_memory,
        "spark.dynamicAllocation.enabled": "true",
        # PyArrow for dtypes conversions
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        # Spark 3.5.x requires a Delta 3.x line; Delta 2.2 only supports Spark 3.3.x.
        # NOTE(review): confirm hadoop-aws 3.3.2 matches the Hadoop version
        # shipped with the installed PySpark.
        'spark.jars.packages': 'org.apache.hadoop:hadoop-aws:3.3.2,io.delta:delta-spark_2.12:3.3.2',
        # Delta Lake setup
        "spark.hadoop.fs.s3a.connection.maximum": 128,
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.delta.logStore.class": "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore",
        # S3A credentials are resolved through the default AWS provider chain.
        "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    }

    builder = SparkSession.builder.master('local[*]')
    for option_name, option_value in options.items():
        builder = builder.config(option_name, option_value)
    return builder.getOrCreate()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def getGlue(driver_memory: str = "21g") -> SparkSession:
    """Return a Hive-enabled `SparkSession` wired to the AWS Glue Data Catalog.

    No master is set by this function. The AWS region and the Glue catalog id
    are read from the `AWS_DEFAULT_REGION` and `GLUE_CATALOG_ID` environment
    variables at call time; an already running session is reused by
    `getOrCreate()`.

    :param driver_memory: Value for `spark.driver.memory` (e.g. "21g")
    :return: The created or existing `SparkSession`
    """
    # Options are applied in insertion order before Hive support is enabled.
    options = {
        # General
        "spark.driver.maxResultSize": 0,
        # Driver heap size is supplied by the caller.
        "spark.driver.memory": driver_memory,
        "spark.dynamicAllocation.enabled": "true",
        # PyArrow for dtypes conversions
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        # Delta Lake setup
        "spark.hadoop.fs.s3a.connection.maximum": 128,
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.delta.logStore.class": "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore",
        # Glue setup
        "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
        "aws.region": os.environ.get('AWS_DEFAULT_REGION'),
        "hive.metastore.glue.catalogid": os.environ.get('GLUE_CATALOG_ID'),
    }

    builder = SparkSession.builder
    for option_name, option_value in options.items():
        builder = builder.config(option_name, option_value)
    builder = builder.enableHiveSupport()

    return builder.getOrCreate()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: abi_ds_utils
|
|
3
|
+
Version: 1.2.4
|
|
4
|
+
Summary: Utility modules for working with spark, containers, aws and more.
|
|
5
|
+
Author: Martin Matousek, Ioannis Chios
|
|
6
|
+
License: Private
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Requires-Dist: boto3<2.0.0,>=1.21.14
|
|
9
|
+
Requires-Dist: pyspark==3.5.8
|
|
10
|
+
Requires-Dist: pyarrow<13.0.0,>=7.0.0; python_version < "3.12"
|
|
11
|
+
Requires-Dist: pyarrow<18.0.0,>=15.0.2; python_version >= "3.12"
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
abi_ds_utils/__init__.py
|
|
4
|
+
abi_ds_utils/airflow.py
|
|
5
|
+
abi_ds_utils/aws.py
|
|
6
|
+
abi_ds_utils/spark.py
|
|
7
|
+
abi_ds_utils.egg-info/PKG-INFO
|
|
8
|
+
abi_ds_utils.egg-info/SOURCES.txt
|
|
9
|
+
abi_ds_utils.egg-info/dependency_links.txt
|
|
10
|
+
abi_ds_utils.egg-info/requires.txt
|
|
11
|
+
abi_ds_utils.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
abi_ds_utils
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "abi_ds_utils"
|
|
3
|
+
version = "1.2.4"
|
|
4
|
+
description = "Utility modules for working with spark, containers, aws and more."
|
|
5
|
+
authors = [{ name = "Martin Matousek, Ioannis Chios" }]
|
|
6
|
+
license = { text = "Private" }
|
|
7
|
+
requires-python = ">=3.8"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"boto3>=1.21.14,<2.0.0",
|
|
10
|
+
"pyspark==3.5.8",
|
|
11
|
+
"pyarrow>=7.0.0,<13.0.0; python_version < '3.12'",
|
|
12
|
+
"pyarrow>=15.0.2,<18.0.0; python_version >= '3.12'",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[dependency-groups]
|
|
16
|
+
dev = [
|
|
17
|
+
"pre-commit>=2.17.0,<3.0.0",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[tool.setuptools]
|
|
21
|
+
packages = ["abi_ds_utils"]
|
|
22
|
+
|
|
23
|
+
[build-system]
|
|
24
|
+
requires = ["setuptools>=68"]
|
|
25
|
+
build-backend = "setuptools.build_meta"
|