abi_ds_utils 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: abi_ds_utils
3
+ Version: 1.2.4
4
+ Summary: Utility modules for working with spark, containers, aws and more.
5
+ Author: Martin Matousek, Ioannis Chios
6
+ License: Private
7
+ Requires-Python: >=3.8
8
+ Requires-Dist: boto3<2.0.0,>=1.21.14
9
+ Requires-Dist: pyspark==3.5.8
10
+ Requires-Dist: pyarrow<13.0.0,>=7.0.0; python_version < "3.12"
11
+ Requires-Dist: pyarrow<18.0.0,>=15.0.2; python_version >= "3.12"
@@ -0,0 +1,68 @@
1
+ # DS Utility Modules
2
+
3
+ Utility modules for working with Airflow, AWS, Spark, and hopefully more.
4
+
5
+ ## Spark
6
+ Every DevHub container has the ability to run a single-node Spark instance.
7
+ It is not ideal, but can be handy when trying to run your pipeline on subsets,
8
+ or is necessary if you want to save something in Delta format. For these purposes
9
+ you can use the `get_spark` (or `getSpark` for backwards compatibility) function.
10
+ It will get you a `SparkSession` that works inside DevHub, for our S3 etc.
11
+
12
+ ## Airflow
13
+ `write_to_xcom` can be used to pass data between two tasks.
14
+
15
+ ## AWS
16
+ Utility functions to retrieve parameters from the AWS Systems Manager (SSM) Parameter Store or secrets from AWS Secrets Manager.
17
+
18
+ # Change Log
19
+
20
+ ---
21
+
22
+ ## 1.2.4 - 2026-04-01
23
+
24
+ ---
25
+
26
+ ## Changed
27
+
28
+ - PySpark bump to 3.5.8
29
+ - Delta Lake Maven package bump from `delta-core_2.12:2.2.0` to `delta-spark_2.12:3.3.2` for Spark 3.5.x compatibility
30
+ - `pyarrow` constrained to `<13.0.0` to stay within Spark 3.5's documented support range (on Python 3.12+, `>=15.0.2,<18.0.0` is required instead)
31
+ - Fixed the S3A credentials provider config key to use the `spark.hadoop.*` namespace
32
+
33
+ ---
34
+
35
+ ## 1.2.1 - 2024-01-30
36
+
37
+ ---
38
+
39
+ ## Added
40
+
41
+ - `getGlue` and `get_glue` to create the spark session with Glue support
42
+
43
+ ---
44
+
45
+ ## 1.1.0 - 2023-06-20
46
+
47
+ ---
48
+
49
+ ## Changed
50
+
51
+ - Switched the Scala build from 2.13 to 2.12
52
+ - `get_spark` actually works now
53
+
54
+ ---
55
+
56
+ ## 1.0.1 - 2023-06-19
57
+
58
+ ---
59
+
60
+ ## Added
61
+
62
+ - `__init__.py` with `get_spark` alias
63
+
64
+ ## Changed
65
+
66
+ - PySpark bump to 3.3.2
67
+ - Delta bump to 2.13
68
+ - Removed `hurry.filesize` and `psutils` from deps
@@ -0,0 +1,14 @@
1
+ from abi_ds_utils.airflow import write_to_xcom
2
+ from abi_ds_utils.aws import get_parameter, get_secret
3
+ from abi_ds_utils.spark import getSpark, getGlue
4
+
5
# snake_case aliases for the camelCase session factories imported above.
get_spark, get_glue = getSpark, getGlue

# Names exported by `from abi_ds_utils import *`.
__all__ = [
    "get_glue",
    "get_secret",
    "get_spark",
    "get_parameter",
    "write_to_xcom",
]
@@ -0,0 +1,29 @@
1
+ import json
2
+ from typing import Dict
3
+ from pathlib import Path
4
+
5
+
6
def write_to_xcom(dict_value: Dict, xcom_path: str = '/airflow/xcom/return.json') -> Dict:
    """Merge `dict_value` into the XCOM file so it is passed to the next task.

    Any JSON object already stored in the file is loaded first and then
    updated with `dict_value`, so repeated calls accumulate keys (later
    calls win on key collisions).

    ref: https://airflow.apache.org/docs/apache-airflow-providers-cncf-kubernetes/stable/operators.html#how-does-xcom-work

    :param dict_value: JSON-serialisable mapping to merge into the file
    :param xcom_path: Location of the XCOM file; the default is the path
        this function has always written to
    :return: The merged dict that was written to the file
    :raises json.JSONDecodeError: If the existing file holds non-empty,
        invalid JSON
    :raises TypeError: If the merged data cannot be serialised to JSON; the
        file on disk is left untouched in that case
    """
    file_save = Path(xcom_path)
    file_save.parent.mkdir(parents=True, exist_ok=True)

    # Get data from the file if there is some
    dict_restored = dict()
    if file_save.is_file():
        with file_save.open('r') as fin:
            content = fin.read()
        # A zero-byte / whitespace-only file carries no data yet and must
        # not be fed to the JSON parser.
        if content.strip():
            dict_restored = json.loads(content)

    # Update dict with new data
    dict_restored.update(dict_value)

    # Serialise BEFORE opening for writing: opening with 'w' truncates the
    # file, so a serialisation error would otherwise destroy earlier data.
    payload = json.dumps(dict_restored)

    # Overwrite file with updated dict
    with file_save.open('w') as fout:
        fout.write(payload)
    return dict_restored
@@ -0,0 +1,51 @@
1
+ import json
2
+ import base64
3
+ from typing import Dict, Optional
4
+ import boto3
5
+ from botocore.exceptions import ClientError
6
+
7
+
8
def get_secret(secret_name: str, region_name: str) -> Dict:
    """Fetch a secret from AWS Secrets Manager and parse it as JSON.

    :param secret_name: Identifier passed as `SecretId` to `get_secret_value`
    :param region_name: AWS region used to create the Secrets Manager client
    :return: The secret payload decoded with `json.loads`
    :raises ClientError: Propagated unchanged from the `get_secret_value` call
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    try:
        response = secrets_client.get_secret_value(SecretId=secret_name)
    except ClientError:
        # No error-specific handling here; the caller sees the original error.
        raise

    # The response carries either a text payload or a base64-encoded binary
    # payload, depending on how the secret was stored.
    if 'SecretString' in response:
        raw_secret = response['SecretString']
    else:
        raw_secret = base64.b64decode(response['SecretBinary'])
    return json.loads(raw_secret)
33
+
34
+
35
def get_parameter(name: str, region: Optional[str] = None) -> Optional[str]:
    """Read a (decrypted) parameter value from the SSM Parameter Store.

    :param name: Name of the parameter
    :param region: AWS region for the client; when omitted or empty the
        client is created without an explicit region (per the original
        docs this falls back to env AWS_DEFAULT_REGION)
    :return: The parameter's `Value`, or None if the response has none
    :raises ClientError: Propagated unchanged from the `get_parameter` call
    """
    # Only pass a region when the caller actually supplied one.
    client_kwargs = {"region_name": region} if region else {}
    ssm_client = boto3.client("ssm", **client_kwargs)

    try:
        response = ssm_client.get_parameter(Name=name, WithDecryption=True)
    except ClientError:
        raise
    return response.get('Parameter', {}).get('Value')
@@ -0,0 +1,62 @@
1
+ import os
2
+ from pyspark.sql import SparkSession
3
+
4
+
5
def getSpark(driver_memory: str = "21g") -> SparkSession:
    """Create (or return the existing) local SparkSession with Delta and S3A set up.

    The session runs with master `local[*]`, pulls the `hadoop-aws` and
    Delta Lake packages via `spark.jars.packages`, and configures S3A to use
    the default AWS credentials provider chain.

    :param driver_memory: Value for `spark.driver.memory` (e.g. "21g")
    :return: The session returned by `SparkSession.builder.getOrCreate()`
    """
    builder = (
        SparkSession.builder
        # General
        .master('local[*]')
        .config("spark.driver.maxResultSize", 0)

        # Driver memory is chosen by the caller via `driver_memory`
        .config("spark.driver.memory", driver_memory)
        .config("spark.dynamicAllocation.enabled", "true")

        # PyArrow for dtypes conversions
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")

        # Spark 3.5.x requires a Delta 3.x line; Delta 2.2 only supports Spark 3.3.x.
        # hadoop-aws is pinned to 3.3.4 so it matches the Hadoop client
        # libraries bundled with Spark 3.5.x (3.3.2 was the Spark 3.3.x value).
        .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-spark_2.12:3.3.2')

        # Delta Lake setup
        .config("spark.hadoop.fs.s3a.connection.maximum", 128)
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")

        # S3A credentials: resolve through the default AWS provider chain
        .config(
            "spark.hadoop.fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
        )
    )
    return builder.getOrCreate()
34
+
35
+
36
def getGlue(driver_memory: str = "21g") -> SparkSession:
    """Create (or return the existing) SparkSession with Delta and Glue catalog support.

    Unlike `getSpark`, no master and no `spark.jars.packages` are set here.
    Hive support is enabled and the Hive metastore client factory is pointed
    at the AWS Glue Data Catalog implementation.

    The AWS region and Glue catalog id are read from the environment
    variables `AWS_DEFAULT_REGION` and `GLUE_CATALOG_ID`; each setting is
    applied only when its variable is set to a non-empty value.

    :param driver_memory: Value for `spark.driver.memory` (e.g. "21g")
    :return: The session returned by `SparkSession.builder.getOrCreate()`
    """
    builder = (
        SparkSession.builder
        # General
        .config("spark.driver.maxResultSize", 0)

        # Driver memory is chosen by the caller via `driver_memory`
        .config("spark.driver.memory", driver_memory)
        .config("spark.dynamicAllocation.enabled", "true")

        # PyArrow for dtypes conversions
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")

        # Delta Lake setup
        .config("spark.hadoop.fs.s3a.connection.maximum", 128)
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")

        # Glue setup
        .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
        .enableHiveSupport()
    )

    # `os.environ.get` yields None for unset variables; skip those keys
    # instead of passing None on as a Spark config value.
    env_backed_settings = (
        ("aws.region", 'AWS_DEFAULT_REGION'),
        ("hive.metastore.glue.catalogid", 'GLUE_CATALOG_ID'),
    )
    for conf_key, env_name in env_backed_settings:
        env_value = os.environ.get(env_name)
        if env_value:
            builder = builder.config(conf_key, env_value)

    return builder.getOrCreate()
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: abi_ds_utils
3
+ Version: 1.2.4
4
+ Summary: Utility modules for working with spark, containers, aws and more.
5
+ Author: Martin Matousek, Ioannis Chios
6
+ License: Private
7
+ Requires-Python: >=3.8
8
+ Requires-Dist: boto3<2.0.0,>=1.21.14
9
+ Requires-Dist: pyspark==3.5.8
10
+ Requires-Dist: pyarrow<13.0.0,>=7.0.0; python_version < "3.12"
11
+ Requires-Dist: pyarrow<18.0.0,>=15.0.2; python_version >= "3.12"
@@ -0,0 +1,11 @@
1
+ README.md
2
+ pyproject.toml
3
+ abi_ds_utils/__init__.py
4
+ abi_ds_utils/airflow.py
5
+ abi_ds_utils/aws.py
6
+ abi_ds_utils/spark.py
7
+ abi_ds_utils.egg-info/PKG-INFO
8
+ abi_ds_utils.egg-info/SOURCES.txt
9
+ abi_ds_utils.egg-info/dependency_links.txt
10
+ abi_ds_utils.egg-info/requires.txt
11
+ abi_ds_utils.egg-info/top_level.txt
@@ -0,0 +1,8 @@
1
+ boto3<2.0.0,>=1.21.14
2
+ pyspark==3.5.8
3
+
4
+ [:python_version < "3.12"]
5
+ pyarrow<13.0.0,>=7.0.0
6
+
7
+ [:python_version >= "3.12"]
8
+ pyarrow<18.0.0,>=15.0.2
@@ -0,0 +1 @@
1
+ abi_ds_utils
@@ -0,0 +1,25 @@
1
+ [project]
2
+ name = "abi_ds_utils"
3
+ version = "1.2.4"
4
+ description = "Utility modules for working with spark, containers, aws and more."
5
+ authors = [{ name = "Martin Matousek, Ioannis Chios" }]
6
+ license = { text = "Private" }
7
+ requires-python = ">=3.8"
8
+ dependencies = [
9
+ "boto3>=1.21.14,<2.0.0",
10
+ "pyspark==3.5.8",
11
+ "pyarrow>=7.0.0,<13.0.0; python_version < '3.12'",
12
+ "pyarrow>=15.0.2,<18.0.0; python_version >= '3.12'",
13
+ ]
14
+
15
+ [dependency-groups]
16
+ dev = [
17
+ "pre-commit>=2.17.0,<3.0.0",
18
+ ]
19
+
20
+ [tool.setuptools]
21
+ packages = ["abi_ds_utils"]
22
+
23
+ [build-system]
24
+ requires = ["setuptools>=68"]
25
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+