PyPI - abi_ds_utils - Versions diffs - 1.2.4__tar.gz - Mend

abi_ds_utils 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

abi_ds_utils-1.2.4/PKG-INFO +11 -0
abi_ds_utils-1.2.4/README.md +68 -0
abi_ds_utils-1.2.4/abi_ds_utils/__init__.py +14 -0
abi_ds_utils-1.2.4/abi_ds_utils/airflow.py +29 -0
abi_ds_utils-1.2.4/abi_ds_utils/aws.py +51 -0
abi_ds_utils-1.2.4/abi_ds_utils/spark.py +62 -0
abi_ds_utils-1.2.4/abi_ds_utils.egg-info/PKG-INFO +11 -0
abi_ds_utils-1.2.4/abi_ds_utils.egg-info/SOURCES.txt +11 -0
abi_ds_utils-1.2.4/abi_ds_utils.egg-info/dependency_links.txt +1 -0
abi_ds_utils-1.2.4/abi_ds_utils.egg-info/requires.txt +8 -0
abi_ds_utils-1.2.4/abi_ds_utils.egg-info/top_level.txt +1 -0
abi_ds_utils-1.2.4/pyproject.toml +25 -0
abi_ds_utils-1.2.4/setup.cfg +4 -0

abi_ds_utils-1.2.4/PKG-INFO ADDED Viewed

@@ -0,0 +1,11 @@
+Metadata-Version: 2.4
+Name: abi_ds_utils
+Version: 1.2.4
+Summary: Utility modules for working with spark, containers, aws and more.
+Author: Martin Matousek, Ioannis Chios
+License: Private
+Requires-Python: >=3.8
+Requires-Dist: boto3<2.0.0,>=1.21.14
+Requires-Dist: pyspark==3.5.8
+Requires-Dist: pyarrow<13.0.0,>=7.0.0; python_version < "3.12"
+Requires-Dist: pyarrow<18.0.0,>=15.0.2; python_version >= "3.12"

abi_ds_utils-1.2.4/README.md ADDED Viewed

@@ -0,0 +1,68 @@
+# DS Utility Modules
+Utility modules for working with Airflow, AWS, Spark, and hopefully more.
+## Spark
+Every DevHub container has the ability to run a single-node Spark instance.
+It is not ideal, but can be handy when trying to run your pipeline on subsets,
+or is necessary if you want to save something in Delta format. For these purposes
+you can use the `get_spark` (or `getSpark` for backwards compatibility) function.
+It will get you a `SparkSession` that works inside DevHub, for our S3 etc.
+## Airflow
+`write_to_xcom` can be used to pass data between two tasks.
+## AWS
+Utility functions to retrieve parameters from the SSM Parameter Store (`get_parameter`) or secrets from AWS Secrets Manager (`get_secret`).
+# Change Log
+---
+## 1.2.4 - 2026-04-01
+---
+## Changed
+- PySpark bump to 3.5.8
+- Delta Lake Maven package bump from `delta-core_2.12:2.2.0` to `delta-spark_2.12:3.3.2` for Spark 3.5.x compatibility
+- `pyarrow` constrained to `<13.0.0` on Python < 3.12 to stay within Spark 3.5's documented support range (Python >= 3.12 uses `>=15.0.2,<18.0.0`)
+- Fixed the S3A credentials provider config key to use the `spark.hadoop.*` namespace
+---
+## 1.2.1 - 2024-01-30
+---
+## Added
+- `getGlue` and `get_glue` to create the spark session with Glue support
+---
+## 1.1.0 - 2023-06-20
+---
+## Changed
+- Scala 2.13 > 2.12
+- `get_spark` actually works now
+---
+## 1.0.1 - 2023-06-19
+---
+## Added
+- `__init__.py` with `get_spark` alias
+## Changed
+- PySpark bump to 3.3.2
+- Delta bump to 2.13
+- Removed `hurry.filesize` and `psutils` from deps

abi_ds_utils-1.2.4/abi_ds_utils/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from abi_ds_utils.airflow import write_to_xcom
+from abi_ds_utils.aws import get_parameter, get_secret
+from abi_ds_utils.spark import getSpark, getGlue
+# snake_case aliases for the camelCase originals imported above; both
+# spellings refer to the same function objects.
+get_spark = getSpark
+get_glue = getGlue
+# Names exported by `from abi_ds_utils import *`. Only the snake_case
+# spellings are listed; `getSpark` / `getGlue` are still importable by name.
+__all__ = [
+    "get_glue",
+    "get_secret",
+    "get_spark",
+    "get_parameter",
+    "write_to_xcom"
+]

abi_ds_utils-1.2.4/abi_ds_utils/airflow.py ADDED Viewed

@@ -0,0 +1,29 @@
+import json
+from typing import Dict
+from pathlib import Path
+def write_to_xcom(dict_value: Dict) -> Dict:
+    """Write `dict_value` for XCOM to pass it to the next task
+    ref: https://airflow.apache.org/docs/apache-airflow-providers-cncf-kubernetes/stable/operators.html#how-does-xcom-work
+    """
+    file_save = Path('/airflow/xcom/return.json')
+    file_save.parent.mkdir(parents=True, exist_ok=True)
+    # Get data from the file if there is some
+    if file_save.is_file():
+        with file_save.open('r') as fin:
+            dict_restored = json.load(fin)
+    else:
+        dict_restored = dict()
+    # Update dict with new data
+    dict_restored.update(dict_value)
+    # Overwrite file with updated dict
+    with file_save.open('w') as fout:
+        json.dump(dict_restored, fout)
+    return dict_restored

abi_ds_utils-1.2.4/abi_ds_utils/aws.py ADDED Viewed

@@ -0,0 +1,51 @@
+import json
+import base64
+from typing import Dict, Optional
+import boto3
+from botocore.exceptions import ClientError
+def get_secret(secret_name: str, region_name: str) -> Dict:
+    # Create a Secrets Manager client
+    session = boto3.session.Session()
+    client = session.client(
+        service_name='secretsmanager',
+        region_name=region_name
+    )
+    # In this sample we only handle the specific exceptions for the 'GetSecretValue' API.
+    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
+    # We rethrow the exception by default.
+    try:
+        get_secret_value_response = client.get_secret_value(
+            SecretId=secret_name
+        )
+    except ClientError as e:
+        raise e
+    else:
+        # Decrypts secret using the associated KMS key.
+        # Depending on whether the secret is a string or binary, one of these fields will be populated.
+        if 'SecretString' in get_secret_value_response:
+            secret = get_secret_value_response['SecretString']
+        else:
+            secret = base64.b64decode(get_secret_value_response['SecretBinary'])
+        return json.loads(secret)
+def get_parameter(name: str, region: Optional[str] = None) -> Optional[str]:
+    """Retrieve secret from Parameter Store.
+    :param name: Name of the parameter
+    :param region: AWS region otherwise env AWS_DEFAULT_REGION
+    :return: Value of parameter
+    """
+    if region:
+        client = boto3.client("ssm", region_name=region)
+    else:
+        client = boto3.client("ssm")
+    try:
+        parameter = client.get_parameter(Name=name, WithDecryption=True)
+    except ClientError as e:
+        raise e
+    return parameter.get('Parameter', {}).get('Value')

abi_ds_utils-1.2.4/abi_ds_utils/spark.py ADDED Viewed

@@ -0,0 +1,62 @@
+import os
+from pyspark.sql import SparkSession
+def getSpark(driver_memory: str = "21g") -> SparkSession:
+    spark = (
+        SparkSession.builder
+        # General
+        .master('local[*]')
+        .config("spark.driver.maxResultSize", 0)
+        # Get 80% of free memory (this might be a bad idea)
+        .config("spark.driver.memory", driver_memory)
+        .config("spark.dynamicAllocation.enabled", "true")
+        # PyArrow for dtypes conversions
+        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
+        # Spark 3.5.x requires a Delta 3.x line; Delta 2.2 only supports Spark 3.3.x.
+        .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.2,io.delta:delta-spark_2.12:3.3.2')
+        # Delta Lake setup
+        .config("spark.hadoop.fs.s3a.connection.maximum", 128)
+        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+        .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
+    )
+    spark = spark.config(
+        "spark.hadoop.fs.s3a.aws.credentials.provider",
+        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
+    )
+    return spark.getOrCreate()
+def getGlue(driver_memory: str = "21g") -> SparkSession:
+    spark = (
+        SparkSession.builder
+        # General
+        .config("spark.driver.maxResultSize", 0)
+        # Get 80% of free memory (this might be a bad idea)
+        .config("spark.driver.memory", driver_memory)
+        .config("spark.dynamicAllocation.enabled", "true")
+        # PyArrow for dtypes conversions
+        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
+        # Delta Lake setup
+        .config("spark.hadoop.fs.s3a.connection.maximum", 128)
+        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+        .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
+        # Glue setup
+        .config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
+        .config("aws.region", os.environ.get('AWS_DEFAULT_REGION'))
+        .config("hive.metastore.glue.catalogid", os.environ.get('GLUE_CATALOG_ID'))
+        .enableHiveSupport()
+    )
+    return spark.getOrCreate()

abi_ds_utils-1.2.4/abi_ds_utils.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,11 @@
+Metadata-Version: 2.4
+Name: abi_ds_utils
+Version: 1.2.4
+Summary: Utility modules for working with spark, containers, aws and more.
+Author: Martin Matousek, Ioannis Chios
+License: Private
+Requires-Python: >=3.8
+Requires-Dist: boto3<2.0.0,>=1.21.14
+Requires-Dist: pyspark==3.5.8
+Requires-Dist: pyarrow<13.0.0,>=7.0.0; python_version < "3.12"
+Requires-Dist: pyarrow<18.0.0,>=15.0.2; python_version >= "3.12"

abi_ds_utils-1.2.4/abi_ds_utils.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,11 @@
+README.md
+pyproject.toml
+abi_ds_utils/__init__.py
+abi_ds_utils/airflow.py
+abi_ds_utils/aws.py
+abi_ds_utils/spark.py
+abi_ds_utils.egg-info/PKG-INFO
+abi_ds_utils.egg-info/SOURCES.txt
+abi_ds_utils.egg-info/dependency_links.txt
+abi_ds_utils.egg-info/requires.txt
+abi_ds_utils.egg-info/top_level.txt

abi_ds_utils-1.2.4/abi_ds_utils.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

abi_ds_utils-1.2.4/abi_ds_utils.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,8 @@
+boto3<2.0.0,>=1.21.14
+pyspark==3.5.8
+[:python_version < "3.12"]
+pyarrow<13.0.0,>=7.0.0
+[:python_version >= "3.12"]
+pyarrow<18.0.0,>=15.0.2

abi_ds_utils-1.2.4/abi_ds_utils.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ abi_ds_utils

abi_ds_utils-1.2.4/pyproject.toml ADDED Viewed

@@ -0,0 +1,25 @@
+[project]
+name = "abi_ds_utils"
+version = "1.2.4"
+description = "Utility modules for working with spark, containers, aws and more."
+authors = [{ name = "Martin Matousek, Ioannis Chios" }]
+license = { text = "Private" }
+requires-python = ">=3.8"
+dependencies = [
+    "boto3>=1.21.14,<2.0.0",
+    "pyspark==3.5.8",
+    "pyarrow>=7.0.0,<13.0.0; python_version < '3.12'",
+    "pyarrow>=15.0.2,<18.0.0; python_version >= '3.12'",
+]
+[dependency-groups]
+dev = [
+    "pre-commit>=2.17.0,<3.0.0",
+]
+[tool.setuptools]
+packages = ["abi_ds_utils"]
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"

abi_ds_utils-1.2.4/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0