atlan-application-sdk 0.1.1rc40__py3-none-any.whl → 0.1.1rc41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/metadata_extraction/sql.py +400 -25
- application_sdk/application/__init__.py +2 -0
- application_sdk/application/metadata_extraction/sql.py +3 -0
- application_sdk/clients/models.py +42 -0
- application_sdk/clients/sql.py +17 -13
- application_sdk/common/aws_utils.py +259 -11
- application_sdk/common/utils.py +145 -9
- application_sdk/handlers/__init__.py +8 -1
- application_sdk/handlers/sql.py +63 -22
- application_sdk/observability/decorators/observability_decorator.py +36 -22
- application_sdk/server/fastapi/__init__.py +59 -3
- application_sdk/server/fastapi/models.py +27 -0
- application_sdk/version.py +1 -1
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/RECORD +18 -17
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/NOTICE +0 -0
application_sdk/common/aws_utils.py
CHANGED

@@ -1,4 +1,13 @@
+import re
+from typing import Any, Dict, Optional
+
+import boto3
+from sqlalchemy.engine.url import URL
+
 from application_sdk.constants import AWS_SESSION_NAME
+from application_sdk.observability.logger_adaptor import get_logger
+
+logger = get_logger(__name__)


 def get_region_name_from_hostname(hostname: str) -> str:
@@ -12,11 +21,14 @@ def get_region_name_from_hostname(hostname: str) -> str:
     Returns:
         str: AWS region name
     """
-
-
-
-
-
+    match = re.search(r"\.([a-z]{2}-[a-z]+-\d)\.", hostname)
+    if match:
+        return match.group(1)
+    # Some services may use - instead of . (rare)
+    match = re.search(r"-([a-z]{2}-[a-z]+-\d)\.", hostname)
+    if match:
+        return match.group(1)
+    raise ValueError("Could not find valid AWS region from hostname")


 def generate_aws_rds_token_with_iam_role(
@@ -55,12 +67,10 @@ def generate_aws_rds_token_with_iam_role(
     )

     credentials = assumed_role["Credentials"]
-    aws_client =
-        "rds",
-
-
-        aws_session_token=credentials["SessionToken"],
-        region_name=region or get_region_name_from_hostname(host),
+    aws_client = create_aws_client(
+        service="rds",
+        region=region or get_region_name_from_hostname(host),
+        temp_credentials=credentials,
     )
     token: str = aws_client.generate_db_auth_token(
         DBHostname=host, Port=port, DBUsername=user
@@ -107,3 +117,241 @@ def generate_aws_rds_token_with_iam_user(
         return token
     except Exception as e:
         raise Exception(f"Failed to get user credentials: {str(e)}")
+
+
+def get_cluster_identifier(aws_client) -> Optional[str]:
+    """
+    Retrieve the cluster identifier from AWS Redshift clusters.
+
+    Args:
+        aws_client: Boto3 Redshift client instance
+
+    Returns:
+        str: The cluster identifier
+
+    Raises:
+        RuntimeError: If no clusters are found
+    """
+    clusters = aws_client.describe_clusters()
+
+    for cluster in clusters["Clusters"]:
+        cluster_identifier = cluster.get("ClusterIdentifier")
+        if cluster_identifier:
+            # Optionally, you can add logic to filter clusters if needed
+            # we are reading first clusters ID if not provided
+            return cluster_identifier  # Just return the string
+    return None
+
+
+def create_aws_session(credentials: Dict[str, Any]) -> boto3.Session:
+    """
+    Create a boto3 session with AWS credentials.
+
+    Args:
+        credentials: Dictionary containing AWS credentials
+
+    Returns:
+        boto3.Session: Configured boto3 session
+    """
+    aws_access_key_id = credentials.get("aws_access_key_id") or credentials.get(
+        "username"
+    )
+    aws_secret_access_key = credentials.get("aws_secret_access_key") or credentials.get(
+        "password"
+    )
+
+    return boto3.Session(
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+    )
+
+
+def get_cluster_credentials(
+    aws_client, credentials: Dict[str, Any], extra: Dict[str, Any]
+) -> Dict[str, str]:
+    """
+    Retrieve cluster credentials using IAM authentication.
+
+    Args:
+        aws_client: Boto3 Redshift client instance
+        credentials: Dictionary containing connection credentials
+
+    Returns:
+        Dict[str, str]: Dictionary containing DbUser and DbPassword
+    """
+    database = extra["database"]
+    cluster_identifier = credentials.get("cluster_id") or get_cluster_identifier(
+        aws_client
+    )
+    return aws_client.get_cluster_credentials_with_iam(
+        DbName=database,
+        ClusterIdentifier=cluster_identifier,
+    )
+
+
+def create_aws_client(
+    service: str,
+    region: str,
+    session: Optional[boto3.Session] = None,
+    temp_credentials: Optional[Dict[str, str]] = None,
+    use_default_credentials: bool = False,
+) -> Any:
+    """
+    Create an AWS client with flexible credential options.
+
+    Args:
+        service: AWS service name (e.g., 'redshift', 'redshift-serverless', 'sts', 'rds')
+        region: AWS region name
+        session: Optional boto3 session instance. If provided, uses session credentials
+        temp_credentials: Optional dictionary containing temporary credentials from assume_role.
+            Must contain 'AccessKeyId', 'SecretAccessKey', and 'SessionToken'
+        use_default_credentials: If True, uses default AWS credentials (environment, IAM role, etc.)
+            This is the fallback if no other credentials are provided
+
+    Returns:
+        AWS client instance
+
+    Raises:
+        ValueError: If invalid credential combination is provided
+        Exception: If client creation fails
+
+    Examples:
+        Using temporary credentials::
+
+            client = create_aws_client(
+                service="redshift",
+                region="us-east-1",
+                temp_credentials={
+                    "AccessKeyId": "AKIA...",
+                    "SecretAccessKey": "...",
+                    "SessionToken": "..."
+                }
+            )
+
+        Using a session::
+
+            session = boto3.Session(profile_name="my-profile")
+            client = create_aws_client(
+                service="rds",
+                region="us-west-2",
+                session=session
+            )
+
+        Using default credentials::
+
+            client = create_aws_client(
+                service="sts",
+                region="us-east-1",
+                use_default_credentials=True
+            )
+    """
+    # Validate credential options
+    credential_sources = sum(
+        [session is not None, temp_credentials is not None, use_default_credentials]
+    )
+
+    if credential_sources == 0:
+        raise ValueError("At least one credential source must be provided")
+    if credential_sources > 1:
+        raise ValueError("Only one credential source should be provided at a time")
+
+    try:
+        # Priority 1: Use provided session
+        if session is not None:
+            logger.debug(
+                f"Creating {service} client using provided session in region {region}"
+            )
+            return session.client(service, region_name=region)  # type: ignore
+
+        # Priority 2: Use temporary credentials
+        if temp_credentials is not None:
+            logger.debug(
+                f"Creating {service} client using temporary credentials in region {region}"
+            )
+            return boto3.client(  # type: ignore
+                service,
+                aws_access_key_id=temp_credentials["AccessKeyId"],
+                aws_secret_access_key=temp_credentials["SecretAccessKey"],
+                aws_session_token=temp_credentials["SessionToken"],
+                region_name=region,
+            )
+
+        # Priority 3: Use default credentials
+        if use_default_credentials:
+            logger.debug(
+                f"Creating {service} client using default credentials in region {region}"
+            )
+            return boto3.client(service, region_name=region)  # type: ignore
+
+    except Exception as e:
+        logger.error(f"Failed to create {service} client in region {region}: {e}")
+        raise Exception(f"Failed to create {service} client: {str(e)}")
+
+
+def create_engine_url(
+    drivername: str,
+    credentials: Dict[str, Any],
+    cluster_credentials: Dict[str, str],
+    extra: Dict[str, Any],
+) -> URL:
+    """
+    Create SQLAlchemy engine URL for Redshift connection.
+
+    Args:
+        credentials: Dictionary containing connection credentials
+        cluster_credentials: Dictionary containing DbUser and DbPassword
+
+    Returns:
+        URL: SQLAlchemy engine URL
+    """
+    host = credentials["host"]
+    port = credentials.get("port")
+    database = extra["database"]
+
+    return URL.create(
+        drivername=drivername,
+        username=cluster_credentials["DbUser"],
+        password=cluster_credentials["DbPassword"],
+        host=host,
+        port=port,
+        database=database,
+    )
+
+
+def get_all_aws_regions() -> list[str]:
+    """
+    Get all available AWS regions dynamically using EC2 describe_regions API.
+    Returns:
+        list[str]: List of all AWS region names
+    Raises:
+        Exception: If unable to retrieve regions from AWS
+    """
+    try:
+        # Use us-east-1 as the default region for the EC2 client since it's always available
+        ec2_client = boto3.client("ec2", region_name="us-east-1")
+        response = ec2_client.describe_regions()
+        regions = [region["RegionName"] for region in response["Regions"]]
+        return sorted(regions)  # Sort for consistent ordering
+    except Exception as e:
+        # Fallback to a comprehensive hardcoded list if API call fails
+        logger.warning(
+            f"Failed to retrieve AWS regions dynamically: {e}. Using fallback list."
+        )
+        return [
+            "ap-northeast-1",
+            "ap-south-1",
+            "ap-southeast-1",
+            "ap-southeast-2",
+            "aws-global",
+            "ca-central-1",
+            "eu-central-1",
+            "eu-north-1",
+            "eu-west-1",
+            "eu-west-2",
+            "eu-west-3",
+            "sa-east-1",
+            "us-east-1",
+            "us-east-2",
+            "us-west-1",
+            "us-west-2",
+        ]
application_sdk/common/utils.py
CHANGED

@@ -17,8 +17,12 @@ from typing import (
     Union,
 )

+from application_sdk.activities.common.utils import get_object_store_prefix
 from application_sdk.common.error_codes import CommonError
+from application_sdk.constants import TEMPORARY_PATH
+from application_sdk.inputs.sql_query import SQLQueryInput
 from application_sdk.observability.logger_adaptor import get_logger
+from application_sdk.services.objectstore import ObjectStore

 logger = get_logger(__name__)

@@ -106,10 +110,42 @@ def extract_database_names_from_regex_common(
         return empty_default


+def transform_posix_regex(regex_pattern: str) -> str:
+    r"""
+    Transform regex pattern for POSIX compatibility.
+
+    Rules:
+    1. Add ^ before each database name before \.
+    2. Add an additional . between \. and * if * follows \.
+
+    Example: 'dev\.public$|dev\.atlan_test_schema$|wide_world_importers\.*'
+    Becomes: '^dev\.public$|^dev\.atlan_test_schema$|^wide_world_importers\..*'
+    """
+    if not regex_pattern:
+        return regex_pattern
+
+    # Split by | to handle each pattern separately
+    patterns = regex_pattern.split("|")
+    transformed_patterns = []
+
+    for pattern in patterns:
+        # Add ^ at the beginning if it's not already there
+        if not pattern.startswith("^"):
+            pattern = "^" + pattern
+
+        # Add additional . between \. and * if * follows \.
+        pattern = re.sub(r"\\\.\*", r"\..*", pattern)
+
+        transformed_patterns.append(pattern)
+
+    return "|".join(transformed_patterns)
+
+
 def prepare_query(
     query: Optional[str],
     workflow_args: Dict[str, Any],
     temp_table_regex_sql: Optional[str] = "",
+    use_posix_regex: Optional[bool] = False,
 ) -> Optional[str]:
     """
     Prepares a SQL query by applying include and exclude filters, and optional
@@ -158,6 +194,14 @@ def prepare_query(
            include_filter, exclude_filter
        )

+        if use_posix_regex:
+            normalized_include_regex_posix = transform_posix_regex(
+                normalized_include_regex
+            )
+            normalized_exclude_regex_posix = transform_posix_regex(
+                normalized_exclude_regex
+            )
+
        # Extract database names from the normalized regex patterns
        include_databases = extract_database_names_from_regex_common(
            normalized_regex=normalized_include_regex,
@@ -176,15 +220,26 @@ def prepare_query(
        )
        exclude_views = workflow_args.get("metadata", {}).get("exclude_views", False)

-
-
-
-
-
-
-
-
-
+        if use_posix_regex:
+            return query.format(
+                include_databases=include_databases,
+                exclude_databases=exclude_databases,
+                normalized_include_regex=normalized_include_regex_posix,
+                normalized_exclude_regex=normalized_exclude_regex_posix,
+                temp_table_regex_sql=temp_table_regex_sql,
+                exclude_empty_tables=exclude_empty_tables,
+                exclude_views=exclude_views,
+            )
+        else:
+            return query.format(
+                include_databases=include_databases,
+                exclude_databases=exclude_databases,
+                normalized_include_regex=normalized_include_regex,
+                normalized_exclude_regex=normalized_exclude_regex,
+                temp_table_regex_sql=temp_table_regex_sql,
+                exclude_empty_tables=exclude_empty_tables,
+                exclude_views=exclude_views,
+            )
     except CommonError as e:
         # Extract the original error message from the CommonError
         error_message = str(e).split(": ", 1)[-1] if ": " in str(e) else str(e)
@@ -195,6 +250,47 @@ def prepare_query(
         return None


+async def get_database_names(
+    sql_client, workflow_args, fetch_database_sql
+) -> Optional[List[str]]:
+    """
+    Get the database names from the workflow args if include-filter is present
+    Args:
+        workflow_args: The workflow args
+    Returns:
+        List[str]: The database names
+    """
+    database_names = parse_filter_input(
+        workflow_args.get("metadata", {}).get("include-filter", {})
+    )
+
+    database_names = [
+        re.sub(r"^[^\w]+|[^\w]+$", "", database_name)
+        for database_name in database_names
+    ]
+    if not database_names:
+        # if database_names are not provided in the include-filter, we'll run the query to get all the database names
+        # because by default for an empty include-filter, we fetch details corresponding to all the databases.
+        temp_table_regex_sql = workflow_args.get("metadata", {}).get(
+            "temp-table-regex", ""
+        )
+        prepared_query = prepare_query(
+            query=fetch_database_sql,
+            workflow_args=workflow_args,
+            temp_table_regex_sql=temp_table_regex_sql,
+            use_posix_regex=True,
+        )
+        # We'll run the query to get all the database names
+        database_sql_input = SQLQueryInput(
+            engine=sql_client.engine,
+            query=prepared_query,  # type: ignore
+            chunk_size=None,
+        )
+        database_dataframe = await database_sql_input.get_dataframe()
+        database_names = list(database_dataframe["database_name"])
+    return database_names
+
+
 def parse_filter_input(
     filter_input: Union[str, Dict[str, Any], None],
 ) -> Dict[str, Any]:
@@ -416,6 +512,46 @@ def parse_credentials_extra(credentials: Dict[str, Any]) -> Dict[str, Any]:
     return extra  # We know it's a Dict[str, Any] due to the Union type and str check


+def has_custom_control_config(workflow_args: Dict[str, Any]) -> bool:
+    """
+    Check if custom control configuration is present in workflow arguments.
+
+    Args:
+        workflow_args: The workflow arguments
+
+    Returns:
+        bool: True if custom control configuration is present, False otherwise
+    """
+    return (
+        workflow_args.get("control-config-strategy") == "custom"
+        and workflow_args.get("control-config") is not None
+    )
+
+
+async def get_file_names(output_path: str, typename: str) -> List[str]:
+    """
+    Get file names for a specific asset type from the transformed directory.
+
+    Args:
+        output_path (str): The base output path
+        typename (str): The asset type (e.g., 'table', 'schema', 'column')
+
+    Returns:
+        List[str]: List of relative file paths for the asset type
+    """
+
+    source = get_object_store_prefix(os.path.join(output_path, typename))
+    await ObjectStore.download_prefix(source, TEMPORARY_PATH)
+
+    file_pattern = os.path.join(output_path, typename, "*.json")
+    file_names = glob.glob(file_pattern)
+    file_name_list = [
+        "/".join(file_name.rsplit("/", 2)[-2:]) for file_name in file_names
+    ]
+
+    return file_name_list
+
+
 def run_sync(func):
     """Run a function in a thread pool executor.

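A quick illustration (not part of the package) of the new `transform_posix_regex` helper, using the example from its own docstring:

```python
from application_sdk.common.utils import transform_posix_regex

# Anchors each alternative and turns the trailing '\.*' into '\..*'
pattern = r"dev\.public$|dev\.atlan_test_schema$|wide_world_importers\.*"
print(transform_posix_regex(pattern))
# ^dev\.public$|^dev\.atlan_test_schema$|^wide_world_importers\..*
```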
application_sdk/handlers/__init__.py
CHANGED

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, Dict


 class HandlerInterface(ABC):
@@ -37,3 +37,10 @@ class HandlerInterface(ABC):
         To be implemented by the subclass
         """
         raise NotImplementedError("fetch_metadata method not implemented")
+
+    @staticmethod
+    async def get_configmap(config_map_id: str) -> Dict[str, Any]:
+        """
+        Static method to get the configmap
+        """
+        return {}
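A hypothetical subclass sketch (not from the package) showing how a handler could override the new `HandlerInterface.get_configmap` hook, which returns an empty dict by default:

```python
from typing import Any, Dict

from application_sdk.handlers import HandlerInterface


class MyHandler(HandlerInterface):
    @staticmethod
    async def get_configmap(config_map_id: str) -> Dict[str, Any]:
        # Resolve the config map however the application stores it;
        # the base implementation simply returns {}.
        return {"id": config_map_id, "source": "example"}
```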
application_sdk/handlers/sql.py
CHANGED

@@ -56,9 +56,13 @@ class BaseSQLHandler(HandlerInterface):
     schema_alias_key: str = SQLConstants.SCHEMA_ALIAS_KEY.value
     database_result_key: str = SQLConstants.DATABASE_RESULT_KEY.value
     schema_result_key: str = SQLConstants.SCHEMA_RESULT_KEY.value
+    multidb: bool = False

-    def __init__(
+    def __init__(
+        self, sql_client: BaseSQLClient | None = None, multidb: Optional[bool] = False
+    ):
         self.sql_client = sql_client
+        self.multidb = multidb

     async def load(self, credentials: Dict[str, Any]) -> None:
         """
@@ -294,35 +298,26 @@ class BaseSQLHandler(HandlerInterface):
                 return False, f"{db}.{sch} schema"
         return True, ""

-    async def tables_check(
-        self,
-        payload: Dict[str, Any],
-    ) -> Dict[str, Any]:
+    async def tables_check(self, payload: Dict[str, Any]) -> Dict[str, Any]:
         """
         Method to check the count of tables
         """
         logger.info("Starting tables check")
-
-
-
-
-
-
-
-
-            engine=self.sql_client.engine, query=query, chunk_size=None
-        )
-        sql_input = await sql_input.get_dataframe()
-        try:
-            result = 0
-            for row in sql_input.to_dict(orient="records"):
-                result += row["count"]
+
+        def _sum_counts_from_records(records_iter) -> int:
+            total = 0
+            for row in records_iter:
+                total += row["count"]
+            return total
+
+        def _build_success(total: int) -> Dict[str, Any]:
            return {
                "success": True,
-                "successMessage": f"Tables check successful. Table count: {
+                "successMessage": f"Tables check successful. Table count: {total}",
                "failureMessage": "",
            }
-
+
+        def _build_failure(exc: Exception) -> Dict[str, Any]:
            logger.error("Error during tables check", exc_info=True)
            return {
                "success": False,
@@ -331,6 +326,52 @@ class BaseSQLHandler(HandlerInterface):
                "error": str(exc),
            }

+        if self.multidb:
+            try:
+                from application_sdk.activities.metadata_extraction.sql import (
+                    BaseSQLMetadataExtractionActivities,
+                )
+
+                # Use the base query executor in multidb mode to get concatenated df
+                activities = BaseSQLMetadataExtractionActivities()
+                activities.multidb = True
+                concatenated_df = await activities.query_executor(
+                    sql_engine=self.sql_client.engine if self.sql_client else None,
+                    sql_query=self.tables_check_sql,
+                    workflow_args=payload,
+                    output_suffix="raw/table",
+                    typename="table",
+                    write_to_file=False,
+                    concatenate=True,
+                    return_dataframe=True,
+                    sql_client=self.sql_client,
+                )
+
+                if concatenated_df is None:
+                    return _build_success(0)
+
+                total = int(concatenated_df["count"].sum())  # type: ignore[index]
+                return _build_success(total)
+            except Exception as exc:
+                return _build_failure(exc)
+        else:
+            query = prepare_query(
+                query=self.tables_check_sql,
+                workflow_args=payload,
+                temp_table_regex_sql=self.extract_temp_table_regex_table_sql,
+            )
+            if not query:
+                raise ValueError("tables_check_sql is not defined")
+            sql_input = SQLQueryInput(
+                engine=self.sql_client.engine, query=query, chunk_size=None
+            )
+            sql_input = await sql_input.get_dataframe()
+            try:
+                total = _sum_counts_from_records(sql_input.to_dict(orient="records"))
+                return _build_success(total)
+            except Exception as exc:
+                return _build_failure(exc)
+
     async def check_client_version(self) -> Dict[str, Any]:
         """
         Check if the client version meets the minimum required version.