atlan-application-sdk 0.1.1rc40__py3-none-any.whl → 0.1.1rc41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/metadata_extraction/sql.py +400 -25
- application_sdk/application/__init__.py +2 -0
- application_sdk/application/metadata_extraction/sql.py +3 -0
- application_sdk/clients/models.py +42 -0
- application_sdk/clients/sql.py +17 -13
- application_sdk/common/aws_utils.py +259 -11
- application_sdk/common/utils.py +145 -9
- application_sdk/handlers/__init__.py +8 -1
- application_sdk/handlers/sql.py +63 -22
- application_sdk/observability/decorators/observability_decorator.py +36 -22
- application_sdk/server/fastapi/__init__.py +59 -3
- application_sdk/server/fastapi/models.py +27 -0
- application_sdk/version.py +1 -1
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/RECORD +18 -17
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/NOTICE +0 -0

application_sdk/activities/metadata_extraction/sql.py
CHANGED
```diff
@@ -1,5 +1,20 @@
 import os
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    Generator,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    cast,
+    overload,
+)
 
 from temporalio import activity
 
@@ -13,7 +28,12 @@ from application_sdk.activities.common.utils import (
 from application_sdk.clients.sql import BaseSQLClient
 from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.common.error_codes import ActivityError
-from application_sdk.common.utils import
+from application_sdk.common.utils import (
+    get_database_names,
+    parse_credentials_extra,
+    prepare_query,
+    read_sql_files,
+)
 from application_sdk.constants import APP_TENANT_ID, APPLICATION_NAME, SQL_QUERIES_PATH
 from application_sdk.handlers.sql import BaseSQLHandler
 from application_sdk.inputs.parquet import ParquetInput
@@ -31,6 +51,9 @@ activity.logger = logger
 
 queries = read_sql_files(queries_prefix=SQL_QUERIES_PATH)
 
+if TYPE_CHECKING:
+    import pandas as pd
+
 
 class BaseSQLMetadataExtractionActivitiesState(ActivitiesState):
     """State class for SQL metadata extraction activities.
@@ -90,6 +113,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         sql_client_class: Optional[Type[BaseSQLClient]] = None,
         handler_class: Optional[Type[BaseSQLHandler]] = None,
         transformer_class: Optional[Type[TransformerInterface]] = None,
+        multidb: bool = False,
     ):
         """Initialize the SQL metadata extraction activities.
 
@@ -100,6 +124,8 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
                 Defaults to BaseSQLHandler.
             transformer_class (Type[TransformerInterface], optional): Class for metadata transformation.
                 Defaults to QueryBasedTransformer.
+            multidb (bool): When True, executes queries across multiple databases using
+                `multidb_query_executor`. Defaults to False.
         """
         if sql_client_class:
             self.sql_client_class = sql_client_class
@@ -108,6 +134,9 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         if transformer_class:
             self.transformer_class = transformer_class
 
+        # Control whether to execute per-db using multidb executor
+        self.multidb = multidb
+
         super().__init__()
 
     # State methods
@@ -206,6 +235,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             raise ValueError("Missing required workflow arguments")
         return output_prefix, output_path, typename, workflow_id, workflow_run_id
 
+    @overload
    async def query_executor(
        self,
        sql_engine: Any,
@@ -213,7 +243,38 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         workflow_args: Dict[str, Any],
         output_suffix: str,
         typename: str,
-
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = False,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[ActivityStatistics]: ...
+
+    @overload
+    async def query_executor(
+        self,
+        sql_engine: Any,
+        sql_query: Optional[str],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = True,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]: ...
+
+    async def query_executor(
+        self,
+        sql_engine: Any,
+        sql_query: Optional[str],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = False,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
         """
         Executes a SQL query using the provided engine and saves the results to Parquet.
 
@@ -233,44 +294,358 @@
             typename: Type name used for generating output statistics.
 
         Returns:
-            Optional[ActivityStatistics]: Statistics about the generated Parquet file,
-            or None if the query is empty or execution fails
+            Optional[Union[ActivityStatistics, pd.DataFrame]]: Statistics about the generated Parquet file,
+            or a DataFrame if return_dataframe=True, or None if the query is empty or execution fails.
 
         Raises:
             ValueError: If `sql_engine` is not provided.
         """
+        # Common pre-checks and setup shared by both multidb and single-db paths
+        if not sql_query:
+            logger.warning("Query is empty, skipping execution.")
+            return None
+
         if not sql_engine:
             logger.error("SQL engine is not set.")
             raise ValueError("SQL engine must be provided.")
-
-
+
+        # Setup parquet output using helper method
+        parquet_output = self._setup_parquet_output(
+            workflow_args, output_suffix, write_to_file
+        )
+
+        # If multidb mode is enabled, run per-database flow
+        if getattr(self, "multidb", False):
+            return await self._execute_multidb_flow(
+                sql_client,
+                sql_query,
+                workflow_args,
+                output_suffix,
+                typename,
+                write_to_file,
+                concatenate,
+                return_dataframe,
+                parquet_output,
+            )
+
+        # Single-db execution path
+        # Prepare query for single-db execution
+        prepared_query = self._prepare_database_query(
+            sql_query, None, workflow_args, typename
+        )
+
+        # Execute using helper method
+        success, _ = await self._execute_single_db(
+            sql_engine, prepared_query, parquet_output, write_to_file
+        )
+
+        if not success:
+            logger.error("Failed to execute single-db query")
             return None
 
-
-
-
+        if parquet_output:
+            logger.info(
+                f"Successfully wrote query results to {parquet_output.get_full_path()}"
+            )
+            return await parquet_output.get_statistics(typename=typename)
+
+        logger.warning("No parquet output configured for single-db execution")
+        return None
+
+    def _setup_parquet_output(
+        self,
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        write_to_file: bool,
+    ) -> Optional[ParquetOutput]:
+        if not write_to_file:
+            return None
+        output_prefix = workflow_args.get("output_prefix")
+        output_path = workflow_args.get("output_path")
+        if not output_prefix or not output_path:
+            logger.error("Output prefix or path not provided in workflow_args.")
+            raise ValueError(
+                "Output prefix and path must be specified in workflow_args."
+            )
+        return ParquetOutput(
+            output_prefix=output_prefix,
+            output_path=output_path,
+            output_suffix=output_suffix,
+        )
 
-
-
+    def _get_temp_table_regex_sql(self, typename: str) -> str:
+        """Get the appropriate temp table regex SQL based on typename."""
+        if typename == "column":
+            return self.extract_temp_table_regex_column_sql or ""
+        elif typename == "table":
+            return self.extract_temp_table_regex_table_sql or ""
+        else:
+            return ""
 
-
-
-
-
+    def _prepare_database_query(
+        self,
+        sql_query: str,
+        database_name: Optional[str],
+        workflow_args: Dict[str, Any],
+        typename: str,
+        use_posix_regex: bool = False,
+    ) -> Optional[str]:
+        """Prepare query for database execution with proper substitutions."""
+        # Replace database name placeholder if provided
+        fetch_sql = sql_query
+        if database_name:
+            fetch_sql = fetch_sql.replace("{database_name}", database_name)
+
+        # Get temp table regex SQL
+        temp_table_regex_sql = self._get_temp_table_regex_sql(typename)
+
+        # Prepare the query
+        prepared_query = prepare_query(
+            query=fetch_sql,
+            workflow_args=workflow_args,
+            temp_table_regex_sql=temp_table_regex_sql,
+            use_posix_regex=use_posix_regex,
+        )
+
+        if prepared_query is None:
+            db_context = f" for database {database_name}" if database_name else ""
+            raise ValueError(f"Failed to prepare query{db_context}")
+
+        return prepared_query
+
+    async def _setup_database_connection(
+        self,
+        sql_client: BaseSQLClient,
+        database_name: str,
+    ) -> None:
+        """Setup connection for a specific database."""
+        extra = parse_credentials_extra(sql_client.credentials)
+        extra["database"] = database_name
+        sql_client.credentials["extra"] = extra
+        await sql_client.load(sql_client.credentials)
+
+    # NOTE: Consolidated: per-database processing is now inlined in the multi-DB loop
+
+    async def _finalize_multidb_results(
+        self,
+        write_to_file: bool,
+        concatenate: bool,
+        return_dataframe: bool,
+        parquet_output: Optional[ParquetOutput],
+        dataframe_list: List[
+            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+        ],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+        """Finalize results for multi-database execution."""
+        if write_to_file and parquet_output:
+            return await parquet_output.get_statistics(typename=typename)
+
+        if not write_to_file and concatenate:
+            try:
+                import pandas as pd  # type: ignore
+
+                valid_dataframes: List[pd.DataFrame] = []
+                for df_generator in dataframe_list:
+                    if df_generator is None:
+                        continue
+                    for dataframe in df_generator:  # type: ignore[assignment]
+                        if dataframe is None:
+                            continue
+                        if hasattr(dataframe, "empty") and getattr(dataframe, "empty"):
+                            continue
+                        valid_dataframes.append(dataframe)
+
+                if not valid_dataframes:
+                    logger.warning(
+                        "No valid dataframes collected across databases for concatenation"
+                    )
+                    return None
+
+                concatenated = pd.concat(valid_dataframes, ignore_index=True)
+
+                if return_dataframe:
+                    return concatenated  # type: ignore[return-value]
+
+                # Create new parquet output for concatenated data
+                concatenated_parquet_output = self._setup_parquet_output(
+                    workflow_args, output_suffix, True
                )
+                if concatenated_parquet_output:
+                    await concatenated_parquet_output.write_dataframe(concatenated)  # type: ignore[arg-type]
+                    return await concatenated_parquet_output.get_statistics(
+                        typename=typename
+                    )
+            except Exception as e:  # noqa: BLE001
+                logger.error(
+                    f"Error concatenating multi-DB dataframes: {str(e)}",
+                    exc_info=True,
+                )
+                raise
+
+        logger.warning(
+            "multidb execution returned no output to write (write_to_file=False, concatenate=False)"
+        )
+        return None
 
-
-
-
-
+    async def _execute_multidb_flow(
+        self,
+        sql_client: Optional[BaseSQLClient],
+        sql_query: str,
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool,
+        concatenate: bool,
+        return_dataframe: bool,
+        parquet_output: Optional[ParquetOutput],
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+        """Execute multi-database flow with proper error handling and result finalization."""
+        # Get effective SQL client
+        effective_sql_client = sql_client
+        if effective_sql_client is None:
+            state = cast(
+                BaseSQLMetadataExtractionActivitiesState,
+                await self._get_state(workflow_args),
            )
-
-
-
+            effective_sql_client = state.sql_client
+
+        if not effective_sql_client:
+            logger.error("SQL client not initialized for multidb execution")
+            raise ValueError("SQL client not initialized")
+
+        # Resolve databases to iterate
+        database_names = await get_database_names(
+            effective_sql_client, workflow_args, self.fetch_database_sql
+        )
+        if not database_names:
+            logger.warning("No databases found to process")
+            return None
+
+        # Validate client
+        if not effective_sql_client.engine:
+            logger.error("SQL client engine not initialized")
+            raise ValueError("SQL client engine not initialized")
+
+        successful_databases: List[str] = []
+        failed_databases: List[str] = []
+        dataframe_list: List[
+            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+        ] = []
+
+        # Iterate databases and execute (consolidated single-db processing)
+        for database_name in database_names or []:
+            try:
+                # Setup connection for this database
+                await self._setup_database_connection(
+                    effective_sql_client, database_name
+                )
+
+                # Prepare query for this database
+                prepared_query = self._prepare_database_query(
+                    sql_query,
+                    database_name,
+                    workflow_args,
+                    typename,
+                    use_posix_regex=True,
+                )
+
+                # Execute using helper method
+                success, batched_iter = await self._execute_single_db(
+                    effective_sql_client.engine,
+                    prepared_query,
+                    parquet_output,
+                    write_to_file,
+                )
+
+                if success:
+                    logger.info(f"Successfully processed database: {database_name}")
+                else:
+                    logger.warning(
+                        f"Failed to execute query for database: {database_name}"
+                    )
+            except Exception as e:  # noqa: BLE001
+                logger.warning(
+                    f"Failed to process database '{database_name}': {str(e)}. Skipping to next database."
+                )
+                success, batched_iter = False, None
+
+            if success:
+                successful_databases.append(database_name)
+                if not write_to_file and batched_iter:
+                    dataframe_list.append(batched_iter)
+            else:
+                failed_databases.append(database_name)
+
+        # Log results
+        logger.info(
+            f"Successfully processed {len(successful_databases)} databases: {successful_databases}"
+        )
+        if failed_databases:
+            logger.warning(
+                f"Failed to process {len(failed_databases)} databases: {failed_databases}"
            )
 
-
-
+        # Finalize results
+        return await self._finalize_multidb_results(
+            write_to_file,
+            concatenate,
+            return_dataframe,
+            parquet_output,
+            dataframe_list,
+            workflow_args,
+            output_suffix,
+            typename,
+        )
+
+    async def _execute_single_db(
+        self,
+        sql_engine: Any,
+        prepared_query: Optional[str],
+        parquet_output: Optional[ParquetOutput],
+        write_to_file: bool,
+    ) -> Tuple[
+        bool, Optional[Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]]
+    ]:  # type: ignore
+        if not prepared_query:
+            logger.error("Prepared query is None, cannot execute")
+            return False, None
+
+        try:
+            sql_input = SQLQueryInput(engine=sql_engine, query=prepared_query)
+            batched_iter = await sql_input.get_batched_dataframe()
+
+            if write_to_file and parquet_output:
+                # Wrap iterator into a proper (async)generator for type safety
+                if hasattr(batched_iter, "__anext__"):
+
+                    async def _to_async_gen(
+                        it: AsyncIterator["pd.DataFrame"],
+                    ) -> AsyncGenerator["pd.DataFrame", None]:
+                        async for item in it:
+                            yield item
+
+                    wrapped: AsyncGenerator["pd.DataFrame", None] = _to_async_gen(  # type: ignore
+                        batched_iter  # type: ignore
+                    )
+                    await parquet_output.write_batched_dataframe(wrapped)
+                else:
+
+                    def _to_gen(
+                        it: Iterator["pd.DataFrame"],
+                    ) -> Generator["pd.DataFrame", None, None]:
+                        for item in it:
+                            yield item
+
+                    wrapped_sync: Generator["pd.DataFrame", None, None] = _to_gen(  # type: ignore
+                        batched_iter  # type: ignore
+                    )
+                    await parquet_output.write_batched_dataframe(wrapped_sync)
+                return True, None
+
+            return True, batched_iter
         except Exception as e:
             logger.error(
                 f"Error during query execution or output writing: {e}", exc_info=True
```
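The net effect of this change is that a connector can now ask the SDK to fan a metadata query out across every database returned by `fetch_database_sql`. A minimal sketch of opting in, assuming a concrete subclass that already defines its extraction SQL (the class name and query string below are illustrative, not part of the SDK):

```python
from application_sdk.activities.metadata_extraction.sql import (
    BaseSQLMetadataExtractionActivities,
)


class ExampleActivities(BaseSQLMetadataExtractionActivities):
    # Illustrative query; a real connector ships its own SQL files.
    fetch_database_sql = "SELECT database_name FROM information_schema.databases"


# multidb=True makes query_executor resolve the database list via
# get_database_names() and run the prepared query once per database.
activities = ExampleActivities(multidb=True)
```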
application_sdk/application/__init__.py
CHANGED
```diff
@@ -164,6 +164,7 @@ class BaseApplication:
         self,
         workflow_class,
         ui_enabled: bool = True,
+        has_configmap: bool = False,
     ):
         """
         Optionally set up a server for the application. (No-op by default)
@@ -176,6 +177,7 @@ class BaseApplication:
             workflow_client=self.workflow_client,
             ui_enabled=ui_enabled,
             handler=self.handler_class(client=self.client_class()),
+            has_configmap=has_configmap,
         )
 
         if self.event_subscriptions:
```
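For application authors, the new flag is simply forwarded when the server is created. A hedged sketch of passing it through, assuming `app` is an already-constructed `BaseApplication` subclass, `MyWorkflow` is a concrete workflow class, and the server-setup coroutine is named `setup_server` (the method name is an assumption, not shown in this diff):

```python
# Sketch only: `app` and MyWorkflow are placeholders, and the method name
# `setup_server` is assumed rather than taken from this diff.
await app.setup_server(
    workflow_class=MyWorkflow,
    ui_enabled=True,
    has_configmap=True,  # new in 0.1.1rc41; defaults to False
)
```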
application_sdk/application/metadata_extraction/sql.py
CHANGED
```diff
@@ -161,12 +161,14 @@ class BaseSQLMetadataExtractionApplication(BaseApplication):
         workflow_class: Type[
             BaseSQLMetadataExtractionWorkflow
         ] = BaseSQLMetadataExtractionWorkflow,
+        has_configmap: bool = False,
     ) -> Any:
         """
         Set up the FastAPI server for the SQL metadata extraction application.
 
         Args:
             workflow_class (Type): Workflow class to register with the server. Defaults to BaseSQLMetadataExtractionWorkflow.
+            has_configmap (bool): Whether the application has a configmap. Defaults to False.
 
         Returns:
             Any: None
@@ -178,6 +180,7 @@ class BaseSQLMetadataExtractionApplication(BaseApplication):
         self.server = APIServer(
             handler=self.handler_class(sql_client=self.client_class()),
             workflow_client=self.workflow_client,
+            has_configmap=has_configmap,
         )
 
         # register the workflow on the application server
```
application_sdk/clients/models.py
ADDED
```diff
@@ -0,0 +1,42 @@
+"""
+Pydantic models for database client configurations.
+This module provides Pydantic models for database connection configurations,
+ensuring type safety and validation for database client settings.
+"""
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class DatabaseConfig(BaseModel):
+    """
+    Pydantic model for database connection configuration.
+    This model defines the structure for database connection configurations,
+    including connection templates, required parameters, defaults, and additional
+    connection parameters.
+    """
+
+    template: str = Field(
+        ...,
+        description="SQLAlchemy connection string template with placeholders for connection parameters",
+    )
+    required: List[str] = Field(
+        default=[],
+        description="List of required connection parameters that must be provided",
+    )
+    defaults: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Default connection parameters to be added to the connection string",
+    )
+    parameters: Optional[List[str]] = Field(
+        default=None,
+        description="List of additional connection parameter names that can be dynamically added from credentials",
+    )
+
+    class Config:
+        """Pydantic configuration for the DatabaseConfig model."""
+
+        extra = "forbid"  # Prevent additional fields
+        validate_assignment = True  # Validate on assignment
+        use_enum_values = True  # Use enum values instead of enum objects
```
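Concrete SQL clients are expected to declare their connection recipe through this model (see `DB_CONFIG` in `clients/sql.py` below). A hedged sketch of what such a declaration could look like; the dialect, parameter names, and defaults are purely illustrative:

```python
from application_sdk.clients.models import DatabaseConfig
from application_sdk.clients.sql import BaseSQLClient


class ExamplePostgresClient(BaseSQLClient):
    # Illustrative values only; a real client defines its own template and parameters.
    DB_CONFIG = DatabaseConfig(
        template="postgresql+psycopg://{username}:{password}@{host}:{port}/{database}",
        required=["username", "password", "host", "port", "database"],
        defaults={"sslmode": "require"},
        parameters=["application_name"],
    )
```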
application_sdk/clients/sql.py
CHANGED
```diff
@@ -7,13 +7,14 @@ database operations, supporting batch processing and server-side cursors.
 
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 from urllib.parse import quote_plus
 
 from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine
 from temporalio import activity
 
 from application_sdk.clients import ClientInterface
+from application_sdk.clients.models import DatabaseConfig
 from application_sdk.common.aws_utils import (
     generate_aws_rds_token_with_iam_role,
     generate_aws_rds_token_with_iam_user,
@@ -48,7 +49,7 @@ class BaseSQLClient(ClientInterface):
     credentials: Dict[str, Any] = {}
     resolved_credentials: Dict[str, Any] = {}
     use_server_side_cursor: bool = USE_SERVER_SIDE_CURSOR
-    DB_CONFIG:
+    DB_CONFIG: Optional[DatabaseConfig] = None
 
     def __init__(
         self,
@@ -262,7 +263,9 @@
         Returns:
             str: The updated URL with the dialect.
         """
-
+        if not self.DB_CONFIG:
+            raise ValueError("DB_CONFIG is not configured for this SQL client.")
+        installed_dialect = self.DB_CONFIG.template.split("://")[0]
         url_dialect = sqlalchemy_url.split("://")[0]
         if installed_dialect != url_dialect:
             sqlalchemy_url = sqlalchemy_url.replace(url_dialect, installed_dialect)
@@ -281,6 +284,9 @@
         Raises:
             ValueError: If required connection parameters are missing.
         """
+        if not self.DB_CONFIG:
+            raise ValueError("DB_CONFIG is not configured for this SQL client.")
+
         extra = parse_credentials_extra(self.credentials)
 
         # TODO: Uncomment this when the native deployment is ready
@@ -293,7 +299,7 @@
 
         # Prepare parameters
         param_values = {}
-        for param in self.DB_CONFIG
+        for param in self.DB_CONFIG.required:
             if param == "password":
                 param_values[param] = auth_token
             else:
@@ -303,21 +309,19 @@
                 param_values[param] = value
 
         # Fill in base template
-        conn_str = self.DB_CONFIG
+        conn_str = self.DB_CONFIG.template.format(**param_values)
 
         # Append defaults if not already in the template
-        if self.DB_CONFIG.
-            conn_str = self.add_connection_params(conn_str, self.DB_CONFIG
+        if self.DB_CONFIG.defaults:
+            conn_str = self.add_connection_params(conn_str, self.DB_CONFIG.defaults)
 
-        if self.DB_CONFIG.
-            parameter_keys = self.DB_CONFIG
-
+        if self.DB_CONFIG.parameters:
+            parameter_keys = self.DB_CONFIG.parameters
+            parameter_values = {
                 key: self.credentials.get(key) or extra.get(key)
                 for key in parameter_keys
             }
-            conn_str = self.add_connection_params(
-                conn_str, self.DB_CONFIG["parameters"]
-            )
+            conn_str = self.add_connection_params(conn_str, parameter_values)
 
         return conn_str
 
```
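The rewritten connection-string assembly is now driven entirely by the typed `DB_CONFIG`: required placeholders are filled from credentials (with `password` coming from the resolved auth token), then defaults and any optional parameters are appended. A simplified, standalone sketch of that flow, leaving out the real client's `add_connection_params` URL handling and quoting:

```python
from urllib.parse import urlencode

from application_sdk.clients.models import DatabaseConfig


def build_connection_string(
    config: DatabaseConfig, credentials: dict, extra: dict, auth_token: str
) -> str:
    # Fill required placeholders; "password" is always the resolved auth token.
    values = {
        p: (auth_token if p == "password" else credentials.get(p) or extra.get(p))
        for p in config.required
    }
    conn_str = config.template.format(**values)

    # Append defaults, then any optional parameters present in the credentials.
    params = dict(config.defaults or {})
    for key in config.parameters or []:
        value = credentials.get(key) or extra.get(key)
        if value is not None:
            params[key] = value
    if params:
        conn_str += ("&" if "?" in conn_str else "?") + urlencode(params)
    return conn_str
```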