atlan-application-sdk 0.1.1rc40__py3-none-any.whl → 0.1.1rc42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/common/utils.py +78 -4
- application_sdk/activities/metadata_extraction/sql.py +400 -27
- application_sdk/application/__init__.py +2 -0
- application_sdk/application/metadata_extraction/sql.py +3 -0
- application_sdk/clients/models.py +42 -0
- application_sdk/clients/sql.py +17 -13
- application_sdk/common/aws_utils.py +259 -11
- application_sdk/common/utils.py +145 -9
- application_sdk/handlers/__init__.py +8 -1
- application_sdk/handlers/sql.py +63 -22
- application_sdk/inputs/__init__.py +98 -2
- application_sdk/inputs/json.py +59 -87
- application_sdk/inputs/parquet.py +173 -94
- application_sdk/observability/decorators/observability_decorator.py +36 -22
- application_sdk/server/fastapi/__init__.py +59 -3
- application_sdk/server/fastapi/models.py +27 -0
- application_sdk/test_utils/hypothesis/strategies/inputs/json_input.py +10 -5
- application_sdk/test_utils/hypothesis/strategies/inputs/parquet_input.py +9 -4
- application_sdk/version.py +1 -1
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/RECORD +24 -23
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/licenses/NOTICE +0 -0
application_sdk/activities/common/utils.py

@@ -5,10 +5,11 @@ including workflow ID retrieval, automatic heartbeating, and periodic heartbeat
 """

 import asyncio
+import glob
 import os
 from datetime import timedelta
 from functools import wraps
-from typing import Any, Awaitable, Callable, Optional, TypeVar, cast
+from typing import Any, Awaitable, Callable, List, Optional, TypeVar, cast

 from temporalio import activity

@@ -79,17 +80,47 @@ def build_output_path() -> str:

 def get_object_store_prefix(path: str) -> str:
     """Get the object store prefix for the path.
+
+    This function handles two types of paths:
+    1. Paths under TEMPORARY_PATH - converts them to relative object store paths
+    2. User-provided paths - returns them as-is (already relative object store paths)
+
     Args:
-        path: The path to
+        path: The path to convert to object store prefix.

     Returns:
         The object store prefix for the path.

-
+    Examples:
+        >>> # Temporary path case
         >>> get_object_store_prefix("./local/tmp/artifacts/apps/appName/workflows/wf-123/run-456")
         "artifacts/apps/appName/workflows/wf-123/run-456"
+
+        >>> # User-provided path case
+        >>> get_object_store_prefix("datasets/sales/2024/")
+        "datasets/sales/2024"
     """
-
+    # Normalize paths for comparison
+    abs_path = os.path.abspath(path)
+    abs_temp_path = os.path.abspath(TEMPORARY_PATH)
+
+    # Check if path is under TEMPORARY_PATH
+    try:
+        # Use os.path.commonpath to properly check if path is under temp directory
+        # This prevents false positives like '/tmp/local123' matching '/tmp/local'
+        common_path = os.path.commonpath([abs_path, abs_temp_path])
+        if common_path == abs_temp_path:
+            # Path is under temp directory, convert to relative object store path
+            relative_path = os.path.relpath(abs_path, abs_temp_path)
+            # Normalize path separators to forward slashes for object store
+            return relative_path.replace(os.path.sep, "/")
+        else:
+            # Path is already a relative object store path, return as-is
+            return path.strip("/")
+    except ValueError:
+        # os.path.commonpath or os.path.relpath can raise ValueError on Windows with different drives
+        # In this case, treat as user-provided path, return as-is
+        return path.strip("/")


 def auto_heartbeater(fn: F) -> F:

@@ -199,3 +230,46 @@ async def send_periodic_heartbeat(delay: float, *details: Any) -> None:
     while True:
         await asyncio.sleep(delay)
         activity.heartbeat(*details)
+
+
+def find_local_files_by_extension(
+    path: str,
+    extension: str,
+    file_names: Optional[List[str]] = None,
+) -> List[str]:
+    """Find local files at the specified local path, optionally filtering by file names.
+
+    Args:
+        path (str): Local path to search in (file or directory)
+        extension (str): File extension to filter by (e.g., '.parquet', '.json')
+        file_names (Optional[List[str]]): List of file names (basenames) to filter by, paths are not supported
+
+    Returns:
+        List[str]: List of matching file paths
+
+    Example:
+        >>> find_local_files_by_extension("/data", ".parquet", ["file1.parquet", "file2.parquet"])
+        ['file1.parquet', 'file2.parquet']
+
+        >>> find_local_files_by_extension("/data/single.json", ".json")
+        ['single.json']
+    """
+    if os.path.isfile(path) and path.endswith(extension):
+        # Single file - return it directly
+        return [path]
+
+    elif os.path.isdir(path):
+        # Directory - find all files in directory
+        all_files = glob.glob(
+            os.path.join(path, "**", f"*{extension}"),
+            recursive=True,
+        )
+
+        # Filter by file names if specified
+        if file_names:
+            file_names_set = set(file_names)  # Convert to set for O(1) lookup
+            return [f for f in all_files if os.path.basename(f) in file_names_set]
+        else:
+            return all_files
+
+    return []
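Taken together, the two helpers added above cover path normalization and local file discovery. A minimal usage sketch, not part of the diff; it assumes the helpers are imported from application_sdk.activities.common.utils (as the file path above suggests) and that TEMPORARY_PATH is the SDK's configured temporary directory:

    from application_sdk.activities.common.utils import (
        find_local_files_by_extension,
        get_object_store_prefix,
    )

    # Paths under TEMPORARY_PATH collapse to a relative object store prefix;
    # anything else is treated as an already-relative prefix and trimmed of slashes.
    prefix = get_object_store_prefix(
        "./local/tmp/artifacts/apps/appName/workflows/wf-123/run-456"
    )

    # Recursively collect .parquet files under a directory, optionally filtered by basename.
    files = find_local_files_by_extension(
        "/data", ".parquet", file_names=["file1.parquet"]
    )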
application_sdk/activities/metadata_extraction/sql.py

@@ -1,5 +1,20 @@
 import os
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    Generator,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    cast,
+    overload,
+)

 from temporalio import activity

@@ -13,7 +28,12 @@ from application_sdk.activities.common.utils import (
 from application_sdk.clients.sql import BaseSQLClient
 from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.common.error_codes import ActivityError
-from application_sdk.common.utils import
+from application_sdk.common.utils import (
+    get_database_names,
+    parse_credentials_extra,
+    prepare_query,
+    read_sql_files,
+)
 from application_sdk.constants import APP_TENANT_ID, APPLICATION_NAME, SQL_QUERIES_PATH
 from application_sdk.handlers.sql import BaseSQLHandler
 from application_sdk.inputs.parquet import ParquetInput

@@ -31,6 +51,9 @@ activity.logger = logger

 queries = read_sql_files(queries_prefix=SQL_QUERIES_PATH)

+if TYPE_CHECKING:
+    import pandas as pd
+

 class BaseSQLMetadataExtractionActivitiesState(ActivitiesState):
     """State class for SQL metadata extraction activities.
@@ -90,6 +113,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         sql_client_class: Optional[Type[BaseSQLClient]] = None,
         handler_class: Optional[Type[BaseSQLHandler]] = None,
         transformer_class: Optional[Type[TransformerInterface]] = None,
+        multidb: bool = False,
     ):
         """Initialize the SQL metadata extraction activities.

@@ -100,6 +124,8 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
                 Defaults to BaseSQLHandler.
             transformer_class (Type[TransformerInterface], optional): Class for metadata transformation.
                 Defaults to QueryBasedTransformer.
+            multidb (bool): When True, executes queries across multiple databases using
+                `multidb_query_executor`. Defaults to False.
         """
         if sql_client_class:
             self.sql_client_class = sql_client_class

@@ -108,6 +134,9 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         if transformer_class:
             self.transformer_class = transformer_class

+        # Control whether to execute per-db using multidb executor
+        self.multidb = multidb
+
         super().__init__()

         # State methods
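The new multidb flag is only a constructor switch; a hedged sketch of opting in (the bare instantiation is illustrative, real connectors typically also pass their own client, handler, and transformer classes):

    from application_sdk.activities.metadata_extraction.sql import (
        BaseSQLMetadataExtractionActivities,
    )

    # All other arguments keep their defaults; only the new flag is shown.
    activities = BaseSQLMetadataExtractionActivities(multidb=True)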
@@ -206,6 +235,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             raise ValueError("Missing required workflow arguments")
         return output_prefix, output_path, typename, workflow_id, workflow_run_id

+    @overload
     async def query_executor(
         self,
         sql_engine: Any,
@@ -213,7 +243,38 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         workflow_args: Dict[str, Any],
         output_suffix: str,
         typename: str,
-
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = False,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[ActivityStatistics]: ...
+
+    @overload
+    async def query_executor(
+        self,
+        sql_engine: Any,
+        sql_query: Optional[str],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = True,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]: ...
+
+    async def query_executor(
+        self,
+        sql_engine: Any,
+        sql_query: Optional[str],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = False,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
         """
         Executes a SQL query using the provided engine and saves the results to Parquet.

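A hedged sketch of the default call shape implied by the final signature above (single database, write_to_file=True); the engine and workflow_args objects are assumed to exist, and workflow_args must carry output_prefix and output_path:

    stats = await activities.query_executor(
        sql_engine=engine,
        sql_query="SELECT * FROM information_schema.tables",
        workflow_args=workflow_args,
        output_suffix="raw/table",
        typename="table",
    )
    # With the defaults, results are written to Parquet and ActivityStatistics
    # for the generated files are returned.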
@@ -233,44 +294,358 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             typename: Type name used for generating output statistics.

         Returns:
-            Optional[ActivityStatistics]: Statistics about the generated Parquet file,
-            or None if the query is empty or execution fails
+            Optional[Union[ActivityStatistics, pd.DataFrame]]: Statistics about the generated Parquet file,
+            or a DataFrame if return_dataframe=True, or None if the query is empty or execution fails.

         Raises:
             ValueError: If `sql_engine` is not provided.
         """
+        # Common pre-checks and setup shared by both multidb and single-db paths
+        if not sql_query:
+            logger.warning("Query is empty, skipping execution.")
+            return None
+
         if not sql_engine:
             logger.error("SQL engine is not set.")
             raise ValueError("SQL engine must be provided.")
-
-
+
+        # Setup parquet output using helper method
+        parquet_output = self._setup_parquet_output(
+            workflow_args, output_suffix, write_to_file
+        )
+
+        # If multidb mode is enabled, run per-database flow
+        if getattr(self, "multidb", False):
+            return await self._execute_multidb_flow(
+                sql_client,
+                sql_query,
+                workflow_args,
+                output_suffix,
+                typename,
+                write_to_file,
+                concatenate,
+                return_dataframe,
+                parquet_output,
+            )
+
+        # Single-db execution path
+        # Prepare query for single-db execution
+        prepared_query = self._prepare_database_query(
+            sql_query, None, workflow_args, typename
+        )
+
+        # Execute using helper method
+        success, _ = await self._execute_single_db(
+            sql_engine, prepared_query, parquet_output, write_to_file
+        )
+
+        if not success:
+            logger.error("Failed to execute single-db query")
             return None

-
-
-
+        if parquet_output:
+            logger.info(
+                f"Successfully wrote query results to {parquet_output.get_full_path()}"
+            )
+            return await parquet_output.get_statistics(typename=typename)
+
+        logger.warning("No parquet output configured for single-db execution")
+        return None
+
+    def _setup_parquet_output(
+        self,
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        write_to_file: bool,
+    ) -> Optional[ParquetOutput]:
+        if not write_to_file:
+            return None
+        output_prefix = workflow_args.get("output_prefix")
+        output_path = workflow_args.get("output_path")
+        if not output_prefix or not output_path:
+            logger.error("Output prefix or path not provided in workflow_args.")
+            raise ValueError(
+                "Output prefix and path must be specified in workflow_args."
+            )
+        return ParquetOutput(
+            output_prefix=output_prefix,
+            output_path=output_path,
+            output_suffix=output_suffix,
+        )

-
-
+    def _get_temp_table_regex_sql(self, typename: str) -> str:
+        """Get the appropriate temp table regex SQL based on typename."""
+        if typename == "column":
+            return self.extract_temp_table_regex_column_sql or ""
+        elif typename == "table":
+            return self.extract_temp_table_regex_table_sql or ""
+        else:
+            return ""

-
-
-
-
+    def _prepare_database_query(
+        self,
+        sql_query: str,
+        database_name: Optional[str],
+        workflow_args: Dict[str, Any],
+        typename: str,
+        use_posix_regex: bool = False,
+    ) -> Optional[str]:
+        """Prepare query for database execution with proper substitutions."""
+        # Replace database name placeholder if provided
+        fetch_sql = sql_query
+        if database_name:
+            fetch_sql = fetch_sql.replace("{database_name}", database_name)
+
+        # Get temp table regex SQL
+        temp_table_regex_sql = self._get_temp_table_regex_sql(typename)
+
+        # Prepare the query
+        prepared_query = prepare_query(
+            query=fetch_sql,
+            workflow_args=workflow_args,
+            temp_table_regex_sql=temp_table_regex_sql,
+            use_posix_regex=use_posix_regex,
+        )
+
+        if prepared_query is None:
+            db_context = f" for database {database_name}" if database_name else ""
+            raise ValueError(f"Failed to prepare query{db_context}")
+
+        return prepared_query
+
+    async def _setup_database_connection(
+        self,
+        sql_client: BaseSQLClient,
+        database_name: str,
+    ) -> None:
+        """Setup connection for a specific database."""
+        extra = parse_credentials_extra(sql_client.credentials)
+        extra["database"] = database_name
+        sql_client.credentials["extra"] = extra
+        await sql_client.load(sql_client.credentials)
+
+    # NOTE: Consolidated: per-database processing is now inlined in the multi-DB loop
+
+    async def _finalize_multidb_results(
+        self,
+        write_to_file: bool,
+        concatenate: bool,
+        return_dataframe: bool,
+        parquet_output: Optional[ParquetOutput],
+        dataframe_list: List[
+            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+        ],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+        """Finalize results for multi-database execution."""
+        if write_to_file and parquet_output:
+            return await parquet_output.get_statistics(typename=typename)
+
+        if not write_to_file and concatenate:
+            try:
+                import pandas as pd  # type: ignore
+
+                valid_dataframes: List[pd.DataFrame] = []
+                for df_generator in dataframe_list:
+                    if df_generator is None:
+                        continue
+                    for dataframe in df_generator:  # type: ignore[assignment]
+                        if dataframe is None:
+                            continue
+                        if hasattr(dataframe, "empty") and getattr(dataframe, "empty"):
+                            continue
+                        valid_dataframes.append(dataframe)
+
+                if not valid_dataframes:
+                    logger.warning(
+                        "No valid dataframes collected across databases for concatenation"
+                    )
+                    return None
+
+                concatenated = pd.concat(valid_dataframes, ignore_index=True)
+
+                if return_dataframe:
+                    return concatenated  # type: ignore[return-value]
+
+                # Create new parquet output for concatenated data
+                concatenated_parquet_output = self._setup_parquet_output(
+                    workflow_args, output_suffix, True
                 )
+                if concatenated_parquet_output:
+                    await concatenated_parquet_output.write_dataframe(concatenated)  # type: ignore[arg-type]
+                    return await concatenated_parquet_output.get_statistics(
+                        typename=typename
+                    )
+            except Exception as e:  # noqa: BLE001
+                logger.error(
+                    f"Error concatenating multi-DB dataframes: {str(e)}",
+                    exc_info=True,
+                )
+                raise
+
+        logger.warning(
+            "multidb execution returned no output to write (write_to_file=False, concatenate=False)"
+        )
+        return None

-
-
-
-
+    async def _execute_multidb_flow(
+        self,
+        sql_client: Optional[BaseSQLClient],
+        sql_query: str,
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool,
+        concatenate: bool,
+        return_dataframe: bool,
+        parquet_output: Optional[ParquetOutput],
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+        """Execute multi-database flow with proper error handling and result finalization."""
+        # Get effective SQL client
+        effective_sql_client = sql_client
+        if effective_sql_client is None:
+            state = cast(
+                BaseSQLMetadataExtractionActivitiesState,
+                await self._get_state(workflow_args),
             )
-
-
-
+            effective_sql_client = state.sql_client
+
+        if not effective_sql_client:
+            logger.error("SQL client not initialized for multidb execution")
+            raise ValueError("SQL client not initialized")
+
+        # Resolve databases to iterate
+        database_names = await get_database_names(
+            effective_sql_client, workflow_args, self.fetch_database_sql
+        )
+        if not database_names:
+            logger.warning("No databases found to process")
+            return None
+
+        # Validate client
+        if not effective_sql_client.engine:
+            logger.error("SQL client engine not initialized")
+            raise ValueError("SQL client engine not initialized")
+
+        successful_databases: List[str] = []
+        failed_databases: List[str] = []
+        dataframe_list: List[
+            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+        ] = []
+
+        # Iterate databases and execute (consolidated single-db processing)
+        for database_name in database_names or []:
+            try:
+                # Setup connection for this database
+                await self._setup_database_connection(
+                    effective_sql_client, database_name
+                )
+
+                # Prepare query for this database
+                prepared_query = self._prepare_database_query(
+                    sql_query,
+                    database_name,
+                    workflow_args,
+                    typename,
+                    use_posix_regex=True,
+                )
+
+                # Execute using helper method
+                success, batched_iter = await self._execute_single_db(
+                    effective_sql_client.engine,
+                    prepared_query,
+                    parquet_output,
+                    write_to_file,
+                )
+
+                if success:
+                    logger.info(f"Successfully processed database: {database_name}")
+                else:
+                    logger.warning(
+                        f"Failed to execute query for database: {database_name}"
+                    )
+            except Exception as e:  # noqa: BLE001
+                logger.warning(
+                    f"Failed to process database '{database_name}': {str(e)}. Skipping to next database."
+                )
+                success, batched_iter = False, None
+
+            if success:
+                successful_databases.append(database_name)
+                if not write_to_file and batched_iter:
+                    dataframe_list.append(batched_iter)
+            else:
+                failed_databases.append(database_name)
+
+        # Log results
+        logger.info(
+            f"Successfully processed {len(successful_databases)} databases: {successful_databases}"
+        )
+        if failed_databases:
+            logger.warning(
+                f"Failed to process {len(failed_databases)} databases: {failed_databases}"
            )

-
-
+        # Finalize results
+        return await self._finalize_multidb_results(
+            write_to_file,
+            concatenate,
+            return_dataframe,
+            parquet_output,
+            dataframe_list,
+            workflow_args,
+            output_suffix,
+            typename,
+        )
+
+    async def _execute_single_db(
+        self,
+        sql_engine: Any,
+        prepared_query: Optional[str],
+        parquet_output: Optional[ParquetOutput],
+        write_to_file: bool,
+    ) -> Tuple[
+        bool, Optional[Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]]
+    ]:  # type: ignore
+        if not prepared_query:
+            logger.error("Prepared query is None, cannot execute")
+            return False, None
+
+        try:
+            sql_input = SQLQueryInput(engine=sql_engine, query=prepared_query)
+            batched_iter = await sql_input.get_batched_dataframe()
+
+            if write_to_file and parquet_output:
+                # Wrap iterator into a proper (async)generator for type safety
+                if hasattr(batched_iter, "__anext__"):
+
+                    async def _to_async_gen(
+                        it: AsyncIterator["pd.DataFrame"],
+                    ) -> AsyncGenerator["pd.DataFrame", None]:
+                        async for item in it:
+                            yield item
+
+                    wrapped: AsyncGenerator["pd.DataFrame", None] = _to_async_gen(  # type: ignore
+                        batched_iter  # type: ignore
+                    )
+                    await parquet_output.write_batched_dataframe(wrapped)
+                else:
+
+                    def _to_gen(
+                        it: Iterator["pd.DataFrame"],
+                    ) -> Generator["pd.DataFrame", None, None]:
+                        for item in it:
+                            yield item
+
+                    wrapped_sync: Generator["pd.DataFrame", None, None] = _to_gen(  # type: ignore
+                        batched_iter  # type: ignore
+                    )
+                    await parquet_output.write_batched_dataframe(wrapped_sync)
+                return True, None
+
+            return True, batched_iter
         except Exception as e:
             logger.error(
                 f"Error during query execution or output writing: {e}", exc_info=True
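Per _execute_multidb_flow and _finalize_multidb_results above, the in-memory variant only applies when the activity was constructed with multidb=True; a hedged sketch (query text and surrounding names are illustrative):

    df = await activities.query_executor(
        sql_engine=engine,
        sql_query="SELECT * FROM {database_name}.information_schema.tables",
        workflow_args=workflow_args,
        output_suffix="raw/table",
        typename="table",
        write_to_file=False,    # skip ParquetOutput
        concatenate=True,       # collect per-database batches
        return_dataframe=True,  # return the concatenated pandas DataFrame
    )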
@@ -485,9 +860,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):

         raw_input = ParquetInput(
             path=os.path.join(output_path, "raw"),
-            input_prefix=output_prefix,
             file_names=workflow_args.get("file_names"),
-            chunk_size=None,
         )
         raw_input = raw_input.get_batched_daft_dataframe()
         transformed_output = JsonOutput(
application_sdk/application/__init__.py

@@ -164,6 +164,7 @@ class BaseApplication:
         self,
         workflow_class,
         ui_enabled: bool = True,
+        has_configmap: bool = False,
     ):
         """
         Optionally set up a server for the application. (No-op by default)

@@ -176,6 +177,7 @@ class BaseApplication:
             workflow_client=self.workflow_client,
             ui_enabled=ui_enabled,
             handler=self.handler_class(client=self.client_class()),
+            has_configmap=has_configmap,
         )

         if self.event_subscriptions:
application_sdk/application/metadata_extraction/sql.py

@@ -161,12 +161,14 @@ class BaseSQLMetadataExtractionApplication(BaseApplication):
         workflow_class: Type[
             BaseSQLMetadataExtractionWorkflow
         ] = BaseSQLMetadataExtractionWorkflow,
+        has_configmap: bool = False,
     ) -> Any:
         """
         Set up the FastAPI server for the SQL metadata extraction application.

         Args:
             workflow_class (Type): Workflow class to register with the server. Defaults to BaseSQLMetadataExtractionWorkflow.
+            has_configmap (bool): Whether the application has a configmap. Defaults to False.

         Returns:
             Any: None

@@ -178,6 +180,7 @@ class BaseSQLMetadataExtractionApplication(BaseApplication):
         self.server = APIServer(
             handler=self.handler_class(sql_client=self.client_class()),
             workflow_client=self.workflow_client,
+            has_configmap=has_configmap,
         )

         # register the workflow on the application server