atlan-application-sdk 0.1.1rc42__py3-none-any.whl → 0.1.1rc44__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- application_sdk/activities/metadata_extraction/sql.py +9 -35
- application_sdk/activities/query_extraction/sql.py +0 -2
- application_sdk/inputs/parquet.py +15 -3
- application_sdk/inputs/sql_query.py +2 -2
- application_sdk/interceptors/cleanup.py +0 -1
- application_sdk/outputs/__init__.py +176 -12
- application_sdk/outputs/json.py +57 -181
- application_sdk/outputs/parquet.py +230 -161
- application_sdk/services/objectstore.py +12 -6
- application_sdk/services/statestore.py +19 -22
- application_sdk/transformers/query/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/workflows/metadata_extraction/sql.py +5 -4
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/RECORD +18 -18
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/NOTICE +0 -0

```diff
--- a/application_sdk/activities/metadata_extraction/sql.py
+++ b/application_sdk/activities/metadata_extraction/sql.py
@@ -2,10 +2,8 @@ import os
 from typing import (
     TYPE_CHECKING,
     Any,
-    AsyncGenerator,
     AsyncIterator,
     Dict,
-    Generator,
     Iterator,
     List,
     Optional,
@@ -368,9 +366,9 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
                 "Output prefix and path must be specified in workflow_args."
             )
         return ParquetOutput(
-            output_prefix=output_prefix,
             output_path=output_path,
             output_suffix=output_suffix,
+            use_consolidation=True,
         )
 
     def _get_temp_table_regex_sql(self, typename: str) -> str:
@@ -553,7 +551,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
                 )
 
                 # Execute using helper method
-                success,
+                success, batched_iterator = await self._execute_single_db(
                     effective_sql_client.engine,
                     prepared_query,
                     parquet_output,
@@ -570,12 +568,12 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
                 logger.warning(
                     f"Failed to process database '{database_name}': {str(e)}. Skipping to next database."
                 )
-                success,
+                success, batched_iterator = False, None
 
             if success:
                 successful_databases.append(database_name)
-                if not write_to_file and
-                    dataframe_list.append(
+                if not write_to_file and batched_iterator:
+                    dataframe_list.append(batched_iterator)
             else:
                 failed_databases.append(database_name)
 
@@ -615,37 +613,13 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
 
         try:
             sql_input = SQLQueryInput(engine=sql_engine, query=prepared_query)
-
+            batched_iterator = await sql_input.get_batched_dataframe()
 
             if write_to_file and parquet_output:
-
-                if hasattr(batched_iter, "__anext__"):
-
-                    async def _to_async_gen(
-                        it: AsyncIterator["pd.DataFrame"],
-                    ) -> AsyncGenerator["pd.DataFrame", None]:
-                        async for item in it:
-                            yield item
-
-                    wrapped: AsyncGenerator["pd.DataFrame", None] = _to_async_gen(  # type: ignore
-                        batched_iter  # type: ignore
-                    )
-                    await parquet_output.write_batched_dataframe(wrapped)
-                else:
-
-                    def _to_gen(
-                        it: Iterator["pd.DataFrame"],
-                    ) -> Generator["pd.DataFrame", None, None]:
-                        for item in it:
-                            yield item
-
-                    wrapped_sync: Generator["pd.DataFrame", None, None] = _to_gen(  # type: ignore
-                        batched_iter  # type: ignore
-                    )
-                    await parquet_output.write_batched_dataframe(wrapped_sync)
+                await parquet_output.write_batched_dataframe(batched_iterator)  # type: ignore
                 return True, None
 
-            return True,
+            return True, batched_iterator
         except Exception as e:
             logger.error(
                 f"Error during query execution or output writing: {e}", exc_info=True
@@ -863,10 +837,10 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             file_names=workflow_args.get("file_names"),
         )
         raw_input = raw_input.get_batched_daft_dataframe()
+
         transformed_output = JsonOutput(
             output_path=output_path,
             output_suffix="transformed",
-            output_prefix=output_prefix,
             typename=typename,
             chunk_start=workflow_args.get("chunk_start"),
         )
```
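
For readers following the metadata-extraction change: rc42 wrapped the batched result in hand-rolled sync/async generators before writing, while rc44 passes the iterator straight to the output and also returns it to the caller. A minimal sketch of the new flow, assuming only the names that appear in the hunks above (`SQLQueryInput`, `get_batched_dataframe`, `write_batched_dataframe`); the surrounding function is illustrative, not the SDK's exact method:

```python
# Illustrative sketch of the rc44 single-database path shown in the diff above.
# Only SQLQueryInput, get_batched_dataframe and write_batched_dataframe come
# from the diff; run_query itself is a simplified stand-in.
from application_sdk.inputs.sql_query import SQLQueryInput


async def run_query(sql_engine, prepared_query, parquet_output, write_to_file=True):
    sql_input = SQLQueryInput(engine=sql_engine, query=prepared_query)
    batched_iterator = await sql_input.get_batched_dataframe()

    if write_to_file and parquet_output:
        # rc42 first converted the iterator into a sync or async generator;
        # rc44 hands it to the output directly.
        await parquet_output.write_batched_dataframe(batched_iterator)
        return True, None

    # When not writing to file, the iterator is now returned so the caller
    # can collect it (dataframe_list in the hunks above).
    return True, batched_iterator
```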

```diff
--- a/application_sdk/activities/query_extraction/sql.py
+++ b/application_sdk/activities/query_extraction/sql.py
@@ -210,7 +210,6 @@ class SQLQueryExtractionActivities(ActivitiesInterface):
         sql_input = await sql_input.get_dataframe()
 
         raw_output = ParquetOutput(
-            output_prefix=workflow_args["output_prefix"],
             output_path=workflow_args["output_path"],
             output_suffix="raw/query",
             chunk_size=workflow_args["miner_args"].get("chunk_size", 100000),
@@ -218,7 +217,6 @@ class SQLQueryExtractionActivities(ActivitiesInterface):
             end_marker=workflow_args["end_marker"],
         )
         await raw_output.write_dataframe(sql_input)
-
         logger.info(
             f"Query fetch completed, {raw_output.total_record_count} records processed",
         )
```

```diff
--- a/application_sdk/inputs/parquet.py
+++ b/application_sdk/inputs/parquet.py
@@ -22,6 +22,7 @@ class ParquetInput(Input):
         self,
         path: str,
         chunk_size: int = 100000,
+        buffer_size: int = 5000,
         file_names: Optional[List[str]] = None,
     ):
         """Initialize the Parquet input class.
@@ -32,6 +33,7 @@ class ParquetInput(Input):
                 local path or object store path
                 Wildcards are not supported.
             chunk_size (int): Number of rows per batch. Defaults to 100000.
+            buffer_size (int): Number of rows per batch. Defaults to 5000.
             file_names (Optional[List[str]]): List of file names to read. Defaults to None.
 
         Raises:
@@ -47,6 +49,7 @@ class ParquetInput(Input):
 
         self.path = path
         self.chunk_size = chunk_size
+        self.buffer_size = buffer_size
         self.file_names = file_names
 
     async def get_dataframe(self) -> "pd.DataFrame":
@@ -249,9 +252,18 @@ class ParquetInput(Input):
             parquet_files = await self.download_files()
             logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
 
-            #
-
-
+            # Create a lazy dataframe without loading data into memory
+            lazy_df = daft.read_parquet(parquet_files)
+
+            # Get total count efficiently
+            total_rows = lazy_df.count_rows()
+
+            # Yield chunks without loading everything into memory
+            for offset in range(0, total_rows, self.buffer_size):
+                chunk = lazy_df.offset(offset).limit(self.buffer_size)
+                yield chunk
+
+            del lazy_df
 
         except Exception as error:
             logger.error(
```
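
The `ParquetInput` change adds a `buffer_size` knob and switches the daft path to a lazy read that is sliced with `offset`/`limit` instead of materializing everything up front. A compact sketch of the same pattern outside the class; the function name and file list are illustrative, while `daft.read_parquet`, `count_rows`, `offset`, and `limit` are the calls used in the hunk above:

```python
# Sketch of the lazy, buffered read pattern introduced above.
import daft


def iter_parquet_batches(parquet_files: list[str], buffer_size: int = 5000):
    lazy_df = daft.read_parquet(parquet_files)  # builds a lazy plan, no data loaded yet
    total_rows = lazy_df.count_rows()           # materializes only the row count

    for offset in range(0, total_rows, buffer_size):
        # Each slice stays lazy until the caller materializes it
        # (for example with .to_pandas() or .collect()).
        yield lazy_df.offset(offset).limit(buffer_size)
```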

```diff
--- a/application_sdk/inputs/sql_query.py
+++ b/application_sdk/inputs/sql_query.py
@@ -34,7 +34,7 @@ class SQLQueryInput(Input):
         self,
         query: str,
         engine: Union["Engine", str],
-        chunk_size: Optional[int] =
+        chunk_size: Optional[int] = 5000,
     ):
         """Initialize the async SQL query input handler.
 
@@ -42,7 +42,7 @@ class SQLQueryInput(Input):
             engine (Union[Engine, str]): SQLAlchemy engine or connection string.
             query (str): The SQL query to execute.
             chunk_size (Optional[int], optional): Number of rows per batch.
-                Defaults to
+                Defaults to 5000.
         """
         self.query = query
         self.engine = engine
```

```diff
--- a/application_sdk/interceptors/cleanup.py
+++ b/application_sdk/interceptors/cleanup.py
@@ -114,7 +114,6 @@ class CleanupWorkflowInboundInterceptor(WorkflowInboundInterceptor):
                 retry_policy=RetryPolicy(
                     maximum_attempts=3,
                 ),
-                summary="This activity is used to cleanup the local artifacts and the activity state after the workflow is completed.",
             )
 
             logger.info("Cleanup completed successfully")
```

```diff
--- a/application_sdk/outputs/__init__.py
+++ b/application_sdk/outputs/__init__.py
@@ -4,8 +4,11 @@ This module provides base classes and utilities for handling various types of da
 in the application, including file outputs and object store interactions.
 """
 
+import gc
 import inspect
+import os
 from abc import ABC, abstractmethod
+from enum import Enum
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -13,7 +16,6 @@ from typing import (
     Dict,
     Generator,
     List,
-    Literal,
     Optional,
     Union,
     cast,
@@ -26,6 +28,7 @@ from application_sdk.activities.common.models import ActivityStatistics
 from application_sdk.activities.common.utils import get_object_store_prefix
 from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.observability.logger_adaptor import get_logger
+from application_sdk.observability.metrics_adaptor import MetricType
 from application_sdk.services.objectstore import ObjectStore
 
 logger = get_logger(__name__)
@@ -36,6 +39,14 @@ if TYPE_CHECKING:
     import pandas as pd
 
 
+class WriteMode(Enum):
+    """Enumeration of write modes for output operations."""
+
+    APPEND = "append"
+    OVERWRITE = "overwrite"
+    OVERWRITE_PARTITIONS = "overwrite-partitions"
+
+
 class Output(ABC):
     """Abstract base class for output handlers.
 
```
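
The new `WriteMode` enum defined above supplies the `mode` label that later hunks attach to write metrics; its string values are what end up in the recorded labels. A quick illustration (the import path follows from the class living in `application_sdk/outputs/__init__.py`):

```python
# WriteMode is defined in application_sdk/outputs/__init__.py (see hunk above);
# .value is the string recorded in metric labels such as {"mode": "append"}.
from application_sdk.outputs import WriteMode

assert WriteMode.APPEND.value == "append"
assert WriteMode.OVERWRITE.value == "overwrite"
assert WriteMode.OVERWRITE_PARTITIONS.value == "overwrite-partitions"
```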

```diff
@@ -53,11 +64,13 @@ class Output(ABC):
     output_prefix: str
     total_record_count: int
     chunk_count: int
-
+    buffer_size: int
+    max_file_size_bytes: int
+    current_buffer_size: int
+    current_buffer_size_bytes: int
+    partitions: List[int]
 
-    def
-        self, dataframe: "pd.DataFrame", file_type: Literal["json", "parquet"]
-    ) -> int:
+    def estimate_dataframe_record_size(self, dataframe: "pd.DataFrame") -> int:
         """Estimate File size of a DataFrame by sampling a few records."""
         if len(dataframe) == 0:
             return 0
@@ -65,16 +78,47 @@ class Output(ABC):
         # Sample up to 10 records to estimate average size
         sample_size = min(10, len(dataframe))
         sample = dataframe.head(sample_size)
+        file_type = type(self).__name__.lower().replace("output", "")
+        compression_factor = 1
         if file_type == "json":
             sample_file = sample.to_json(orient="records", lines=True)
         else:
             sample_file = sample.to_parquet(index=False, compression="snappy")
+            compression_factor = 0.01
         if sample_file is not None:
-            avg_record_size = len(sample_file) / sample_size
-            return int(avg_record_size
+            avg_record_size = len(sample_file) / sample_size * compression_factor
+            return int(avg_record_size)
 
         return 0
 
+    def path_gen(
+        self,
+        chunk_count: Optional[int] = None,
+        chunk_part: int = 0,
+        start_marker: Optional[str] = None,
+        end_marker: Optional[str] = None,
+    ) -> str:
+        """Generate a file path for a chunk.
+
+        Args:
+            chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
+            chunk_count (int): Total number of chunks.
+            start_marker (Optional[str]): Start marker for query extraction.
+            end_marker (Optional[str]): End marker for query extraction.
+
+        Returns:
+            str: Generated file path for the chunk.
+        """
+        # For Query Extraction - use start and end markers without chunk count
+        if start_marker and end_marker:
+            return f"{start_marker}_{end_marker}{self._EXTENSION}"
+
+        # For regular chunking - include chunk count
+        if chunk_count is None:
+            return f"{str(chunk_part)}{self._EXTENSION}"
+        else:
+            return f"chunk-{str(chunk_count)}-part{str(chunk_part)}{self._EXTENSION}"
+
     def process_null_fields(
         self,
         obj: Any,
```
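
The new `path_gen` above centralizes chunk file naming: marker-based names for query extraction, bare part indices for a single chunk, and `chunk-<count>-part<part>` otherwise. A standalone sketch of the same scheme; the `.json` extension is an assumption, since concrete outputs such as `JsonOutput`/`ParquetOutput` are expected to set `_EXTENSION`:

```python
# Minimal sketch of the naming scheme implemented by path_gen above.
from typing import Optional


def path_gen_demo(
    chunk_count: Optional[int] = None,
    chunk_part: int = 0,
    start_marker: Optional[str] = None,
    end_marker: Optional[str] = None,
    extension: str = ".json",  # assumption: stands in for the class's _EXTENSION
) -> str:
    if start_marker and end_marker:          # query extraction: marker-based names
        return f"{start_marker}_{end_marker}{extension}"
    if chunk_count is None:                  # single chunk: just the part index
        return f"{chunk_part}{extension}"
    return f"chunk-{chunk_count}-part{chunk_part}{extension}"


# path_gen_demo(chunk_count=3, chunk_part=1)          -> "chunk-3-part1.json"
# path_gen_demo(chunk_part=0)                         -> "0.json"
# path_gen_demo(start_marker="100", end_marker="200") -> "100_200.json"
```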

```diff
@@ -146,15 +190,86 @@ class Output(ABC):
                 await self.write_dataframe(dataframe)
         except Exception as e:
             logger.error(f"Error writing batched dataframe: {str(e)}")
+            raise
 
-    @abstractmethod
     async def write_dataframe(self, dataframe: "pd.DataFrame"):
-        """Write a pandas DataFrame to
+        """Write a pandas DataFrame to Parquet files and upload to object store.
 
         Args:
             dataframe (pd.DataFrame): The DataFrame to write.
         """
-
+        try:
+            if self.chunk_start is None:
+                self.chunk_part = 0
+            if len(dataframe) == 0:
+                return
+
+            chunk_size_bytes = self.estimate_dataframe_record_size(dataframe)
+
+            for i in range(0, len(dataframe), self.buffer_size):
+                chunk = dataframe[i : i + self.buffer_size]
+
+                if (
+                    self.current_buffer_size_bytes + chunk_size_bytes
+                    > self.max_file_size_bytes
+                ):
+                    output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
+                    if os.path.exists(output_file_name):
+                        await self._upload_file(output_file_name)
+                        self.chunk_part += 1
+
+                self.current_buffer_size += len(chunk)
+                self.current_buffer_size_bytes += chunk_size_bytes * len(chunk)
+                await self._flush_buffer(chunk, self.chunk_part)
+
+                del chunk
+                gc.collect()
+
+            if self.current_buffer_size_bytes > 0:
+                # Finally upload the final file to the object store
+                output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
+                if os.path.exists(output_file_name):
+                    await self._upload_file(output_file_name)
+                    self.chunk_part += 1
+
+            # Record metrics for successful write
+            self.metrics.record_metric(
+                name="write_records",
+                value=len(dataframe),
+                metric_type=MetricType.COUNTER,
+                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
+                description="Number of records written to files from pandas DataFrame",
+            )
+
+            # Record chunk metrics
+            self.metrics.record_metric(
+                name="chunks_written",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
+                description="Number of chunks written to files",
+            )
+
+            # If chunk_start is set we don't want to increment the chunk_count
+            # Since it should only increment the chunk_part in this case
+            if self.chunk_start is None:
+                self.chunk_count += 1
+                self.partitions.append(self.chunk_part)
+        except Exception as e:
+            # Record metrics for failed write
+            self.metrics.record_metric(
+                name="write_errors",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={
+                    "type": "pandas",
+                    "mode": WriteMode.APPEND.value,
+                    "error": str(e),
+                },
+                description="Number of errors while writing to files",
+            )
+            logger.error(f"Error writing pandas dataframe to files: {str(e)}")
+            raise
 
     async def write_batched_daft_dataframe(
         self,
```
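
The rewritten `write_dataframe` above slices the incoming DataFrame into `buffer_size`-row chunks, flushes each chunk, and starts a new part file once the estimated bytes already buffered would exceed `max_file_size_bytes`. A simplified, synchronous sketch of that rollover logic; metrics, uploads, and the per-record size estimate are omitted, and every name here is illustrative:

```python
# Simplified illustration of the buffer/rollover logic in the hunk above.
# It only shows how rows are grouped into part files; the real method also
# estimates record size, uploads finished parts, and records metrics.
import pandas as pd


def plan_parts(df: pd.DataFrame, buffer_size: int, max_file_size_bytes: int,
               est_record_bytes: int) -> list[int]:
    """Return the number of rows that would land in each part file."""
    parts: list[int] = [0]
    buffered_bytes = 0
    for i in range(0, len(df), buffer_size):
        chunk = df.iloc[i : i + buffer_size]
        if buffered_bytes + est_record_bytes > max_file_size_bytes and parts[-1] > 0:
            parts.append(0)        # roll over to a new part file
            buffered_bytes = 0
        parts[-1] += len(chunk)
        buffered_bytes += est_record_bytes * len(chunk)
    return parts


# Example: 25_000 rows, 5_000-row buffers, ~100 bytes/record, 1 MiB part cap
# -> plan_parts(df, 5_000, 1 << 20, 100) groups the rows into a few parts.
```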

```diff
@@ -225,6 +340,55 @@ class Output(ABC):
             logger.error(f"Error getting statistics: {str(e)}")
             raise
 
+    async def _upload_file(self, file_name: str):
+        """Upload a file to the object store."""
+        await ObjectStore.upload_file(
+            source=file_name,
+            destination=get_object_store_prefix(file_name),
+        )
+
+        self.current_buffer_size_bytes = 0
+
+    async def _flush_buffer(self, chunk: "pd.DataFrame", chunk_part: int):
+        """Flush the current buffer to a JSON file.
+
+        This method combines all DataFrames in the buffer, writes them to a JSON file,
+        and uploads the file to the object store.
+
+        Note:
+            If the buffer is empty or has no records, the method returns without writing.
+        """
+        try:
+            if not is_empty_dataframe(chunk):
+                self.total_record_count += len(chunk)
+                output_file_name = (
+                    f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
+                )
+                await self.write_chunk(chunk, output_file_name)
+
+                self.current_buffer_size = 0
+
+                # Record chunk metrics
+                self.metrics.record_metric(
+                    name="chunks_written",
+                    value=1,
+                    metric_type=MetricType.COUNTER,
+                    labels={"type": "output"},
+                    description="Number of chunks written to files",
+                )
+
+        except Exception as e:
+            # Record metrics for failed write
+            self.metrics.record_metric(
+                name="write_errors",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "output", "error": str(e)},
+                description="Number of errors while writing to files",
+            )
+            logger.error(f"Error flushing buffer to files: {str(e)}")
+            raise e
+
     async def write_statistics(self) -> Optional[Dict[str, Any]]:
         """Write statistics about the output to a JSON file.
 
@@ -238,8 +402,8 @@
         # prepare the statistics
         statistics = {
             "total_record_count": self.total_record_count,
-            "chunk_count": self.
-            "partitions": self.
+            "chunk_count": len(self.partitions),
+            "partitions": self.partitions,
         }
 
         # Write the statistics to a json file
```
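
With `_upload_file` and `_flush_buffer` in place, `write_statistics` now derives `chunk_count` from the recorded `partitions` list instead of a separately tracked counter. A sketch of the resulting payload; only the keys come from the hunk above, and the values are made up:

```python
# Illustrative statistics payload after this change. Each entry in "partitions"
# holds the chunk_part value recorded for one chunk, and chunk_count is simply
# len(partitions).
statistics = {
    "total_record_count": 250_000,
    "chunk_count": 2,            # == len(partitions)
    "partitions": [3, 1],
}
```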
|