acryl-datahub-cloud 0.3.11rc0__py3-none-any.whl → 0.3.16.1rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub-cloud might be problematic.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/acryl_cs_issues/models.py +5 -3
- acryl_datahub_cloud/action_request/action_request_owner_source.py +36 -6
- acryl_datahub_cloud/datahub_forms_notifications/__init__.py +0 -0
- acryl_datahub_cloud/datahub_forms_notifications/forms_notifications_source.py +569 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_feature_flag.gql +7 -0
- acryl_datahub_cloud/datahub_forms_notifications/get_search_results_total.gql +14 -0
- acryl_datahub_cloud/datahub_forms_notifications/query.py +17 -0
- acryl_datahub_cloud/datahub_forms_notifications/scroll_forms_for_notification.gql +29 -0
- acryl_datahub_cloud/datahub_forms_notifications/send_form_notification_request.gql +5 -0
- acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +37 -13
- acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +55 -24
- acryl_datahub_cloud/datahub_reporting/extract_graph.py +4 -3
- acryl_datahub_cloud/datahub_reporting/extract_sql.py +242 -51
- acryl_datahub_cloud/datahub_reporting/forms.py +1 -1
- acryl_datahub_cloud/datahub_reporting/forms_config.py +3 -2
- acryl_datahub_cloud/datahub_restore/source.py +3 -2
- acryl_datahub_cloud/datahub_usage_reporting/excluded.py +94 -0
- acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +48 -8
- acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +518 -77
- acryl_datahub_cloud/elasticsearch/graph_service.py +76 -14
- acryl_datahub_cloud/graphql_utils.py +64 -0
- acryl_datahub_cloud/lineage_features/source.py +555 -49
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +2296 -1900
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionworkflow/__init__.py +53 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/anomaly/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +4 -2
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/conversation/__init__.py +29 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/execution/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/form/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/identity/__init__.py +8 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/knowledge/__init__.py +33 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +12 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/search/features/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/notification/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +28 -0
- acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- acryl_datahub_cloud/metadata/schema.avsc +25091 -20557
- acryl_datahub_cloud/metadata/schema_classes.py +29269 -23863
- acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +235 -2
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowInfo.avsc +683 -0
- acryl_datahub_cloud/metadata/schemas/ActionWorkflowKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/Actors.avsc +38 -1
- acryl_datahub_cloud/metadata/schemas/ApplicationKey.avsc +31 -0
- acryl_datahub_cloud/metadata/schemas/ApplicationProperties.avsc +75 -0
- acryl_datahub_cloud/metadata/schemas/Applications.avsc +38 -0
- acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +353 -215
- acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +147 -20
- acryl_datahub_cloud/metadata/schemas/AssertionKey.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +166 -21
- acryl_datahub_cloud/metadata/schemas/{AssertionSummary.avsc → AssertionRunSummary.avsc} +15 -2
- acryl_datahub_cloud/metadata/schemas/AssertionsSummary.avsc +54 -0
- acryl_datahub_cloud/metadata/schemas/AssetSettings.avsc +63 -0
- acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +20 -6
- acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ConstraintInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +16 -5
- acryl_datahub_cloud/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupInfo.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/CorpGroupSettings.avsc +127 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserInfo.avsc +18 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserInvitationStatus.avsc +106 -0
- acryl_datahub_cloud/metadata/schemas/CorpUserKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/CorpUserSettings.avsc +304 -2
- acryl_datahub_cloud/metadata/schemas/CorpUserUsageFeatures.avsc +86 -0
- acryl_datahub_cloud/metadata/schemas/DashboardInfo.avsc +11 -5
- acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +15 -5
- acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationInfo.avsc +256 -0
- acryl_datahub_cloud/metadata/schemas/DataHubAiConversationKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileInfo.avsc +234 -0
- acryl_datahub_cloud/metadata/schemas/DataHubFileKey.avsc +22 -0
- acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageModuleProperties.avsc +308 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- acryl_datahub_cloud/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +13 -4
- acryl_datahub_cloud/metadata/schemas/DataJobInputOutput.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/DataPlatformInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataProductKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DataProductProperties.avsc +6 -3
- acryl_datahub_cloud/metadata/schemas/DataTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +10 -2
- acryl_datahub_cloud/metadata/schemas/DatasetProperties.avsc +12 -5
- acryl_datahub_cloud/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- acryl_datahub_cloud/metadata/schemas/DocumentInfo.avsc +407 -0
- acryl_datahub_cloud/metadata/schemas/DocumentKey.avsc +35 -0
- acryl_datahub_cloud/metadata/schemas/DocumentSettings.avsc +79 -0
- acryl_datahub_cloud/metadata/schemas/DomainKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/DomainProperties.avsc +7 -3
- acryl_datahub_cloud/metadata/schemas/EditableContainerProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/EntityTypeInfo.avsc +5 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestArtifactsLocation.avsc +16 -0
- acryl_datahub_cloud/metadata/schemas/ExecutionRequestKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/FormAssignmentStatus.avsc +36 -0
- acryl_datahub_cloud/metadata/schemas/FormInfo.avsc +6 -0
- acryl_datahub_cloud/metadata/schemas/FormKey.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/FormNotifications.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/FormSettings.avsc +30 -0
- acryl_datahub_cloud/metadata/schemas/GlobalSettingsInfo.avsc +416 -0
- acryl_datahub_cloud/metadata/schemas/GlobalTags.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryNodeKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/GlossaryTermKey.avsc +2 -0
- acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +3 -3
- acryl_datahub_cloud/metadata/schemas/InferredMetadata.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/InputFields.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/InviteToken.avsc +26 -0
- acryl_datahub_cloud/metadata/schemas/LineageFeatures.avsc +67 -42
- acryl_datahub_cloud/metadata/schemas/LogicalParent.avsc +145 -0
- acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +7 -1
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MLModelProperties.avsc +4 -2
- acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +4 -1
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +418 -97
- acryl_datahub_cloud/metadata/schemas/MetadataChangeLog.avsc +62 -44
- acryl_datahub_cloud/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- acryl_datahub_cloud/metadata/schemas/MonitorAnomalyEvent.avsc +54 -9
- acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +163 -23
- acryl_datahub_cloud/metadata/schemas/MonitorKey.avsc +9 -1
- acryl_datahub_cloud/metadata/schemas/MonitorSuiteInfo.avsc +128 -3
- acryl_datahub_cloud/metadata/schemas/NotebookInfo.avsc +5 -2
- acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -0
- acryl_datahub_cloud/metadata/schemas/NotificationRequest.avsc +91 -4
- acryl_datahub_cloud/metadata/schemas/Operation.avsc +17 -0
- acryl_datahub_cloud/metadata/schemas/Ownership.avsc +71 -1
- acryl_datahub_cloud/metadata/schemas/QuerySubjects.avsc +2 -13
- acryl_datahub_cloud/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- acryl_datahub_cloud/metadata/schemas/RoleProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +3 -0
- acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SemanticContent.avsc +123 -0
- acryl_datahub_cloud/metadata/schemas/StructuredProperties.avsc +69 -0
- acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- acryl_datahub_cloud/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/SubscriptionInfo.avsc +136 -5
- acryl_datahub_cloud/metadata/schemas/SubscriptionKey.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/SystemMetadata.avsc +61 -0
- acryl_datahub_cloud/metadata/schemas/TagProperties.avsc +3 -1
- acryl_datahub_cloud/metadata/schemas/TestInfo.avsc +2 -1
- acryl_datahub_cloud/metadata/schemas/UpstreamLineage.avsc +9 -0
- acryl_datahub_cloud/metadata/schemas/UsageFeatures.avsc +10 -0
- acryl_datahub_cloud/notifications/__init__.py +0 -0
- acryl_datahub_cloud/notifications/notification_recipient_builder.py +399 -0
- acryl_datahub_cloud/sdk/__init__.py +69 -0
- acryl_datahub_cloud/sdk/assertion/__init__.py +58 -0
- acryl_datahub_cloud/sdk/assertion/assertion_base.py +779 -0
- acryl_datahub_cloud/sdk/assertion/column_metric_assertion.py +191 -0
- acryl_datahub_cloud/sdk/assertion/column_value_assertion.py +431 -0
- acryl_datahub_cloud/sdk/assertion/freshness_assertion.py +201 -0
- acryl_datahub_cloud/sdk/assertion/schema_assertion.py +268 -0
- acryl_datahub_cloud/sdk/assertion/smart_column_metric_assertion.py +212 -0
- acryl_datahub_cloud/sdk/assertion/smart_freshness_assertion.py +165 -0
- acryl_datahub_cloud/sdk/assertion/smart_sql_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion/smart_volume_assertion.py +162 -0
- acryl_datahub_cloud/sdk/assertion/sql_assertion.py +273 -0
- acryl_datahub_cloud/sdk/assertion/types.py +20 -0
- acryl_datahub_cloud/sdk/assertion/volume_assertion.py +156 -0
- acryl_datahub_cloud/sdk/assertion_client/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_client/column_metric.py +545 -0
- acryl_datahub_cloud/sdk/assertion_client/column_value.py +617 -0
- acryl_datahub_cloud/sdk/assertion_client/freshness.py +371 -0
- acryl_datahub_cloud/sdk/assertion_client/helpers.py +166 -0
- acryl_datahub_cloud/sdk/assertion_client/schema.py +358 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_column_metric.py +540 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_freshness.py +373 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_sql.py +411 -0
- acryl_datahub_cloud/sdk/assertion_client/smart_volume.py +380 -0
- acryl_datahub_cloud/sdk/assertion_client/sql.py +410 -0
- acryl_datahub_cloud/sdk/assertion_client/volume.py +446 -0
- acryl_datahub_cloud/sdk/assertion_input/__init__.py +0 -0
- acryl_datahub_cloud/sdk/assertion_input/assertion_input.py +1470 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_constants.py +114 -0
- acryl_datahub_cloud/sdk/assertion_input/column_assertion_utils.py +284 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_assertion_input.py +759 -0
- acryl_datahub_cloud/sdk/assertion_input/column_metric_constants.py +109 -0
- acryl_datahub_cloud/sdk/assertion_input/column_value_assertion_input.py +810 -0
- acryl_datahub_cloud/sdk/assertion_input/freshness_assertion_input.py +305 -0
- acryl_datahub_cloud/sdk/assertion_input/schema_assertion_input.py +413 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_column_metric_assertion_input.py +793 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_freshness_assertion_input.py +218 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_sql_assertion_input.py +181 -0
- acryl_datahub_cloud/sdk/assertion_input/smart_volume_assertion_input.py +189 -0
- acryl_datahub_cloud/sdk/assertion_input/sql_assertion_input.py +320 -0
- acryl_datahub_cloud/sdk/assertion_input/volume_assertion_input.py +635 -0
- acryl_datahub_cloud/sdk/assertions_client.py +1074 -0
- acryl_datahub_cloud/sdk/entities/__init__.py +0 -0
- acryl_datahub_cloud/sdk/entities/assertion.py +439 -0
- acryl_datahub_cloud/sdk/entities/monitor.py +291 -0
- acryl_datahub_cloud/sdk/entities/subscription.py +100 -0
- acryl_datahub_cloud/sdk/errors.py +34 -0
- acryl_datahub_cloud/sdk/resolver_client.py +42 -0
- acryl_datahub_cloud/sdk/subscription_client.py +737 -0
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/METADATA +55 -49
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/RECORD +235 -142
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/WHEEL +1 -1
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/entry_points.txt +1 -0
- acryl_datahub_cloud/_sdk_extras/__init__.py +0 -4
- acryl_datahub_cloud/_sdk_extras/assertion.py +0 -15
- acryl_datahub_cloud/_sdk_extras/assertions_client.py +0 -23
- {acryl_datahub_cloud-0.3.11rc0.dist-info → acryl_datahub_cloud-0.3.16.1rc0.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/datahub_reporting/datahub_dataset.py

@@ -5,14 +5,14 @@ import pathlib
 import tempfile
 import time
 from enum import Enum
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
 
 import boto3
 import duckdb
 import pandas
 import pyarrow as pa
 import pyarrow.parquet as pq
-from pydantic import BaseModel,
+from pydantic import BaseModel, field_validator
 
 from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow, SchemaField
 from datahub.configuration.common import ConfigModel

@@ -73,7 +73,9 @@ class FileStoreBackedDatasetConfig(ConfigModel):
     store_platform: str = "s3"
     file_name: str = "data"
     file_extension: str = "parquet"
-    file_compression:
+    file_compression: Literal[
+        "gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"
+    ] = "snappy"
     file_overwrite_existing: bool = True
     snapshot_partitioning_strategy: str = PartitioningStrategy.DATE
     generate_presigned_url: bool = True

@@ -85,7 +87,8 @@ class FileStoreBackedDatasetConfig(ConfigModel):
 
     datahub_platform: str = "acryl"
 
-    @
+    @field_validator("snapshot_partitioning_strategy")
+    @classmethod
     def validate_partitioning_strategy(cls, v):
         if v not in PartitioningStrategy._value2member_map_:
             raise ValueError(f"Unsupported partitioning strategy: {v}")

@@ -119,9 +122,14 @@ class DataHubBasedS3Dataset:
         self.local_file_path: str = (
             config.file if config.file else self._initialize_local_file()
         )
-        self.file_writer = None
+        self.file_writer: Optional[pq.ParquetWriter] = None
         self.schema = (
-            pa.schema(
+            pa.schema(
+                [
+                    pa.field(x.name, BaseModelRow.string_to_pyarrow_type(x.type))
+                    for x in self.dataset_metadata.schemaFields
+                ]
+            )
             if self.dataset_metadata.schemaFields
             else None
         )

@@ -163,18 +171,32 @@ class DataHubBasedS3Dataset:
                 self.schema = row.arrow_schema()
             else:
                 # hail mary: infer schema from the first row and cast everything to string
-                self.schema = pa.schema([(key, pa.string()) for key in row])
+                self.schema = pa.schema([pa.field(key, pa.string()) for key in row])
                 self.stringify_row = True
 
             self._initialize_local_file()
+            # Map compression names to PyArrow format (most are direct mappings)
+            compression_map = {
+                "gzip": "gzip",
+                "bz2": "brotli",  # PyArrow doesn't support bz2, use brotli
+                "brotli": "brotli",
+                "lz4": "lz4",
+                "zstd": "zstd",
+                "snappy": "snappy",
+                "none": "none",
+            }
+            compression = cast(
+                Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"],
+                compression_map.get(self.config.file_compression, "snappy"),
+            )
            self.file_writer = pq.ParquetWriter(
                 self.local_file_path,
                 self.schema,
-                compression=
+                compression=compression,
             )
         if isinstance(row, (BaseModel, BaseModelRow)):
             # for anything extending BaseModel, we want to use the dict representation
-            write_row: Dict[str, Any] = row.
+            write_row: Dict[str, Any] = row.model_dump()
         elif isinstance(row, dict):
             write_row = row
         else:
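Note on the compression change above: the new file_compression field is constrained to PyArrow-compatible codec names and mapped before being handed to pyarrow.parquet.ParquetWriter. The standalone sketch below (not from this package; the function name, path, and rows are invented for illustration) shows the same writer/compression pattern:

# Hypothetical illustration of selecting a Parquet compression codec and passing it
# to pyarrow.parquet.ParquetWriter, mirroring the Literal values accepted above.
from typing import Any, Dict, List

import pyarrow as pa
import pyarrow.parquet as pq

def write_rows(path: str, rows: List[Dict[str, Any]], compression: str = "snappy") -> None:
    # Infer an all-string schema from the first row, like the "hail mary" branch above.
    schema = pa.schema([pa.field(key, pa.string()) for key in rows[0]])
    writer = pq.ParquetWriter(path, schema, compression=compression)
    try:
        table = pa.Table.from_pylist(
            [{k: str(v) for k, v in row.items()} for row in rows], schema=schema
        )
        writer.write_table(table)
    finally:
        writer.close()

# Example: "zstd" is one of the codecs allowed by the new Literal type.
write_rows("data.parquet", [{"urn": "urn:li:dataset:example", "rowCount": 42}], compression="zstd")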
@@ -271,7 +293,7 @@ class DataHubBasedS3Dataset:
         )
 
     def _generate_schema_metadata(
-        self, duckdb_columns: List[Tuple[str,
+        self, duckdb_columns: List[Tuple[str, Any]]
     ) -> SchemaMetadataClass:
         def get_type_from_dtype(dtype: str) -> SchemaFieldDataTypeClass:
             if "int" in dtype or "float" in dtype or "number" in dtype:

@@ -302,7 +324,7 @@ class DataHubBasedS3Dataset:
         )
         for column in duckdb_columns:
             # generate data type
-            data_type = column[1].lower()
+            data_type = str(column[1]).lower()
             schema_metadata.fields.append(
                 SchemaFieldClass(
                     fieldPath=column[0],

@@ -341,7 +363,7 @@ class DataHubBasedS3Dataset:
             # generate min, max, avg, distinct count, null count
             column_name = column[0]
             logger.info(f"Generating field profile for {column_name}")
-            data_type = column[1].lower()
+            data_type = str(column[1]).lower()
             if "int" in data_type or "float" in data_type:
                 query = (
                     f"SELECT COUNT(DISTINCT {column_name}), COUNT(*) - COUNT({column_name}), MIN({column_name}), MAX({column_name}), AVG({column_name})"

@@ -396,7 +418,9 @@ class DataHubBasedS3Dataset:
         assert dataset_profiles.fieldProfiles is not None
         dataset_profiles.fieldProfiles.append(field_profile)
         logger.info("Generated dataset profile")
-        schema_metadata = self._generate_schema_metadata(
+        schema_metadata = self._generate_schema_metadata(
+            [(col[0], col[1]) for col in columns]
+        )
         return dataset_profiles, schema_metadata
 
     def register_dataset(
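Note: a recurring change in this release is the migration from pydantic v1 validators to pydantic v2's @field_validator paired with @classmethod, visible in the hunks above and in the files below. A minimal sketch of the pattern, using a made-up config model rather than the package's own classes:

# Minimal pydantic v2 validator pattern, mirroring the @field_validator/@classmethod
# pairs added in this release. ExampleStoreConfig and ALLOWED_STRATEGIES are illustrative.
from pydantic import BaseModel, field_validator

ALLOWED_STRATEGIES = {"DATE", "NONE"}

class ExampleStoreConfig(BaseModel):
    snapshot_partitioning_strategy: str = "DATE"

    @field_validator("snapshot_partitioning_strategy")
    @classmethod
    def validate_partitioning_strategy(cls, v: str) -> str:
        if v not in ALLOWED_STRATEGIES:
            raise ValueError(f"Unsupported partitioning strategy: {v}")
        return v

ExampleStoreConfig(snapshot_partitioning_strategy="DATE")    # passes validation
# ExampleStoreConfig(snapshot_partitioning_strategy="HOUR")  # would raise a ValidationError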
acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py

@@ -1,16 +1,16 @@
-import json
 import logging
 from datetime import date, datetime, timezone
 from enum import Enum
-from typing import Any, Callable, Dict, Iterable, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel
+from pydantic import BaseModel, field_validator
 
 from acryl_datahub_cloud.elasticsearch.graph_service import BaseModelRow
+from acryl_datahub_cloud.graphql_utils import parse_extra_properties_for_model
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import
+from datahub.ingestion.graph.filters import RawSearchFilter
 from datahub.metadata.schema_classes import (
     DomainPropertiesClass,
     FormAssociationClass,

@@ -130,6 +130,22 @@ class DataHubFormReportingData(FormData):
     platformInstance: Optional[str] = None
     domains: List[str] = []
 
+    @field_validator(
+        "completedFormsIncompletePromptResponseTimes",
+        "completedFormsCompletedPromptResponseTimes",
+        "incompleteFormsIncompletePromptResponseTimes",
+        "incompleteFormsCompletedPromptResponseTimes",
+        mode="before",
+    )
+    @classmethod
+    def convert_timestamps_to_strings(
+        cls, v: Union[List[int], List[str]]
+    ) -> List[str]:
+        """Convert timestamp integers to strings for compatibility with GMS data."""
+        if not isinstance(v, list):
+            return v
+        return [str(item) for item in v]
+
     def __init__(self, graph: DataHubGraph, allowed_forms: Optional[List[str]] = None):
         self.graph: DataHubGraph = graph
         self.form_registry = FormRegistry(graph)

@@ -143,13 +159,13 @@ class DataHubFormReportingData(FormData):
         on_form_scanned: Callable[[str], Any],
     ) -> pd.DataFrame:
         return pd.DataFrame(
-            x.
+            x.model_dump()
             for x in self.get_data(
                 on_asset_scanned=on_asset_scanned, on_form_scanned=on_form_scanned
             )
         )
 
-    def get_form_existence_or_filters(self) ->
+    def get_form_existence_or_filters(self) -> RawSearchFilter:
         """
         Datasets must either have completedForms or incompleteForms assigned to
         them

@@ -157,25 +173,41 @@ class DataHubFormReportingData(FormData):
         if self.allowed_forms:
             return [
                 {
-                    "
-
-
+                    "and": [
+                        {
+                            "field": "completedForms",
+                            "condition": "EQUAL",
+                            "values": self.allowed_forms,
+                        }
+                    ]
                 },
                 {
-                    "
-
-
+                    "and": [
+                        {
+                            "field": "incompleteForms",
+                            "condition": "EQUAL",
+                            "values": self.allowed_forms,
+                        }
+                    ]
                 },
             ]
         else:
             return [
                 {
-                    "
-
+                    "and": [
+                        {
+                            "field": "completedForms",
+                            "condition": "EXISTS",
+                        }
+                    ]
                 },
                 {
-                    "
-
+                    "and": [
+                        {
+                            "field": "incompleteForms",
+                            "condition": "EXISTS",
+                        }
+                    ]
                 },
             ]
 

@@ -290,10 +322,10 @@ class DataHubFormReportingData(FormData):
         on_asset_scanned: Optional[Callable[[str], Any]] = None,
         on_form_scanned: Optional[Callable[[str], Any]] = None,
     ) -> Iterable[FormReportingRow]:
-        extra_fields = [f for f in self.DataHubDatasetSearchRow.
+        extra_fields = [f for f in self.DataHubDatasetSearchRow.model_fields]
         # TODO: Replace with the new search/filter SDK.
         result = self.graph.get_results_by_filter(
-            extra_or_filters=
+            extra_or_filters=self.get_form_existence_or_filters(),
             extra_source_fields=extra_fields,
             skip_cache=True,
         )

@@ -304,10 +336,9 @@ class DataHubFormReportingData(FormData):
             if row_index % 100 == 0:
                 logger.info(f"Scanned {row_index} assets")
             extra_properties = row["extraProperties"]
-
-
-
-            }
+            extra_properties_map = parse_extra_properties_for_model(
+                extra_properties, self.DataHubDatasetSearchRow
+            )
             search_row = self.DataHubDatasetSearchRow(**extra_properties_map)
             if on_asset_scanned:
                 on_asset_scanned(search_row.urn)

@@ -414,7 +445,7 @@ class DataHubFormReportingData(FormData):
                 question_status=QuestionStatus.COMPLETED,
                 question_completed_date=datetime.fromtimestamp(
                     float(prompt_response_time) / 1000, tz=timezone.utc
-                ),
+                ).date(),
                 snapshot_date=self.snapshot_date,
             )
             complete_forms = (

@@ -516,7 +547,7 @@ class DataHubFormReportingData(FormData):
                 question_status=QuestionStatus.COMPLETED,
                 question_completed_date=datetime.fromtimestamp(
                     float(prompt_response_time) / 1000, tz=timezone.utc
-                ),
+                ).date(),
                 snapshot_date=self.snapshot_date,
             )
 
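Note: the rewritten get_form_existence_or_filters above now returns a RawSearchFilter, i.e. a list of OR'd clauses, each holding an "and" list of field conditions. A simplified standalone builder for the same structure (the helper name and the example form urn are illustrative, not from the package):

# Simplified builder for the OR-of-AND raw search filter shape used above.
from typing import Any, Dict, List, Optional

def build_form_filters(allowed_forms: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    # Top-level entries are OR'd together; each "and" list holds field conditions.
    if allowed_forms:
        return [
            {"and": [{"field": "completedForms", "condition": "EQUAL", "values": allowed_forms}]},
            {"and": [{"field": "incompleteForms", "condition": "EQUAL", "values": allowed_forms}]},
        ]
    return [
        {"and": [{"field": "completedForms", "condition": "EXISTS"}]},
        {"and": [{"field": "incompleteForms", "condition": "EXISTS"}]},
    ]

print(build_form_filters(["urn:li:form:example"]))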
acryl_datahub_cloud/datahub_reporting/extract_graph.py

@@ -6,7 +6,7 @@ from typing import List, Optional
 
 import boto3
 from opensearchpy import OpenSearch
-from pydantic import
+from pydantic import field_validator
 
 from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
     DataHubBasedS3Dataset,

@@ -43,8 +43,9 @@ class DataHubReportingExtractGraphSourceConfig(ConfigModel):
     query_timeout: int = 30
     extract_batch_size: int = 2000
 
-    @
-
+    @field_validator("extract_graph_store", mode="before")
+    @classmethod
+    def set_default_extract_soft_delete_flag(cls, v):
         if v is not None:
             if "dataset_registration_spec" not in v:
                 v["dataset_registration_spec"] = DatasetRegistrationSpec(
acryl_datahub_cloud/datahub_reporting/extract_sql.py

@@ -4,10 +4,14 @@ import shutil
 import zipfile
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import TYPE_CHECKING, Iterable, List, Literal, Optional
 
 import boto3
-from
+from botocore.exceptions import ClientError
+from pydantic import field_validator
+
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
 
 from acryl_datahub_cloud.datahub_reporting.datahub_dataset import (
     DataHubBasedS3Dataset,

@@ -42,25 +46,36 @@ class DataHubReportingExtractSQLSourceConfig(ConfigModel):
     server: Optional[DatahubClientConfig] = None
     sql_backup_config: S3ClientConfig
     extract_sql_store: FileStoreBackedDatasetConfig
-
-
+    # Maximum size (in bytes) of files to stream from S3 per batch using chunked streaming.
+    # Files are streamed in 8MB chunks directly from S3 to ZIP without writing to disk, processing
+    # files in batches to limit peak memory usage. This prevents both disk pressure and excessive
+    # memory consumption during batch processing.
+    # Default: 5GB (5 * 1024 * 1024 * 1024 bytes)
+    batch_size_bytes: int = 5 * 1024 * 1024 * 1024
+
+    @field_validator("extract_sql_store", mode="before")
+    @classmethod
     def set_default_extract_soft_delete_flag(cls, v):
-        if v is
-
-
-
-
-
-
+        if v is None:
+            return v
+
+        # If v is already a FileStoreBackedDatasetConfig object, skip dict-based modifications
+        if isinstance(v, FileStoreBackedDatasetConfig):
+            return v
+
+        # v is a dictionary - apply default values
+        if "dataset_registration_spec" not in v:
+            v["dataset_registration_spec"] = DatasetRegistrationSpec(soft_deleted=False)
+        elif "soft_deleted" not in v["dataset_registration_spec"]:
+            v["dataset_registration_spec"]["soft_deleted"] = False
+
+        if "file" not in v:
+            default_config = FileStoreBackedDatasetConfig.dummy()
+            v["file"] = f"{default_config.file_name}.{default_config.file_extension}"
+        else:
+            v["file_name"] = v["file"].split(".")[0]
+            v["file_extension"] = v["file"].split(".")[-1]
 
-        if "file" not in v:
-            default_config = FileStoreBackedDatasetConfig.dummy()
-            v["file"] = (
-                f"{default_config.file_name}.{default_config.file_extension}"
-            )
-        else:
-            v["file_name"] = v["file"].split(".")[0]
-            v["file_extension"] = v["file"].split(".")[-1]
         return v
 
 
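Note: the new batch_size_bytes option above relies on streaming S3 objects into the ZIP archive in 8MB chunks instead of buffering whole files. The sketch below shows that chunked-write pattern against zipfile.ZipFile.open(..., "w"), using a local file as a stand-in for boto3's StreamingBody (which exposes the same read(size) interface); the names and paths are illustrative:

# Chunked streaming into a ZIP entry, as described by the batch_size_bytes comment above.
import zipfile

CHUNK_SIZE = 8 * 1024 * 1024  # 8MB chunks keep peak memory constant

def stream_into_zip(source_path: str, archive_path: str, entry_name: str) -> None:
    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        with open(source_path, "rb") as src, zipf.open(entry_name, "w") as entry:
            while True:
                chunk = src.read(CHUNK_SIZE)
                if not chunk:
                    break
                entry.write(chunk)

stream_into_zip("part-0000.parquet", "backup.zip", "download/part-0000.parquet")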
@@ -166,20 +181,17 @@ class DataHubReportingExtractSQLSource(Source):
 
             self._clean_up_old_state(state_directory=tmp_dir)
 
-            files_downloaded: bool = self.
+            files_downloaded: bool = self._download_and_zip_in_batches(
                 bucket=self.config.sql_backup_config.bucket,
                 prefix=bucket_prefix,
-
+                batch_dir=f"{tmp_dir}/download/",
+                output_zip=f"{tmp_dir}/{output_file}",
+                batch_size_bytes=self.config.batch_size_bytes,
             )
             if not files_downloaded:
                 logger.warning(f"Skipping as no files were found in {bucket_prefix}")
                 return
 
-            self._zip_folder(
-                folder_path=f"{tmp_dir}/download",
-                output_file=f"{tmp_dir}/{output_file}",
-            )
-
             # Compute profile & schema information, this is based on the parquet files that were downloaded and not the zip file.
             # We must hard-code the local file from which the dataset will be created, otherwise the upload to s3 will be in
             # unexpected path.

@@ -210,40 +222,219 @@ class DataHubReportingExtractSQLSource(Source):
         path = Path(f"{state_directory}/download/")
         path.mkdir(parents=True, exist_ok=True)
 
-
-
+    @staticmethod
+    def _stream_file_to_zip_from_local(
+        local_file_path: str,
+        zipf: zipfile.ZipFile,
+        file_name: str,
+        chunk_size: int,
+    ) -> None:
+        """Stream file from local disk to ZIP using chunked reads."""
+        with (
+            open(local_file_path, "rb") as local_file,
+            zipf.open(file_name, "w") as zip_entry,
+        ):
+            while True:
+                chunk = local_file.read(chunk_size)
+                if not chunk:
+                    break
+                zip_entry.write(chunk)
+
+    def _stream_file_to_zip_from_s3(
+        self,
+        bucket: str,
+        file_key: str,
+        zipf: zipfile.ZipFile,
+        file_name: str,
+        chunk_size: int,
+    ) -> None:
+        """Stream file from S3 to ZIP using chunked reads."""
+        s3_response = self.s3_client.get_object(Bucket=bucket, Key=file_key)
+        body_stream = s3_response["Body"]
+
+        with zipf.open(file_name, "w") as zip_entry:
+            while True:
+                chunk = body_stream.read(chunk_size)
+                if not chunk:
+                    break
+                zip_entry.write(chunk)
 
-
+    @staticmethod
+    def _group_objects_into_batches(
+        objects: List["ObjectSummary"], batch_size_bytes: int
+    ) -> List[List["ObjectSummary"]]:
+        """
+        Group S3 objects into batches based on cumulative size.
+
+        Files larger than batch_size_bytes get their own batch.
+        """
+        batches: List[List["ObjectSummary"]] = []
+        current_batch: List["ObjectSummary"] = []
+        current_batch_size = 0
 
-        # Iterate over objects in the time partition path
         for obj in objects:
-
-
+            obj_size = obj.size
+
+            # If file is larger than batch size, give it its own batch
+            if obj_size > batch_size_bytes:
+                if current_batch:
+                    batches.append(current_batch)
+                    current_batch = []
+                    current_batch_size = 0
+
+                batches.append([obj])  # Solo batch for large file
+                logger.warning(
+                    f"File {obj.key} ({obj_size / (1024**2):.2f} MB) exceeds batch size "
+                    f"({batch_size_bytes / (1024**2):.2f} MB), processing in separate batch"
+                )
+                continue
+
+            # If adding this file would exceed batch size, start a new batch
+            if (
+                current_batch_size > 0
+                and current_batch_size + obj_size > batch_size_bytes
+            ):
+                batches.append(current_batch)
+                current_batch = []
+                current_batch_size = 0
+
+            current_batch.append(obj)
+            current_batch_size += obj_size
+
+        # Add the last batch if it has files
+        if current_batch:
+            batches.append(current_batch)
+
+        return batches
+
+    def _download_and_zip_in_batches(
+        self,
+        bucket: str,
+        prefix: str,
+        batch_dir: str,
+        output_zip: str,
+        batch_size_bytes: int,
+    ) -> bool:
+        """
+        Stream files from S3 directly into ZIP using chunked streaming, processing in batches to limit memory usage.
 
-
-
-
-
+        Downloads the first file to batch_dir for schema/profile computation, then streams all files to ZIP
+        using 8MB chunks to ensure constant memory usage regardless of individual file sizes.
+
+        Args:
+            bucket: S3 bucket name
+            prefix: S3 prefix to filter objects
+            batch_dir: Local directory for temporary sample file download (for schema computation)
+            output_zip: Output ZIP file path
+            batch_size_bytes: Maximum total size of files to stream in each batch before flushing
 
-
+        Returns:
+            True if any files were processed, False otherwise
+        """
+        s3_resource = boto3.resource("s3")
+        objects = list(s3_resource.Bucket(bucket).objects.filter(Prefix=prefix))
 
-
-
+        if not objects:
+            return False
 
-
+        logger.info(
+            f"Found {len(objects)} files in s3://{bucket}/{prefix}, streaming in batches of up to {batch_size_bytes / (1024**2):.2f} MB"
+        )
 
-
+        # Download first file to batch_dir for schema/profile computation
+        # This is required by register_dataset() which needs a local parquet file to generate schema
+        os.makedirs(batch_dir, exist_ok=True)
+        first_obj = objects[0]
+        sample_file_path = os.path.join(batch_dir, os.path.basename(first_obj.key))
 
-
-
-
-
-
-
-
-
-
-
+        try:
+            logger.info(
+                f"Downloading first file s3://{bucket}/{first_obj.key} ({first_obj.size / (1024**2):.2f} MB) "
+                f"to {sample_file_path} for schema computation"
+            )
+            self.s3_client.download_file(bucket, first_obj.key, sample_file_path)
+        except ClientError as e:
+            logger.error(f"Failed to download first file for schema computation: {e}")
+            raise RuntimeError(
+                f"Cannot compute schema without at least one sample file: {e}"
+            ) from e
+
+        # Group objects into batches based on cumulative size
+        batches = self._group_objects_into_batches(objects, batch_size_bytes)
+        logger.info(f"Split {len(objects)} files into {len(batches)} batches")
+
+        # Track whether we've processed the first file to avoid downloading it twice
+        first_obj_processed = False
+
+        # Process each batch: stream from S3 directly to ZIP using chunked reads
+        zip_mode: Literal["x", "a"] = "x"  # Create new file for first batch
+        chunk_size = 8 * 1024 * 1024  # 8MB chunks for constant memory usage
+
+        for batch_idx, batch in enumerate(batches):
+            batch_size_mb = sum(obj.size for obj in batch) / (1024 * 1024)
+            logger.info(
+                f"Processing batch {batch_idx + 1}/{len(batches)} with {len(batch)} files ({batch_size_mb:.2f} MB)"
+            )
+
+            # Stream files from S3 directly into ZIP using chunked reads
+            with zipfile.ZipFile(output_zip, zip_mode, zipfile.ZIP_DEFLATED) as zipf:
+                for obj in batch:
+                    file_key = obj.key
+
+                    # Preserve S3 path structure in ZIP to avoid filename collisions
+                    # Strip only the common prefix, keep subdirectories
+                    relative_path = file_key[len(prefix) :].lstrip("/")
+                    file_name = (
+                        relative_path if relative_path else os.path.basename(file_key)
+                    )
+
+                    try:
+                        # If this is the first file and we already downloaded it, reuse local copy
+                        if not first_obj_processed and file_key == first_obj.key:
+                            logger.info(
+                                f"Adding {file_name} ({obj.size / (1024**2):.2f} MB) to ZIP from local file "
+                                f"(already downloaded for schema computation)"
+                            )
+                            self._stream_file_to_zip_from_local(
+                                sample_file_path, zipf, file_name, chunk_size
+                            )
+                            first_obj_processed = True
+                        else:
+                            # Stream from S3 using chunked reads for constant memory usage
+                            logger.info(
+                                f"Streaming {file_name} ({obj.size / (1024**2):.2f} MB) from S3 using chunked reads"
+                            )
+                            self._stream_file_to_zip_from_s3(
+                                bucket, file_key, zipf, file_name, chunk_size
+                            )
+
+                        logger.info(f"Added {file_name} to ZIP file")
+
+                    except ClientError as e:
+                        logger.error(f"Failed to stream s3://{bucket}/{file_key}: {e}")
+                        raise RuntimeError(
+                            f"Failed to stream file {file_key} from S3: {e}"
+                        ) from e
+                    except Exception as e:
+                        logger.error(
+                            f"Unexpected error processing s3://{bucket}/{file_key}: {e}"
+                        )
+                        raise RuntimeError(
+                            f"Failed to process file {file_key}: {e}"
+                        ) from e
+
+            # After first batch, switch to append mode for subsequent batches
+            zip_mode = "a"
+
+            logger.info(
+                f"Batch {batch_idx + 1}/{len(batches)} complete, streamed {len(batch)} files"
+            )
+
+        total_size_mb = sum(obj.size for obj in objects) / (1024 * 1024)
+        logger.info(
+            f"Successfully streamed all {len(objects)} files ({total_size_mb:.2f} MB) across {len(batches)} batches"
+        )
+        return True
 
     def get_report(self) -> SourceReport:
         return self.report
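Note: _download_and_zip_in_batches above groups files by cumulative size via _group_objects_into_batches, giving oversized files their own batch. A simplified version of that grouping, operating on plain byte counts (the function name and example sizes are illustrative):

# Simplified version of the cumulative-size batching used by _group_objects_into_batches.
from typing import List

def group_into_batches(sizes: List[int], batch_size_bytes: int) -> List[List[int]]:
    batches: List[List[int]] = []
    current: List[int] = []
    current_size = 0
    for size in sizes:
        if size > batch_size_bytes:
            # Oversized files get their own batch so other batches stay under the cap.
            if current:
                batches.append(current)
                current, current_size = [], 0
            batches.append([size])
            continue
        if current_size > 0 and current_size + size > batch_size_bytes:
            batches.append(current)
            current, current_size = [], 0
        current.append(size)
        current_size += size
    if current:
        batches.append(current)
    return batches

print(group_into_batches([3, 2, 6, 1, 4], batch_size_bytes=5))  # [[3, 2], [6], [1, 4]]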
acryl_datahub_cloud/datahub_reporting/forms.py

@@ -75,7 +75,7 @@ class DataHubReportingFormsSource(Source):
             enabled=False, dataset_urn=None, physical_uri_prefix=None
         )
         result_map = query_result.get(query_name, {})
-        return FormAnalyticsConfig.
+        return FormAnalyticsConfig.model_validate(
             dict(
                 (field, result_map.get(graphql_field))
                 for field, graphql_field in field_mappings.items()
acryl_datahub_cloud/datahub_reporting/forms_config.py

@@ -2,7 +2,7 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import List, Optional
 
-from pydantic import
+from pydantic import field_validator
 
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.source import SourceReport

@@ -32,7 +32,8 @@ class DataHubReportingFormSourceConfig(ConfigModel):
     generate_presigned_url: bool = True
     presigned_url_expiry_days: int = 7
 
-    @
+    @field_validator("reporting_snapshot_partitioning_strategy")
+    @classmethod
     def validate_partitioning_strategy(cls, v):
         if v not in PartitioningStrategy:
             raise ValueError(f"Unsupported partitioning strategy: {v}")