acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (107)
  1. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
  2. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
  3. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
  59. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  62. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  63. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  64. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  65. datahub/ingestion/source/sql/mssql/source.py +8 -4
  66. datahub/ingestion/source/sql/oracle.py +51 -4
  67. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  68. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  69. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  70. datahub/ingestion/source/superset.py +291 -35
  71. datahub/ingestion/source/usage/usage_common.py +0 -65
  72. datahub/ingestion/source/vertexai/__init__.py +0 -0
  73. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  74. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  75. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  76. datahub/metadata/_schema_classes.py +472 -1
  77. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  80. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  81. datahub/metadata/schema.avsc +313 -2
  82. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  83. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  84. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  85. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  86. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  87. datahub/metadata/schemas/Deprecation.avsc +2 -0
  88. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  89. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  90. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  91. datahub/metadata/schemas/Siblings.avsc +2 -0
  92. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  93. datahub/sdk/__init__.py +1 -0
  94. datahub/sdk/dataset.py +122 -0
  95. datahub/sdk/entity.py +99 -3
  96. datahub/sdk/entity_client.py +27 -3
  97. datahub/sdk/main_client.py +24 -1
  98. datahub/sdk/search_client.py +81 -8
  99. datahub/sdk/search_filters.py +94 -37
  100. datahub/sql_parsing/split_statements.py +17 -3
  101. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  102. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  103. datahub/testing/mcp_diff.py +1 -18
  104. datahub/utilities/threaded_iterator_executor.py +16 -3
  105. datahub/ingestion/source/vertexai.py +0 -697
  106. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  107. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
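
Of the 107 files, only the diffs for the Apache Iceberg source and its profiler (items 39 and 40 above) are reproduced below. The common thread in both is a move away from emitting one MetadataChangeEvent snapshot per table and toward yielding individual aspects, each wrapped in a MetadataChangeProposalWrapper work unit. A minimal sketch of that emission pattern follows, using helpers that appear in the diff; the table name and subtype literal are made-up examples, not values from the package.

# Minimal sketch of the aspect-per-work-unit pattern used by the new Iceberg source.
# The URN inputs below are hypothetical; only the helper functions are real.
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass, SubTypesClass

dataset_urn = make_dataset_urn_with_platform_instance(
    platform="iceberg",
    name="demo_namespace.demo_table",  # hypothetical table
    platform_instance=None,
    env="PROD",
)

# Each aspect becomes its own work unit instead of one aspect list inside a DatasetSnapshot MCE.
for aspect in [StatusClass(removed=False), SubTypesClass(typeNames=["Table"])]:
    workunit = MetadataChangeProposalWrapper(
        entityUrn=dataset_urn, aspect=aspect
    ).as_workunit()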
datahub/ingestion/source/iceberg/iceberg.py

@@ -38,6 +38,7 @@ from pyiceberg.types import (
 )
 
 from datahub.emitter.mce_builder import (
+    make_container_urn,
     make_data_platform_urn,
     make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
@@ -45,6 +46,7 @@ from datahub.emitter.mce_builder import (
     make_user_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import NamespaceKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +59,10 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    DatasetSubTypes,
+)
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergSourceConfig,
     IcebergSourceReport,
@@ -68,21 +74,22 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import Status
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     OtherSchema,
     SchemaField,
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
     TimeStampClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -121,9 +128,10 @@ class IcebergSource(StatefulIngestionSourceBase):
     [pyiceberg library](https://py.iceberg.apache.org/).
     """
 
+    platform: str = "iceberg"
+
     def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None:
         super().__init__(config, ctx)
-        self.platform: str = "iceberg"
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
 
@@ -140,13 +148,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]:
+    def _get_namespaces(self, catalog: Catalog) -> Iterable[Identifier]:
         namespaces = catalog.list_namespaces()
         LOGGER.debug(
             f"Retrieved {len(namespaces)} namespaces, first 10: {namespaces[:10]}"
         )
         self.report.report_no_listed_namespaces(len(namespaces))
-        tables_count = 0
         for namespace in namespaces:
             namespace_repr = ".".join(namespace)
             if not self.config.namespace_pattern.allowed(namespace_repr):
@@ -155,6 +162,14 @@
                 )
                 self.report.report_dropped(f"{namespace_repr}.*")
                 continue
+            yield namespace
+
+    def _get_datasets(
+        self, catalog: Catalog, namespaces: Iterable[Tuple[Identifier, str]]
+    ) -> Iterable[Tuple[Identifier, str]]:
+        LOGGER.debug("Starting to retrieve tables")
+        tables_count = 0
+        for namespace, namespace_urn in namespaces:
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
@@ -164,29 +179,27 @@
                 self.report.report_listed_tables_for_namespace(
                     ".".join(namespace), len(tables)
                 )
-                yield from tables
-            except NoSuchNamespaceError:
-                self.report.report_warning(
-                    "no-such-namespace",
-                    f"Couldn't list tables for namespace {namespace} due to NoSuchNamespaceError exception",
-                )
-                LOGGER.warning(
-                    f"NoSuchNamespaceError exception while trying to get list of tables from namespace {namespace}, skipping it",
+                yield from [(table, namespace_urn) for table in tables]
+            except NoSuchNamespaceError as e:
+                self.report.warning(
+                    title="No such namespace",
+                    message="Skipping the missing namespace.",
+                    context=str(namespace),
+                    exc=e,
                 )
             except Exception as e:
                 self.report.report_failure(
-                    "listing-tables-exception",
-                    f"Couldn't list tables for namespace {namespace} due to {e}",
-                )
-                LOGGER.exception(
-                    f"Unexpected exception while trying to get list of tables for namespace {namespace}, skipping it"
+                    title="Error when processing a namespace",
+                    message="Skipping the namespace due to errors while processing it.",
+                    context=str(namespace),
+                    exc=e,
                 )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
         def _try_processing_dataset(
-            dataset_path: Tuple[str, ...], dataset_name: str
+            dataset_path: Tuple[str, ...], dataset_name: str, namespace_urn: str
         ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
@@ -202,56 +215,66 @@
                     time_taken, dataset_name, table.metadata_location
                 )
                 LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
-                yield from self._create_iceberg_workunit(dataset_name, table)
-            except NoSuchPropertyException as e:
-                self.report.report_warning(
-                    "table-property-missing",
-                    f"Failed to create workunit for {dataset_name}. {e}",
+                dataset_urn: str = make_dataset_urn_with_platform_instance(
+                    self.platform,
+                    dataset_name,
+                    self.config.platform_instance,
+                    self.config.env,
                 )
-                LOGGER.warning(
-                    f"NoSuchPropertyException while processing table {dataset_path}, skipping it.",
+                for aspect in self._create_iceberg_table_aspects(
+                    dataset_name, table, namespace_urn
+                ):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+            except NoSuchPropertyException as e:
+                self.report.warning(
+                    title="Unable to process table",
+                    message="Table was not processed due to expected property missing (table is probably not an iceberg table).",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchIcebergTableError as e:
-                self.report.report_warning(
-                    "not-an-iceberg-table",
-                    f"Failed to create workunit for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Skipped non-iceberg table",
+                    message="Table was recognized as non-iceberg and skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchTableError as e:
-                self.report.report_warning(
-                    "no-such-table",
-                    f"Failed to create workunit for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Table not found",
+                    message="Table was returned by the catalog in the list of table but catalog can't find its details, table was skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except FileNotFoundError as e:
-                self.report.report_warning(
-                    "file-not-found",
-                    f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"FileNotFoundError while processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Manifest file not found",
+                    message="Couldn't find manifest file to read for the table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ServerError as e:
-                self.report.report_warning(
-                    "iceberg-rest-server-error",
-                    f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
-                )
-                LOGGER.warning(
-                    f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg returned 500 HTTP status when trying to process a table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ValueError as e:
                 if "Could not initialize FileIO" not in str(e):
                     raise
                 self.report.warning(
-                    "Could not initialize FileIO",
-                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                    title="Could not initialize FileIO",
+                    message="Could not initialize FileIO for a table (are you using custom FileIO?). Skipping the table.",
+                    context=dataset_name,
+                    exc=e,
                 )
 
-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+        def _process_dataset(
+            dataset_path: Identifier, namespace_urn: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 LOGGER.debug(f"Processing dataset for path {dataset_path}")
                 dataset_name = ".".join(dataset_path)
@@ -263,106 +286,96 @@
                     )
                     return
 
-                yield from _try_processing_dataset(dataset_path, dataset_name)
+                yield from _try_processing_dataset(
+                    dataset_path, dataset_name, namespace_urn
+                )
             except Exception as e:
                 self.report.report_failure(
-                    "general",
-                    f"Failed to create workunit for dataset {dataset_path}: {e}",
-                )
-                LOGGER.exception(
-                    f"Exception while processing table {dataset_path}, skipping it.",
+                    title="Error when processing a table",
+                    message="Skipping the table due to errors when processing it.",
+                    context=str(dataset_path),
+                    exc=e,
                 )
 
         try:
             catalog = self.config.get_catalog()
         except Exception as e:
-            self.report.report_failure("get-catalog", f"Failed to get catalog: {e}")
+            self.report.report_failure(
+                title="Failed to initialize catalog object",
+                message="Couldn't start the ingestion due to failure to initialize catalog object.",
+                exc=e,
+            )
+            return
+
+        try:
+            namespace_ids = self._get_namespaces(catalog)
+            namespaces: List[Tuple[Identifier, str]] = []
+            for namespace in namespace_ids:
+                namespace_repr = ".".join(namespace)
+                LOGGER.debug(f"Processing namespace {namespace_repr}")
+                namespace_urn = make_container_urn(
+                    NamespaceKey(
+                        namespace=namespace_repr,
+                        platform=self.platform,
+                        instance=self.config.platform_instance,
+                        env=self.config.env,
+                    )
+                )
+                namespaces.append((namespace, namespace_urn))
+                for aspect in self._create_iceberg_namespace_aspects(namespace):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=namespace_urn, aspect=aspect
+                    ).as_workunit()
+            LOGGER.debug("Namespaces ingestion completed")
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to list namespaces",
+                message="Couldn't start the ingestion due to a failure to process the list of the namespaces",
+                exc=e,
+            )
             return
 
         for wu in ThreadedIteratorExecutor.process(
             worker_func=_process_dataset,
-            args_list=[(dataset_path,) for dataset_path in self._get_datasets(catalog)],
+            args_list=[
+                (dataset_path, namespace_urn)
+                for dataset_path, namespace_urn in self._get_datasets(
+                    catalog, namespaces
+                )
+            ],
             max_workers=self.config.processing_threads,
         ):
             yield wu
 
-    def _create_iceberg_workunit(
-        self, dataset_name: str, table: Table
-    ) -> Iterable[MetadataWorkUnit]:
+    def _create_iceberg_table_aspects(
+        self, dataset_name: str, table: Table, namespace_urn: str
+    ) -> Iterable[_Aspect]:
         with PerfTimer() as timer:
             self.report.report_table_scanned(dataset_name)
             LOGGER.debug(f"Processing table {dataset_name}")
-            dataset_urn: str = make_dataset_urn_with_platform_instance(
-                self.platform,
-                dataset_name,
-                self.config.platform_instance,
-                self.config.env,
-            )
-            dataset_snapshot = DatasetSnapshot(
-                urn=dataset_urn,
-                aspects=[Status(removed=False)],
-            )
+            yield Status(removed=False)
+            yield SubTypes(typeNames=[DatasetSubTypes.TABLE])
 
-            # Dataset properties aspect.
-            additional_properties = {}
-            custom_properties = table.metadata.properties.copy()
-            custom_properties["location"] = table.metadata.location
-            custom_properties["format-version"] = str(table.metadata.format_version)
-            custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
-            if table.current_snapshot():
-                custom_properties["snapshot-id"] = str(
-                    table.current_snapshot().snapshot_id
-                )
-                custom_properties["manifest-list"] = (
-                    table.current_snapshot().manifest_list
-                )
-                additional_properties["lastModified"] = TimeStampClass(
-                    int(table.current_snapshot().timestamp_ms)
-                )
-            if "created-at" in custom_properties:
-                try:
-                    dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                    additional_properties["created"] = TimeStampClass(
-                        int(dt.timestamp() * 1000)
-                    )
-                except Exception as ex:
-                    LOGGER.warning(
-                        f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
-                    )
+            yield self._get_dataset_properties_aspect(dataset_name, table)
 
-            dataset_properties = DatasetPropertiesClass(
-                name=table.name()[-1],
-                description=table.metadata.properties.get("comment", None),
-                customProperties=custom_properties,
-                lastModified=additional_properties.get("lastModified"),
-                created=additional_properties.get("created"),
-                qualifiedName=dataset_name,
-            )
-            dataset_snapshot.aspects.append(dataset_properties)
-            # Dataset ownership aspect.
             dataset_ownership = self._get_ownership_aspect(table)
             if dataset_ownership:
                 LOGGER.debug(
                     f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
                 )
-                dataset_snapshot.aspects.append(dataset_ownership)
+                yield dataset_ownership
 
-            schema_metadata = self._create_schema_metadata(dataset_name, table)
-            dataset_snapshot.aspects.append(schema_metadata)
+            yield self._create_schema_metadata(dataset_name, table)
+            yield self._get_dataplatform_instance_aspect()
+            yield ContainerClass(container=str(namespace_urn))
 
-            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
             self.report.report_table_processing_time(
                 timer.elapsed_seconds(), dataset_name, table.metadata_location
             )
-            yield MetadataWorkUnit(id=dataset_name, mce=mce)
-
-        dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
-        if dpi_aspect:
-            yield dpi_aspect
 
         if self.config.is_profiling_enabled():
             profiler = IcebergProfiler(self.report, self.config.profiling)
-            yield from profiler.profile_table(dataset_name, dataset_urn, table)
+            yield from profiler.profile_table(dataset_name, table)
 
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
@@ -401,12 +414,48 @@
                 ]
             )
         except Exception as e:
-            self.report.report_warning(
-                "extract-partition",
-                f"Failed to extract partition spec from Iceberg table {table.name()} due to error: {str(e)}",
+            self.report.warning(
+                title="Failed to extract partition information",
+                message="Failed to extract partition information for a table. Table metadata will be ingested without it.",
+                context=str(table.name),
+                exc=e,
             )
             return None
 
+    def _get_dataset_properties_aspect(
+        self, dataset_name: str, table: Table
+    ) -> DatasetPropertiesClass:
+        additional_properties = {}
+        custom_properties = table.metadata.properties.copy()
+        custom_properties["location"] = table.metadata.location
+        custom_properties["format-version"] = str(table.metadata.format_version)
+        custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        if table.current_snapshot():
+            custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
+            custom_properties["manifest-list"] = table.current_snapshot().manifest_list
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
+        return DatasetPropertiesClass(
+            name=table.name()[-1],
+            description=table.metadata.properties.get("comment", None),
+            customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
+        )
+
     def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]:
         owners = []
         if self.config.user_ownership_property:
@@ -435,22 +484,15 @@
             )
         return OwnershipClass(owners=owners) if owners else None
 
-    def _get_dataplatform_instance_aspect(
-        self, dataset_urn: str
-    ) -> Optional[MetadataWorkUnit]:
-        # If we are a platform instance based source, emit the instance aspect
-        if self.config.platform_instance:
-            return MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DataPlatformInstanceClass(
-                    platform=make_data_platform_urn(self.platform),
-                    instance=make_dataplatform_instance_urn(
-                        self.platform, self.config.platform_instance
-                    ),
-                ),
-            ).as_workunit()
-
-        return None
+    def _get_dataplatform_instance_aspect(self) -> DataPlatformInstanceClass:
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=make_dataplatform_instance_urn(
+                self.platform, self.config.platform_instance
+            )
+            if self.config.platform_instance
+            else None,
+        )
 
     def _create_schema_metadata(
         self, dataset_name: str, table: Table
@@ -479,6 +521,17 @@
     def get_report(self) -> SourceReport:
         return self.report
 
+    def _create_iceberg_namespace_aspects(
+        self, namespace: Identifier
+    ) -> Iterable[_Aspect]:
+        namespace_repr = ".".join(namespace)
+        yield Status(removed=False)
+        yield ContainerProperties(
+            name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+        )
+        yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
+        yield self._get_dataplatform_instance_aspect()
+
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
     """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema."""
datahub/ingestion/source/iceberg/iceberg_profiler.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Callable, Dict, Iterable, Union, cast
+from typing import Any, Callable, Dict, Iterable, Optional, cast
 
 from pyiceberg.conversions import from_bytes
 from pyiceberg.schema import Schema
@@ -24,8 +24,6 @@ from pyiceberg.utils.datetime import (
 )
 
 from datahub.emitter.mce_builder import get_sys_time
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergProfilingConfig,
     IcebergSourceReport,
@@ -33,6 +31,7 @@ from datahub.ingestion.source.iceberg.iceberg_common import (
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -86,9 +85,8 @@ class IcebergProfiler:
     def profile_table(
         self,
         dataset_name: str,
-        dataset_urn: str,
         table: Table,
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[_Aspect]:
         """This method will profile the supplied Iceberg table by looking at the table's manifest.
 
         The overall profile of the table is aggregated from the individual manifest files.
@@ -167,11 +165,11 @@
                         )
                     total_count += data_file.record_count
             except Exception as e:
-                # Catch any errors that arise from attempting to read the Iceberg table's manifests
-                # This will prevent stateful ingestion from being blocked by an error (profiling is not critical)
-                self.report.report_warning(
-                    "profiling",
-                    f"Error while profiling dataset {dataset_name}: {e}",
+                self.report.warning(
+                    title="Error when profiling a table",
+                    message="Skipping profiling of the table due to errors",
+                    context=dataset_name,
+                    exc=e,
                 )
         if row_count:
             # Iterating through fieldPaths introduces unwanted stats for list element fields...
@@ -211,14 +209,11 @@
            f"Finished profiling of dataset: {dataset_name} in {time_taken}"
        )
 
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=dataset_profile,
-        ).as_workunit()
+        yield dataset_profile
 
     def _render_value(
         self, dataset_name: str, value_type: IcebergType, value: Any
-    ) -> Union[str, None]:
+    ) -> Optional[str]:
         try:
             if isinstance(value_type, TimestampType):
                 return to_human_timestamp(value)
@@ -230,9 +225,17 @@
                 return to_human_time(value)
             return str(value)
         except Exception as e:
-            self.report.report_warning(
-                "profiling",
-                f"Error in dataset {dataset_name} when profiling a {value_type} field with value {value}: {e}",
+            self.report.warning(
+                title="Couldn't render value when profiling a table",
+                message="Encountered error, when trying to redner a value for table profile.",
+                context=str(
+                    {
+                        "value": value,
+                        "value_type": value_type,
+                        "dataset_name": dataset_name,
+                    }
+                ),
+                exc=e,
             )
             return None
 
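
Across both files, the old report_warning("key", "message") plus LOGGER calls are replaced by the structured SourceReport API, which carries a title, a human-readable message, the offending context, and the exception in one call. A sketch of that call shape follows, mirroring the keyword arguments used in the diff; the standalone report instance and the triggering failure are illustrative.

# Sketch of the structured reporting call shape used throughout this diff.
from datahub.ingestion.api.source import SourceReport

report = SourceReport()
try:
    raise FileNotFoundError("manifest file missing")  # stand-in for a real ingestion failure
except FileNotFoundError as e:
    # One structured entry replaces the previous report_warning(key, message) + LOGGER.warning pair.
    report.warning(
        title="Manifest file not found",
        message="Couldn't find manifest file to read for the table, skipping it.",
        context="demo_namespace.demo_table",  # hypothetical table name
        exc=e,
    )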