acryl-datahub 0.15.0rc12__py3-none-any.whl → 0.15.0rc14__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (26)
  1. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/METADATA +2499 -2499
  2. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/RECORD +26 -26
  3. datahub/__init__.py +1 -1
  4. datahub/ingestion/source/aws/aws_common.py +13 -1
  5. datahub/ingestion/source/aws/sagemaker.py +8 -0
  6. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  7. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  8. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  9. datahub/ingestion/source/gc/dataprocess_cleanup.py +20 -11
  10. datahub/ingestion/source/powerbi/m_query/data_classes.py +2 -13
  11. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +19 -27
  12. datahub/ingestion/source/powerbi/m_query/resolver.py +8 -10
  13. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  14. datahub/ingestion/source/preset.py +1 -0
  15. datahub/ingestion/source/snowflake/snowflake_config.py +4 -3
  16. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  17. datahub/ingestion/source/snowflake/snowflake_query.py +2 -2
  18. datahub/ingestion/source/sql/mssql/source.py +0 -2
  19. datahub/ingestion/source/sql/sql_common.py +34 -21
  20. datahub/ingestion/source/sql/sql_report.py +1 -0
  21. datahub/ingestion/source/superset.py +215 -65
  22. datahub/ingestion/source/unity/source.py +2 -0
  23. datahub/sql_parsing/sqlglot_lineage.py +7 -1
  24. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/WHEEL +0 -0
  25. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/entry_points.txt +0 -0
  26. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/preset.py

@@ -85,6 +85,7 @@ class PresetSource(SupersetSource):
         super().__init__(ctx, config)
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
+        self.platform = "preset"
 
     def login(self):
         try:

datahub/ingestion/source/snowflake/snowflake_config.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Set, cast
+from typing import Dict, List, Optional, Set
 
 import pydantic
 from pydantic import Field, SecretStr, root_validator, validator
@@ -118,9 +118,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         )
 
         # Always exclude reporting metadata for INFORMATION_SCHEMA schema
-        if schema_pattern is not None and schema_pattern:
+        if schema_pattern:
             logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
-            cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$")
+            assert isinstance(schema_pattern, AllowDenyPattern)
+            schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")
 
         return values
 

datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
     "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
     "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+    "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }
 
 _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         description="Connect args to pass to Snowflake SqlAlchemy driver",
         exclude=True,
     )
+    token: Optional[str] = pydantic.Field(
+        default=None,
+        description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+    )
 
     def get_account(self) -> str:
         assert self.account_id
@@ -148,6 +153,18 @@ class SnowflakeConnectionConfig(ConfigModel):
         logger.info(f"using authenticator type '{v}'")
         return v
 
+    @pydantic.validator("token", always=True)
+    def validate_token_oauth_config(cls, v, values):
+        auth_type = values.get("authentication_type")
+        if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            if not v:
+                raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+        elif v is not None:
+            raise ValueError(
+                "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+            )
+        return v
+
     @staticmethod
     def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
         if oauth_config is None:
@@ -333,6 +350,17 @@ class SnowflakeConnectionConfig(ConfigModel):
                 application=_APPLICATION_NAME,
                 **connect_args,
             )
+        elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            return snowflake.connector.connect(
+                user=self.username,
+                account=self.account_id,
+                authenticator="oauth",
+                token=self.token,  # Token generated externally and provided directly to the recipe
+                warehouse=self.warehouse,
+                role=self.role,
+                application=_APPLICATION_NAME,
+                **connect_args,
+            )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
             return self.get_oauth_connection()
         elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":

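Note: a minimal sketch of how the new OAUTH_AUTHENTICATOR_TOKEN path might be exercised, assuming the config can be constructed directly with the keyword fields shown (account_id, username, warehouse, role, authentication_type, and token all appear elsewhere in SnowflakeConnectionConfig); all values are placeholders.

from datahub.ingestion.source.snowflake.snowflake_connection import (
    SnowflakeConnectionConfig,
)

# The token must be generated externally (e.g. by an identity provider); per the
# field description above, the connector cannot refresh it once it expires.
config = SnowflakeConnectionConfig(
    account_id="my_account",                          # placeholder
    username="ingest_user",                           # placeholder
    authentication_type="OAUTH_AUTHENTICATOR_TOKEN",
    token="<externally-generated-oauth-token>",       # placeholder
    warehouse="COMPUTE_WH",                           # placeholder
    role="INGEST_ROLE",                               # placeholder
)

# Constructing the config without a token for this authentication type (or with a
# token under a different one) raises the ValueError from validate_token_oauth_config.
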
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -132,7 +132,7 @@ class SnowflakeQuery:
            auto_clustering_on AS "AUTO_CLUSTERING_ON"
        FROM {db_clause}information_schema.tables t
        WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
        order by table_schema, table_name"""
 
     @staticmethod
@@ -152,7 +152,7 @@ class SnowflakeQuery:
            auto_clustering_on AS "AUTO_CLUSTERING_ON"
        FROM {db_clause}information_schema.tables t
        where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
        order by table_schema, table_name"""
 
     @staticmethod

datahub/ingestion/source/sql/mssql/source.py

@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import pydantic
 import sqlalchemy.dialects.mssql
-
-# This import verifies that the dependencies are available.
 from pydantic.fields import Field
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection

datahub/ingestion/source/sql/sql_common.py

@@ -582,6 +582,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             generate_operations=False,
         )
         for dataset_name in self._view_definition_cache.keys():
+            # TODO: Ensure that the lineage generated from the view definition
+            # matches the dataset_name.
             view_definition = self._view_definition_cache[dataset_name]
             result = self._run_sql_parser(
                 dataset_name,
@@ -1059,6 +1061,20 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 exc=e,
             )
 
+    def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
+        try:
+            view_definition = inspector.get_view_definition(view, schema)
+            if view_definition is None:
+                view_definition = ""
+            else:
+                # Some dialects return a TextClause instead of a raw string,
+                # so we need to convert them to a string.
+                view_definition = str(view_definition)
+        except NotImplementedError:
+            view_definition = ""
+
+        return view_definition
+
     def _process_view(
         self,
         dataset_name: str,
@@ -1077,7 +1093,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns = inspector.get_columns(view, schema)
         except KeyError:
             # For certain types of views, we are unable to fetch the list of columns.
-            self.warn(logger, dataset_name, "unable to get schema for this view")
+            self.report.warning(
+                message="Unable to get schema for a view",
+                context=f"{dataset_name}",
+            )
             schema_metadata = None
         else:
             schema_fields = self.get_schema_fields(dataset_name, columns, inspector)
@@ -1091,19 +1110,12 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
+
         description, properties, _ = self.get_table_properties(inspector, schema, view)
-        try:
-            view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
-        except NotImplementedError:
-            view_definition = ""
-        properties["view_definition"] = view_definition
         properties["is_view"] = "True"
+
+        view_definition = self._get_view_definition(inspector, schema, view)
+        properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
             self._view_definition_cache[dataset_name] = view_definition
 
@@ -1135,15 +1147,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             entityUrn=dataset_urn,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
         ).as_workunit()
-        if "view_definition" in properties:
-            view_definition_string = properties["view_definition"]
-            view_properties_aspect = ViewPropertiesClass(
-                materialized=False, viewLanguage="SQL", viewLogic=view_definition_string
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=view_properties_aspect,
-            ).as_workunit()
+
+        view_properties_aspect = ViewPropertiesClass(
+            materialized=False, viewLanguage="SQL", viewLogic=view_definition
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=view_properties_aspect,
+        ).as_workunit()
 
         if self.config.domain and self.domain_registry:
             yield from get_domain_wu(
@@ -1197,6 +1208,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:

datahub/ingestion/source/sql/sql_report.py

@@ -48,6 +48,7 @@ class SQLSourceReport(
     query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
 
     num_view_definitions_parsed: int = 0
+    num_view_definitions_view_urn_mismatch: int = 0
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)

datahub/ingestion/source/superset.py

@@ -1,10 +1,12 @@
 import json
 import logging
+from datetime import datetime
 from functools import lru_cache
-from typing import Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 import requests
+from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
 
@@ -16,7 +18,9 @@ from datahub.configuration.source_common import (
 from datahub.emitter.mce_builder import (
     make_chart_urn,
     make_dashboard_urn,
+    make_data_platform_urn,
     make_dataset_urn,
+    make_dataset_urn_with_platform_instance,
     make_domain_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -31,6 +35,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
     get_platform_from_sqlalchemy_uri,
 )
@@ -47,16 +52,26 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
     Status,
+    TimeStamp,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     ChartSnapshot,
     DashboardSnapshot,
+    DatasetSnapshot,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    MySqlDDL,
+    NullType,
+    SchemaField,
+    SchemaFieldDataType,
+    SchemaMetadata,
+)
 from datahub.metadata.schema_classes import (
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetPropertiesClass,
 )
 from datahub.utilities import config_clean
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -82,9 +97,29 @@ chart_type_from_viz_type = {
     "box_plot": ChartTypeClass.BAR,
 }
 
+
 platform_without_databases = ["druid"]
 
 
+class SupersetDataset(BaseModel):
+    id: int
+    table_name: str
+    changed_on_utc: Optional[str] = None
+    explore_url: Optional[str] = ""
+
+    @property
+    def modified_dt(self) -> Optional[datetime]:
+        if self.changed_on_utc:
+            return dp.parse(self.changed_on_utc)
+        return None
+
+    @property
+    def modified_ts(self) -> Optional[int]:
+        if self.modified_dt:
+            return int(self.modified_dt.timestamp() * 1000)
+        return None
+
+
 class SupersetConfig(
     StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
 ):
@@ -103,15 +138,17 @@ class SupersetConfig(
     )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
-    api_key: Optional[str] = Field(default=None, description="Preset.io API key.")
-    api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.")
-    manager_uri: str = Field(
-        default="https://api.app.preset.io", description="Preset.io API URL"
-    )
     # Configuration for stateful ingestion
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
         default=None, description="Superset Stateful Ingestion Config."
     )
+    ingest_dashboards: bool = Field(
+        default=True, description="Enable to ingest dashboards."
+    )
+    ingest_charts: bool = Field(default=True, description="Enable to ingest charts.")
+    ingest_datasets: bool = Field(
+        default=False, description="Enable to ingest datasets."
+    )
 
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")
@@ -123,6 +160,10 @@ class SupersetConfig(
         description="Can be used to change mapping for database names in superset to what you have in datahub",
     )
 
+    class Config:
+        # This is required to allow preset configs to get parsed
+        extra = "allow"
+
     @validator("connect_uri", "display_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
@@ -229,6 +270,28 @@ class SupersetSource(StatefulIngestionSourceBase):
         config = SupersetConfig.parse_obj(config_dict)
         return cls(ctx, config)
 
+    def paginate_entity_api_results(self, entity_type, page_size=100):
+        current_page = 0
+        total_items = page_size
+
+        while current_page * page_size < total_items:
+            response = self.session.get(
+                f"{self.config.connect_uri}/api/v1/{entity_type}/",
+                params={"q": f"(page:{current_page},page_size:{page_size})"},
+            )
+
+            if response.status_code != 200:
+                logger.warning(f"Failed to get {entity_type} data: {response.text}")
+
+            payload = response.json()
+            # Update total_items with the actual count from the response
+            total_items = payload.get("count", total_items)
+            # Yield each item in the result, this gets passed into the construct functions
+            for item in payload.get("result", []):
+                yield item
+
+            current_page += 1
+
     @lru_cache(maxsize=None)
     def get_platform_from_database_id(self, database_id):
         database_response = self.session.get(

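Note: a standalone sketch of the pagination pattern that paginate_entity_api_results introduces, kept next to the hunk for reference. BASE_URL and the requests.Session setup are hypothetical; the (page:N,page_size:M) "q" parameter and the count-driven loop mirror the code above.

import requests

BASE_URL = "https://superset.example.com"  # hypothetical Superset instance


def paginate_entity_api_results(session: requests.Session, entity_type: str, page_size: int = 100):
    current_page = 0
    total_items = page_size  # replaced by the real "count" after the first response
    while current_page * page_size < total_items:
        response = session.get(
            f"{BASE_URL}/api/v1/{entity_type}/",
            params={"q": f"(page:{current_page},page_size:{page_size})"},
        )
        response.raise_for_status()
        payload = response.json()
        total_items = payload.get("count", total_items)
        yield from payload.get("result", [])  # each item feeds a construct_* function
        current_page += 1


# Example: iterate every dashboard definition.
# for dashboard in paginate_entity_api_results(requests.Session(), "dashboard"):
#     print(dashboard["id"])
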
datahub/ingestion/source/superset.py (continued)

@@ -250,11 +313,18 @@ class SupersetSource(StatefulIngestionSourceBase):
         return platform_name
 
     @lru_cache(maxsize=None)
-    def get_datasource_urn_from_id(self, datasource_id):
+    def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/dataset/{datasource_id}"
-        ).json()
-
+            f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+        )
+        if dataset_response.status_code != 200:
+            logger.warning(f"Failed to get dataset info: {dataset_response.text}")
+            dataset_response.raise_for_status()
+        return dataset_response.json()
+
+    def get_datasource_urn_from_id(
+        self, dataset_response: dict, platform_instance: str
+    ) -> str:
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
@@ -283,9 +353,11 @@ class SupersetSource(StatefulIngestionSourceBase):
                 ),
                 env=self.config.env,
             )
-        return None
+        raise ValueError("Could not construct dataset URN")
 
-    def construct_dashboard_from_api_data(self, dashboard_data):
+    def construct_dashboard_from_api_data(
+        self, dashboard_data: dict
+    ) -> DashboardSnapshot:
         dashboard_urn = make_dashboard_urn(
             platform=self.platform,
             name=dashboard_data["id"],
@@ -340,7 +412,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         }
 
         if dashboard_data.get("certified_by"):
-            custom_properties["CertifiedBy"] = dashboard_data.get("certified_by")
+            custom_properties["CertifiedBy"] = dashboard_data.get("certified_by", "")
             custom_properties["CertificationDetails"] = str(
                 dashboard_data.get("certification_details")
             )
@@ -358,38 +430,25 @@ class SupersetSource(StatefulIngestionSourceBase):
         return dashboard_snapshot
 
     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        current_dashboard_page = 0
-        # we will set total dashboards to the actual number after we get the response
-        total_dashboards = PAGE_SIZE
-
-        while current_dashboard_page * PAGE_SIZE <= total_dashboards:
-            dashboard_response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/dashboard/",
-                params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
-            )
-            if dashboard_response.status_code != 200:
-                logger.warning(
-                    f"Failed to get dashboard data: {dashboard_response.text}"
-                )
-            dashboard_response.raise_for_status()
-
-            payload = dashboard_response.json()
-            total_dashboards = payload.get("count") or 0
-
-            current_dashboard_page += 1
-
-            for dashboard_data in payload["result"]:
+        for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+            try:
                 dashboard_snapshot = self.construct_dashboard_from_api_data(
                     dashboard_data
                 )
-                mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
-                yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
-                yield from self._get_domain_wu(
-                    title=dashboard_data.get("dashboard_title", ""),
-                    entity_urn=dashboard_snapshot.urn,
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
                 )
+                continue
+            # Emit the dashboard
+            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=dashboard_data.get("dashboard_title", ""),
+                entity_urn=dashboard_snapshot.urn,
+            )
 
-    def construct_chart_from_chart_data(self, chart_data):
+    def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
         chart_urn = make_chart_urn(
             platform=self.platform,
             name=chart_data["id"],
@@ -415,9 +474,12 @@ class SupersetSource(StatefulIngestionSourceBase):
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        datasource_urn = self.get_datasource_urn_from_id(datasource_id)
+        dataset_response = self.get_dataset_info(datasource_id)
+        datasource_urn = self.get_datasource_urn_from_id(
+            dataset_response, self.platform
+        )
 
-        params = json.loads(chart_data.get("params"))
+        params = json.loads(chart_data.get("params", "{}"))
         metrics = [
             get_metric_name(metric)
             for metric in (params.get("metrics", []) or [params.get("metric")])
@@ -467,36 +529,124 @@ class SupersetSource(StatefulIngestionSourceBase):
         return chart_snapshot
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        current_chart_page = 0
-        # we will set total charts to the actual number after we get the response
-        total_charts = PAGE_SIZE
-
-        while current_chart_page * PAGE_SIZE <= total_charts:
-            chart_response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/chart/",
-                params=f"q=(page:{current_chart_page},page_size:{PAGE_SIZE})",
+        for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+            try:
+                chart_snapshot = self.construct_chart_from_chart_data(chart_data)
+
+                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
+                )
+                continue
+            # Emit the chart
+            yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=chart_data.get("slice_name", ""),
+                entity_urn=chart_snapshot.urn,
             )
-            if chart_response.status_code != 200:
-                logger.warning(f"Failed to get chart data: {chart_response.text}")
-                chart_response.raise_for_status()
 
-            current_chart_page += 1
+    def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for col in column_data:
+            col_type = (col.get("type") or "").lower()
+            data_type = resolve_sql_type(col_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=col.get("column_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType="",
+                description=col.get("column_name", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
+    def gen_schema_metadata(
+        self,
+        dataset_response: dict,
+    ) -> SchemaMetadata:
+        dataset_response = dataset_response.get("result", {})
+        column_data = dataset_response.get("columns", [])
+        schema_metadata = SchemaMetadata(
+            schemaName=dataset_response.get("table_name", ""),
+            platform=make_data_platform_urn(self.platform),
+            version=0,
+            hash="",
+            platformSchema=MySqlDDL(tableSchema=""),
+            fields=self.gen_schema_fields(column_data),
+        )
+        return schema_metadata
 
-            payload = chart_response.json()
-            total_charts = payload["count"]
-            for chart_data in payload["result"]:
-                chart_snapshot = self.construct_chart_from_chart_data(chart_data)
+    def gen_dataset_urn(self, datahub_dataset_name: str) -> str:
+        return make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=datahub_dataset_name,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+        )
 
-                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
-                yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
-                yield from self._get_domain_wu(
-                    title=chart_data.get("slice_name", ""),
-                    entity_urn=chart_snapshot.urn,
+    def construct_dataset_from_dataset_data(
+        self, dataset_data: dict
+    ) -> DatasetSnapshot:
+        dataset_response = self.get_dataset_info(dataset_data.get("id"))
+        dataset = SupersetDataset(**dataset_response["result"])
+        datasource_urn = self.get_datasource_urn_from_id(
+            dataset_response, self.platform
+        )
+
+        dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+
+        dataset_info = DatasetPropertiesClass(
+            name=dataset.table_name,
+            description="",
+            lastModified=TimeStamp(time=dataset.modified_ts)
+            if dataset.modified_ts
+            else None,
+            externalUrl=dataset_url,
+        )
+        aspects_items: List[Any] = []
+        aspects_items.extend(
+            [
+                self.gen_schema_metadata(dataset_response),
+                dataset_info,
+            ]
+        )
+
+        dataset_snapshot = DatasetSnapshot(
+            urn=datasource_urn,
+            aspects=aspects_items,
+        )
+        return dataset_snapshot
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+            try:
+                dataset_snapshot = self.construct_dataset_from_dataset_data(
+                    dataset_data
                 )
+                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+                )
+                continue
+            # Emit the dataset
+            yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=dataset_data.get("table_name", ""),
+                entity_urn=dataset_snapshot.urn,
+            )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        yield from self.emit_dashboard_mces()
-        yield from self.emit_chart_mces()
+        if self.config.ingest_dashboards:
+            yield from self.emit_dashboard_mces()
+        if self.config.ingest_charts:
+            yield from self.emit_chart_mces()
+        if self.config.ingest_datasets:
+            yield from self.emit_dataset_mces()
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [

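Note: a minimal sketch of opting into the new dataset ingestion; connection values are placeholders and other SupersetConfig fields keep their defaults. The three ingest_* flags gate which emit_* generators run in get_workunits_internal, and ingest_datasets defaults to False.

from datahub.ingestion.source.superset import SupersetConfig

config = SupersetConfig.parse_obj(
    {
        "connect_uri": "http://localhost:8088",  # placeholder Superset URL
        "username": "admin",                     # placeholder credentials
        "password": "admin",
        "ingest_dashboards": True,  # default True
        "ingest_charts": True,      # default True
        "ingest_datasets": True,    # default False; opt in explicitly
    }
)
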
datahub/ingestion/source/unity/source.py

@@ -974,6 +974,8 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:

datahub/sql_parsing/sqlglot_lineage.py

@@ -1243,13 +1243,19 @@ def infer_output_schema(result: SqlParsingResult) -> Optional[List[SchemaFieldClass]]:
 def view_definition_lineage_helper(
     result: SqlParsingResult, view_urn: str
 ) -> SqlParsingResult:
-    if result.query_type is QueryType.SELECT:
+    if result.query_type is QueryType.SELECT or (
+        result.out_tables and result.out_tables != [view_urn]
+    ):
         # Some platforms (e.g. postgres) store only <select statement> from view definition
         # `create view V as <select statement>` . For such view definitions, `result.out_tables` and
         # `result.column_lineage[].downstream` are empty in `sqlglot_lineage` response, whereas upstream
         # details and downstream column details are extracted correctly.
         # Here, we inject view V's urn in `result.out_tables` and `result.column_lineage[].downstream`
        # to get complete lineage result.
+
+        # Some platforms(e.g. mssql) may have slightly different view name in view definition than
+        # actual view name used elsewhere. Therefore we overwrite downstream table for such cases as well.
+
         result.out_tables = [view_urn]
         if result.column_lineage:
             for col_result in result.column_lineage: