acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/superset.py

@@ -1,5 +1,6 @@
  import json
  import logging
+ from dataclasses import dataclass, field
  from datetime import datetime
  from functools import lru_cache
  from typing import Any, Dict, Iterable, List, Optional
@@ -22,6 +23,7 @@ from datahub.emitter.mce_builder import (
      make_dataset_urn,
      make_dataset_urn_with_platform_instance,
      make_domain_urn,
+     make_user_urn,
  )
  from datahub.emitter.mcp_builder import add_domain_to_entity_wu
  from datahub.ingestion.api.common import PipelineContext
@@ -36,9 +38,6 @@ from datahub.ingestion.api.decorators import (
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.sql.sql_types import resolve_sql_type
- from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-     get_platform_from_sqlalchemy_uri,
- )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalHandler,
      StaleEntityRemovalSourceReport,
@@ -49,7 +48,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
      StatefulIngestionSourceBase,
  )
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
-     AuditStamp,
      ChangeAuditStamps,
      Status,
      TimeStamp,
@@ -68,12 +66,22 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
      SchemaMetadata,
  )
  from datahub.metadata.schema_classes import (
+     AuditStampClass,
      ChartInfoClass,
      ChartTypeClass,
      DashboardInfoClass,
+     DatasetLineageTypeClass,
      DatasetPropertiesClass,
+     GlobalTagsClass,
+     OwnerClass,
+     OwnershipClass,
+     OwnershipTypeClass,
+     TagAssociationClass,
+     UpstreamClass,
+     UpstreamLineageClass,
  )
  from datahub.utilities import config_clean
+ from datahub.utilities.lossy_collections import LossyList
  from datahub.utilities.registries.domain_registry import DomainRegistry

  logger = logging.getLogger(__name__)
@@ -101,6 +109,14 @@ chart_type_from_viz_type = {
  platform_without_databases = ["druid"]


+ @dataclass
+ class SupersetSourceReport(StaleEntityRemovalSourceReport):
+     filtered: LossyList[str] = field(default_factory=LossyList)
+
+     def report_dropped(self, name: str) -> None:
+         self.filtered.append(name)
+
+
  class SupersetDataset(BaseModel):
      id: int
      table_name: str
@@ -136,6 +152,18 @@ class SupersetConfig(
          default=dict(),
          description="regex patterns for tables to filter to assign domain_key. ",
      )
+     dataset_pattern: AllowDenyPattern = Field(
+         default=AllowDenyPattern.allow_all(),
+         description="Regex patterns for dataset to filter in ingestion.",
+     )
+     chart_pattern: AllowDenyPattern = Field(
+         AllowDenyPattern.allow_all(),
+         description="Patterns for selecting chart names that are to be included",
+     )
+     dashboard_pattern: AllowDenyPattern = Field(
+         AllowDenyPattern.allow_all(),
+         description="Patterns for selecting dashboard names that are to be included",
+     )
      username: Optional[str] = Field(default=None, description="Superset username.")
      password: Optional[str] = Field(default=None, description="Superset password.")
      # Configuration for stateful ingestion
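
Illustration (not part of the diff): the new dataset_pattern, chart_pattern, and dashboard_pattern fields are ordinary AllowDenyPattern filters, so they can be exercised directly; the regexes below are made-up examples.

    from datahub.configuration.common import AllowDenyPattern

    # Keep only dashboards whose title starts with "Sales"; anything else is
    # reported via SupersetSourceReport.report_dropped() and skipped.
    dashboard_pattern = AllowDenyPattern(allow=["Sales.*"])
    assert dashboard_pattern.allowed("Sales Overview")
    assert not dashboard_pattern.allowed("Internal Debug Board")

datahub/ingestion/source/superset.py (continued)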
@@ -216,7 +244,7 @@ class SupersetSource(StatefulIngestionSourceBase):
      """

      config: SupersetConfig
-     report: StaleEntityRemovalSourceReport
+     report: SupersetSourceReport
      platform = "superset"

      def __hash__(self):
@@ -225,13 +253,14 @@ class SupersetSource(StatefulIngestionSourceBase):
      def __init__(self, ctx: PipelineContext, config: SupersetConfig):
          super().__init__(config, ctx)
          self.config = config
-         self.report = StaleEntityRemovalSourceReport()
+         self.report = SupersetSourceReport()
          if self.config.domain:
              self.domain_registry = DomainRegistry(
                  cached_domains=[domain_id for domain_id in self.config.domain],
                  graph=self.ctx.graph,
              )
          self.session = self.login()
+         self.owner_info = self.parse_owner_info()

      def login(self) -> requests.Session:
          login_response = requests.post(
@@ -271,7 +300,7 @@ class SupersetSource(StatefulIngestionSourceBase):

          while current_page * page_size < total_items:
              response = self.session.get(
-                 f"{self.config.connect_uri}/api/v1/{entity_type}/",
+                 f"{self.config.connect_uri}/api/v1/{entity_type}",
                  params={"q": f"(page:{current_page},page_size:{page_size})"},
              )

@@ -287,25 +316,24 @@ class SupersetSource(StatefulIngestionSourceBase):

              current_page += 1

-     @lru_cache(maxsize=None)
-     def get_platform_from_database_id(self, database_id):
-         database_response = self.session.get(
-             f"{self.config.connect_uri}/api/v1/database/{database_id}"
-         ).json()
-         sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri")
-         if sqlalchemy_uri is None:
-             platform_name = database_response.get("result", {}).get(
-                 "backend", "external"
-             )
-         else:
-             platform_name = get_platform_from_sqlalchemy_uri(sqlalchemy_uri)
-         if platform_name == "awsathena":
-             return "athena"
-         if platform_name == "clickhousedb":
-             return "clickhouse"
-         if platform_name == "postgresql":
-             return "postgres"
-         return platform_name
+     def parse_owner_info(self) -> Dict[str, Any]:
+         entity_types = ["dataset", "dashboard", "chart"]
+         owners_info = {}
+
+         for entity in entity_types:
+             for owner in self.paginate_entity_api_results(f"{entity}/related/owners"):
+                 owner_id = owner.get("value")
+                 if owner_id:
+                     owners_info[owner_id] = owner.get("extra", {}).get("email", "")
+
+         return owners_info
+
+     def build_owner_urn(self, data: Dict[str, Any]) -> List[str]:
+         return [
+             make_user_urn(self.owner_info.get(owner.get("id"), ""))
+             for owner in data.get("owners", [])
+             if owner.get("id")
+         ]

      @lru_cache(maxsize=None)
      def get_dataset_info(self, dataset_id: int) -> dict:
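
Illustration (not part of the diff): build_owner_urn() resolves Superset owner ids to the emails collected by parse_owner_info() and wraps them as corpuser URNs via make_user_urn. With a hypothetical email:

    from datahub.emitter.mce_builder import make_user_urn

    assert make_user_urn("jdoe@example.com") == "urn:li:corpuser:jdoe@example.com"

datahub/ingestion/source/superset.py (continued)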
@@ -323,8 +351,6 @@ class SupersetSource(StatefulIngestionSourceBase):
          schema_name = dataset_response.get("result", {}).get("schema")
          table_name = dataset_response.get("result", {}).get("table_name")
          database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-         platform = self.get_platform_from_database_id(database_id)
-
          database_name = (
              dataset_response.get("result", {}).get("database", {}).get("database_name")
          )
@@ -333,21 +359,24 @@ class SupersetSource(StatefulIngestionSourceBase):
          # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
          # from superset. There is only one database per platform instance, and one schema named druid, so it would be
          # redundant to systemically store them both in the URN.
-         if platform in platform_without_databases:
+         if platform_instance in platform_without_databases:
              database_name = None

-         if platform == "druid" and schema_name == "druid":
+         if platform_instance == "druid" and schema_name == "druid":
              # Follow DataHub's druid source convention.
              schema_name = None

-         if database_id and table_name:
+         # If the information about the datasource is already contained in the dataset response,
+         # can just return the urn directly
+         if table_name and database_id:
              return make_dataset_urn(
-                 platform=platform,
+                 platform=platform_instance,
                  name=".".join(
                      name for name in [database_name, schema_name, table_name] if name
                  ),
                  env=self.config.env,
              )
+
          raise ValueError("Could not construct dataset URN")

      def construct_dashboard_from_api_data(
@@ -355,7 +384,7 @@ class SupersetSource(StatefulIngestionSourceBase):
      ) -> DashboardSnapshot:
          dashboard_urn = make_dashboard_urn(
              platform=self.platform,
-             name=dashboard_data["id"],
+             name=str(dashboard_data["id"]),
              platform_instance=self.config.platform_instance,
          )
          dashboard_snapshot = DashboardSnapshot(
@@ -363,15 +392,16 @@ class SupersetSource(StatefulIngestionSourceBase):
              aspects=[Status(removed=False)],
          )

-         modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('username', 'unknown')}"
+         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
          modified_ts = int(
              dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
          )
          title = dashboard_data.get("dashboard_title", "")
          # note: the API does not currently supply created_by usernames due to a bug
-         last_modified = ChangeAuditStamps(
-             created=None,
-             lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
+         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+         change_audit_stamps = ChangeAuditStamps(
+             created=None, lastModified=last_modified
          )
          dashboard_url = f"{self.config.display_uri}{dashboard_data.get('url', '')}"

@@ -386,7 +416,7 @@ class SupersetSource(StatefulIngestionSourceBase):
              chart_urns.append(
                  make_chart_urn(
                      platform=self.platform,
-                     name=value.get("meta", {}).get("chartId", "unknown"),
+                     name=str(value.get("meta", {}).get("chartId", "unknown")),
                      platform_instance=self.config.platform_instance,
                  )
              )
@@ -397,13 +427,11 @@ class SupersetSource(StatefulIngestionSourceBase):
              "IsPublished": str(dashboard_data.get("published", False)).lower(),
              "Owners": ", ".join(
                  map(
-                     lambda owner: owner.get("username", "unknown"),
+                     lambda owner: self.owner_info.get(owner.get("id", -1), "unknown"),
                      dashboard_data.get("owners", []),
                  )
              ),
-             "IsCertified": str(
-                 True if dashboard_data.get("certified_by") else False
-             ).lower(),
+             "IsCertified": str(bool(dashboard_data.get("certified_by"))).lower(),
          }

          if dashboard_data.get("certified_by"):
@@ -417,16 +445,39 @@ class SupersetSource(StatefulIngestionSourceBase):
              description="",
              title=title,
              charts=chart_urns,
-             lastModified=last_modified,
              dashboardUrl=dashboard_url,
              customProperties=custom_properties,
+             lastModified=change_audit_stamps,
          )
          dashboard_snapshot.aspects.append(dashboard_info)
+
+         dashboard_owners_list = self.build_owner_urn(dashboard_data)
+         owners_info = OwnershipClass(
+             owners=[
+                 OwnerClass(
+                     owner=urn,
+                     type=OwnershipTypeClass.TECHNICAL_OWNER,
+                 )
+                 for urn in (dashboard_owners_list or [])
+             ],
+             lastModified=last_modified,
+         )
+         dashboard_snapshot.aspects.append(owners_info)
+
          return dashboard_snapshot

      def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-         for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+         for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
              try:
+                 dashboard_id = str(dashboard_data.get("id"))
+                 dashboard_title = dashboard_data.get("dashboard_title", "")
+
+                 if not self.config.dashboard_pattern.allowed(dashboard_title):
+                     self.report.report_dropped(
+                         f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                     )
+                     continue
+
                  dashboard_snapshot = self.construct_dashboard_from_api_data(
                      dashboard_data
                  )
@@ -439,14 +490,14 @@ class SupersetSource(StatefulIngestionSourceBase):
              mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
              yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
              yield from self._get_domain_wu(
-                 title=dashboard_data.get("dashboard_title", ""),
+                 title=dashboard_title,
                  entity_urn=dashboard_snapshot.urn,
              )

      def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
          chart_urn = make_chart_urn(
              platform=self.platform,
-             name=chart_data["id"],
+             name=str(chart_data["id"]),
              platform_instance=self.config.platform_instance,
          )
          chart_snapshot = ChartSnapshot(
@@ -454,25 +505,33 @@ class SupersetSource(StatefulIngestionSourceBase):
              aspects=[Status(removed=False)],
          )

-         modified_actor = f"urn:li:corpuser:{(chart_data.get('changed_by') or {}).get('username', 'unknown')}"
+         modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
          modified_ts = int(
              dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
          )
          title = chart_data.get("slice_name", "")

          # note: the API does not currently supply created_by usernames due to a bug
-         last_modified = ChangeAuditStamps(
-             created=None,
-             lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
+         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+         change_audit_stamps = ChangeAuditStamps(
+             created=None, lastModified=last_modified
          )
+
          chart_type = chart_type_from_viz_type.get(chart_data.get("viz_type", ""))
          chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"

          datasource_id = chart_data.get("datasource_id")
-         dataset_response = self.get_dataset_info(datasource_id)
-         datasource_urn = self.get_datasource_urn_from_id(
-             dataset_response, self.platform
-         )
+         if not datasource_id:
+             logger.debug(
+                 f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+             )
+             datasource_urn = None
+         else:
+             dataset_response = self.get_dataset_info(datasource_id)
+             datasource_urn = self.get_datasource_urn_from_id(
+                 dataset_response, self.platform
+             )

          params = json.loads(chart_data.get("params", "{}"))
          metrics = [
@@ -515,23 +574,61 @@ class SupersetSource(StatefulIngestionSourceBase):
              type=chart_type,
              description="",
              title=title,
-             lastModified=last_modified,
              chartUrl=chart_url,
              inputs=[datasource_urn] if datasource_urn else None,
              customProperties=custom_properties,
+             lastModified=change_audit_stamps,
          )
          chart_snapshot.aspects.append(chart_info)
+
+         chart_owners_list = self.build_owner_urn(chart_data)
+         owners_info = OwnershipClass(
+             owners=[
+                 OwnerClass(
+                     owner=urn,
+                     type=OwnershipTypeClass.TECHNICAL_OWNER,
+                 )
+                 for urn in (chart_owners_list or [])
+             ],
+             lastModified=last_modified,
+         )
+         chart_snapshot.aspects.append(owners_info)
          return chart_snapshot

      def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-         for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+         for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
              try:
+                 chart_id = str(chart_data.get("id"))
+                 chart_name = chart_data.get("slice_name", "")
+
+                 if not self.config.chart_pattern.allowed(chart_name):
+                     self.report.report_dropped(
+                         f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                     )
+                     continue
+
+                 # Emit a warning if charts use data from a dataset that will be filtered out
+                 if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                     datasource_id = chart_data.get("datasource_id")
+                     if datasource_id:
+                         dataset_response = self.get_dataset_info(datasource_id)
+                         dataset_name = dataset_response.get("result", {}).get(
+                             "table_name", ""
+                         )
+
+                         if dataset_name and not self.config.dataset_pattern.allowed(
+                             dataset_name
+                         ):
+                             self.report.warning(
+                                 f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+                             )
+
                  chart_snapshot = self.construct_chart_from_chart_data(chart_data)

                  mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
              except Exception as e:
                  self.report.warning(
-                     f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
+                     f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
                  )
                  continue
              # Emit the chart
@@ -588,25 +685,65 @@ class SupersetSource(StatefulIngestionSourceBase):
      ) -> DatasetSnapshot:
          dataset_response = self.get_dataset_info(dataset_data.get("id"))
          dataset = SupersetDataset(**dataset_response["result"])
+
          datasource_urn = self.get_datasource_urn_from_id(
              dataset_response, self.platform
          )
+         dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+
+         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+         modified_ts = int(
+             dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+         )
+         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+         upstream_warehouse_platform = (
+             dataset_response.get("result", {}).get("database", {}).get("backend")
+         )

-         dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+         # Preset has a way of naming their platforms differently than
+         # how datahub names them, so map the platform name to the correct naming
+         warehouse_naming = {
+             "awsathena": "athena",
+             "clickhousedb": "clickhouse",
+             "postgresql": "postgres",
+         }
+
+         if upstream_warehouse_platform in warehouse_naming:
+             upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+         # TODO: Categorize physical vs virtual upstream dataset
+         # mark all upstream dataset as physical for now, in the future we would ideally like
+         # to differentiate physical vs virtual upstream datasets
+         tag_urn = f"urn:li:tag:{self.platform}:physical"
+         upstream_dataset = self.get_datasource_urn_from_id(
+             dataset_response, upstream_warehouse_platform
+         )
+         upstream_lineage = UpstreamLineageClass(
+             upstreams=[
+                 UpstreamClass(
+                     type=DatasetLineageTypeClass.TRANSFORMED,
+                     dataset=upstream_dataset,
+                     properties={"externalUrl": dataset_url},
+                 )
+             ]
+         )

          dataset_info = DatasetPropertiesClass(
              name=dataset.table_name,
              description="",
-             lastModified=TimeStamp(time=dataset.modified_ts)
-             if dataset.modified_ts
-             else None,
              externalUrl=dataset_url,
+             lastModified=TimeStamp(time=modified_ts),
          )
+         global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
          aspects_items: List[Any] = []
          aspects_items.extend(
              [
                  self.gen_schema_metadata(dataset_response),
                  dataset_info,
+                 upstream_lineage,
+                 global_tags,
              ]
          )

@@ -614,11 +751,34 @@ class SupersetSource(StatefulIngestionSourceBase):
              urn=datasource_urn,
              aspects=aspects_items,
          )
+
+         dataset_owners_list = self.build_owner_urn(dataset_data)
+         owners_info = OwnershipClass(
+             owners=[
+                 OwnerClass(
+                     owner=urn,
+                     type=OwnershipTypeClass.TECHNICAL_OWNER,
+                 )
+                 for urn in (dataset_owners_list or [])
+             ],
+             lastModified=last_modified,
+         )
+         aspects_items.append(owners_info)
+
          return dataset_snapshot

      def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-         for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+         for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
              try:
+                 dataset_name = dataset_data.get("table_name", "")
+
+                 # Check if dataset should be filtered by dataset name
+                 if not self.config.dataset_pattern.allowed(dataset_name):
+                     self.report.report_dropped(
+                         f"Dataset '{dataset_name}' filtered by dataset_pattern"
+                     )
+                     continue
+
                  dataset_snapshot = self.construct_dataset_from_dataset_data(
                      dataset_data
                  )
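
Illustration (not part of the diff): with the warehouse_naming remap above, a Superset dataset backed by an "awsathena" database gets its upstream URN built on the "athena" platform. Using a hypothetical table name:

    from datahub.emitter.mce_builder import make_dataset_urn

    urn = make_dataset_urn(platform="athena", name="my_db.my_schema.my_table", env="PROD")
    # urn:li:dataset:(urn:li:dataPlatform:athena,my_db.my_schema.my_table,PROD)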

datahub/ingestion/source/tableau/tableau.py

@@ -1562,8 +1562,9 @@ class TableauSiteSource:
          query: str,
          connection_type: str,
          page_size: int,
-         query_filter: dict = {},
+         query_filter: Optional[dict] = None,
      ) -> Iterable[dict]:
+         query_filter = query_filter or {}
          query_filter = optimize_query_filter(query_filter)

          # Calls the get_connection_object_page function to get the objects,
@@ -1910,11 +1911,7 @@ class TableauSiteSource:
                  if upstream_col.get(c.TABLE)
                  else None
              )
-             if (
-                 name
-                 and upstream_table_id
-                 and upstream_table_id in table_id_to_urn.keys()
-             ):
+             if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                  parent_dataset_urn = table_id_to_urn[upstream_table_id]
                  if (
                      self.is_snowflake_urn(parent_dataset_urn)
@@ -2190,6 +2187,10 @@ class TableauSiteSource:
                  dataset_snapshot.aspects.append(browse_paths)
              else:
                  logger.debug(f"Browse path not set for Custom SQL table {csql_id}")
+                 logger.warning(
+                     f"Skipping Custom SQL table {csql_id} due to filtered downstream"
+                 )
+                 continue

              dataset_properties = DatasetPropertiesClass(
                  name=csql.get(c.NAME),
@@ -2628,6 +2629,15 @@ class TableauSiteSource:
              datasource_info = datasource

          browse_path = self._get_project_browse_path_name(datasource)
+         if (
+             not is_embedded_ds
+             and self._get_published_datasource_project_luid(datasource) is None
+         ):
+             logger.warning(
+                 f"Skip ingesting published datasource {datasource.get(c.NAME)} because of filtered project"
+             )
+             return
+
          logger.debug(f"datasource {datasource.get(c.NAME)} browse-path {browse_path}")
          datasource_id = datasource[c.ID]
          datasource_urn = builder.make_dataset_urn_with_platform_instance(
@@ -2851,6 +2861,11 @@ class TableauSiteSource:
              query_filter=tables_filter,
              page_size=self.config.effective_database_table_page_size,
          ):
+             if tableau_database_table_id_to_urn_map.get(tableau_table[c.ID]) is None:
+                 logger.warning(
+                     f"Skipping table {tableau_table[c.ID]} due to filtered out published datasource"
+                 )
+                 continue
              database_table = self.database_tables[
                  tableau_database_table_id_to_urn_map[tableau_table[c.ID]]
              ]
@@ -2905,6 +2920,7 @@ class TableauSiteSource:
              dataset_snapshot.aspects.append(browse_paths)
          else:
              logger.debug(f"Browse path not set for table {database_table.urn}")
+             return

          schema_metadata = self.get_schema_metadata_for_table(
              tableau_columns, database_table.parsed_columns

datahub/ingestion/source/tableau/tableau_common.py

@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
  }


- def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+ def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+     params = params or []
      tags = [
          TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
          for tag in params
@@ -901,7 +902,7 @@ def get_unique_custom_sql(custom_sql_list: List[dict]) -> List[dict]:
              "name": custom_sql.get("name"),
              # We assume that this is unsupported custom sql if "actual tables that this query references"
              # are missing from api result.
-             "isUnsupportedCustomSql": True if not custom_sql.get("tables") else False,
+             "isUnsupportedCustomSql": not custom_sql.get("tables"),
              "query": custom_sql.get("query"),
              "connectionType": custom_sql.get("connectionType"),
              "columns": custom_sql.get("columns"),

datahub/ingestion/source/unity/ge_profiler.py

@@ -1,3 +1,4 @@
+ import concurrent.futures
  import logging
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from dataclasses import dataclass, field
@@ -91,7 +92,7 @@ class UnityCatalogGEProfiler(GenericProfiler):
                  profile_requests.append(profile_request)
                  if i > 0 and i % 100 == 0:
                      logger.info(f"Finished table-level profiling for {i} tables")
-             except TimeoutError:
+             except (TimeoutError, concurrent.futures.TimeoutError):
                  logger.warning("Timed out waiting to complete table-level profiling.")

          if len(profile_requests) == 0:
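
Illustration (not part of the diff): on Python versions before 3.11, concurrent.futures.TimeoutError is a distinct class from the builtin TimeoutError (in 3.11+ it is an alias), so catching both is what makes the timeout handling above reliable across versions. A minimal sketch:

    import concurrent.futures
    import time

    with concurrent.futures.ThreadPoolExecutor() as pool:
        future = pool.submit(time.sleep, 1)
        try:
            future.result(timeout=0.01)
        except (TimeoutError, concurrent.futures.TimeoutError):
            print("timed out waiting for the future")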

datahub/ingestion/source/unity/source.py

@@ -464,7 +464,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):

          with self.report.new_stage(f"Ingest schema {schema.id}"):
              yield from self.gen_schema_containers(schema)
-             yield from self.process_tables(schema)
+             try:
+                 yield from self.process_tables(schema)
+             except Exception as e:
+                 logger.exception(f"Error parsing schema {schema}")
+                 self.report.report_warning(
+                     message="Missed schema because of parsing issues",
+                     context=str(schema),
+                     title="Error parsing schema",
+                     exc=e,
+                 )
+                 continue

          self.report.schemas.processed(schema.id)