PyPI - acryl-datahub - Versions diffs - 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl - Mend

acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic. Click here for more details.

Files changed (78) hide show

{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
datahub/_version.py +1 -1
datahub/cli/check_cli.py +0 -7
datahub/cli/cli_utils.py +73 -0
datahub/cli/delete_cli.py +0 -6
datahub/cli/docker_check.py +107 -12
datahub/cli/docker_cli.py +148 -228
datahub/cli/exists_cli.py +0 -4
datahub/cli/get_cli.py +0 -4
datahub/cli/ingest_cli.py +1 -20
datahub/cli/put_cli.py +0 -6
datahub/cli/quickstart_versioning.py +50 -5
datahub/cli/specific/assertions_cli.py +0 -6
datahub/cli/specific/datacontract_cli.py +0 -6
datahub/cli/specific/dataproduct_cli.py +0 -22
datahub/cli/specific/dataset_cli.py +0 -11
datahub/cli/specific/forms_cli.py +0 -6
datahub/cli/specific/group_cli.py +0 -4
datahub/cli/specific/structuredproperties_cli.py +0 -7
datahub/cli/specific/user_cli.py +0 -4
datahub/cli/state_cli.py +0 -4
datahub/cli/timeline_cli.py +0 -4
datahub/entrypoints.py +4 -3
datahub/ingestion/api/report.py +183 -35
datahub/ingestion/autogenerated/capability_summary.json +3431 -0
datahub/ingestion/autogenerated/lineage.json +401 -0
datahub/ingestion/autogenerated/lineage_helper.py +30 -128
datahub/ingestion/extractor/schema_util.py +13 -4
datahub/ingestion/graph/client.py +2 -2
datahub/ingestion/run/pipeline.py +47 -1
datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
datahub/ingestion/source/common/subtypes.py +1 -1
datahub/ingestion/source/data_lake_common/object_store.py +40 -0
datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
datahub/ingestion/source/dremio/dremio_source.py +7 -7
datahub/ingestion/source/gcs/gcs_source.py +13 -2
datahub/ingestion/source/ge_data_profiler.py +28 -20
datahub/ingestion/source/identity/okta.py +0 -13
datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
datahub/ingestion/source/powerbi/powerbi.py +0 -5
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
datahub/ingestion/source/redshift/usage.py +4 -3
datahub/ingestion/source/s3/source.py +19 -3
datahub/ingestion/source/sigma/sigma.py +6 -1
datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
datahub/ingestion/source/sql/hive_metastore.py +0 -10
datahub/ingestion/source/sql/sql_common.py +4 -0
datahub/ingestion/source/sql/vertica.py +0 -4
datahub/ingestion/source/sql_queries.py +2 -2
datahub/ingestion/source/superset.py +56 -1
datahub/ingestion/source/tableau/tableau.py +40 -34
datahub/ingestion/source/tableau/tableau_constant.py +0 -2
datahub/ingestion/source/unity/proxy.py +4 -3
datahub/ingestion/source/unity/source.py +19 -9
datahub/integrations/assertion/snowflake/compiler.py +4 -3
datahub/metadata/_internal_schema_classes.py +85 -4
datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
datahub/metadata/schema.avsc +54 -1
datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
datahub/sdk/lineage_client.py +2 -0
datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
datahub/sql_parsing/sqlglot_lineage.py +40 -13
datahub/upgrade/upgrade.py +46 -13
datahub/utilities/server_config_util.py +8 -0
datahub/utilities/sqlalchemy_query_combiner.py +5 -2
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
{acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/stored_proc_lineage.py ADDED Viewed

@@ -0,0 +1,143 @@
+import dataclasses
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Iterable, List, Optional
+from datahub.ingestion.api.closeable import Closeable
+from datahub.metadata.urns import CorpUserUrn
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    PreparsedQuery,
+    UrnStr,
+)
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
+from datahub.utilities.file_backed_collections import FileBackedDict
+@dataclasses.dataclass
+class StoredProcCall:
+    snowflake_root_query_id: str
+    # Query text will typically be something like:
+    # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
+    query_text: str
+    timestamp: datetime
+    user: CorpUserUrn
+    default_db: str
+    default_schema: str
+@dataclass
+class StoredProcExecutionLineage:
+    call: StoredProcCall
+    inputs: List[UrnStr]
+    outputs: List[UrnStr]
+@dataclass
+class StoredProcLineageReport:
+    num_stored_proc_calls: int = 0
+    num_related_queries: int = 0
+    num_related_queries_without_proc_call: int = 0
+    # Incremented at generation/build time.
+    num_stored_proc_lineage_entries: int = 0
+    num_stored_proc_calls_with_no_inputs: int = 0
+    num_stored_proc_calls_with_no_outputs: int = 0
+class StoredProcLineageTracker(Closeable):
+    """
+    Tracks table-level lineage for Snowflake stored procedures.
+    Stored procedures in Snowflake trigger multiple SQL queries during execution.
+    Snowflake assigns each stored procedure call a unique query_id and uses this as the
+    root_query_id for all subsequent queries executed within that procedure. This allows
+    us to trace which queries belong to a specific stored procedure execution and build
+    table-level lineage by aggregating inputs/outputs from all related queries.
+    """
+    def __init__(self, platform: str, shared_connection: Optional[Any] = None):
+        self.platform = platform
+        self.report = StoredProcLineageReport()
+        # { root_query_id -> StoredProcExecutionLineage }
+        self._stored_proc_execution_lineage: FileBackedDict[
+            StoredProcExecutionLineage
+        ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")
+    def add_stored_proc_call(self, call: StoredProcCall) -> None:
+        """Add a stored procedure call to track."""
+        self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
+            StoredProcExecutionLineage(
+                call=call,
+                # Will be populated by subsequent queries.
+                inputs=[],
+                outputs=[],
+            )
+        )
+        self.report.num_stored_proc_calls += 1
+    def add_related_query(self, query: PreparsedQuery) -> bool:
+        """Add a query that might be related to a stored procedure execution.
+        Returns True if the query was added to a stored procedure execution, False otherwise.
+        """
+        snowflake_root_query_id = (query.extra_info or {}).get(
+            "snowflake_root_query_id"
+        )
+        if snowflake_root_query_id:
+            if snowflake_root_query_id not in self._stored_proc_execution_lineage:
+                self.report.num_related_queries_without_proc_call += 1
+                return False
+            stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
+                snowflake_root_query_id
+            )
+            stored_proc_execution.inputs.extend(query.upstreams)
+            if query.downstream is not None:
+                stored_proc_execution.outputs.append(query.downstream)
+            self.report.num_related_queries += 1
+            return True
+        return False
+    def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
+        # For stored procedures, we can only get table-level lineage from the audit log.
+        # We represent these as PreparsedQuery objects for now. Eventually we'll want to
+        # create dataJobInputOutput lineage instead.
+        for stored_proc_execution in self._stored_proc_execution_lineage.values():
+            if not stored_proc_execution.inputs:
+                self.report.num_stored_proc_calls_with_no_inputs += 1
+                continue
+            if not stored_proc_execution.outputs:
+                self.report.num_stored_proc_calls_with_no_outputs += 1
+                # Still continue to generate lineage for cases where we have inputs but no outputs
+            for downstream in stored_proc_execution.outputs:
+                stored_proc_query_id = get_query_fingerprint(
+                    stored_proc_execution.call.query_text,
+                    self.platform,
+                    fast=True,
+                    secondary_id=downstream,
+                )
+                lineage_entry = PreparsedQuery(
+                    query_id=stored_proc_query_id,
+                    query_text=stored_proc_execution.call.query_text,
+                    upstreams=stored_proc_execution.inputs,
+                    downstream=downstream,
+                    query_count=0,
+                    user=stored_proc_execution.call.user,
+                    timestamp=stored_proc_execution.call.timestamp,
+                )
+                self.report.num_stored_proc_lineage_entries += 1
+                yield lineage_entry
+    def close(self) -> None:
+        self._stored_proc_execution_lineage.close()

datahub/ingestion/source/sql/hive_metastore.py CHANGED Viewed

@@ -52,7 +52,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import Dataset
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
     ViewPropertiesClass,
@@ -601,10 +600,7 @@ class HiveMetastoreSource(SQLAlchemySource):
                 yield dpi_aspect
             yield MetadataChangeProposalWrapper(
-                entityType="dataset",
-                changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
-                aspectName="subTypes",
                 aspect=SubTypesClass(typeNames=[self.table_subtype]),
             ).as_workunit()
@@ -810,10 +806,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             # Add views subtype
             yield MetadataChangeProposalWrapper(
-                entityType="dataset",
-                changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
-                aspectName="subTypes",
                 aspect=SubTypesClass(typeNames=[self.view_subtype]),
             ).as_workunit()
@@ -824,10 +817,7 @@ class HiveMetastoreSource(SQLAlchemySource):
                 viewLogic=dataset.view_definition if dataset.view_definition else "",
             )
             yield MetadataChangeProposalWrapper(
-                entityType="dataset",
-                changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
-                aspectName="viewProperties",
                 aspect=view_properties_aspect,
             ).as_workunit()

datahub/ingestion/source/sql/sql_common.py CHANGED Viewed

@@ -292,6 +292,10 @@ class ProfileMetadata:
     SourceCapability.CONTAINERS,
     "Enabled by default",
     supported=True,
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
 )
 @capability(
     SourceCapability.DESCRIPTIONS,

datahub/ingestion/source/sql/vertica.py CHANGED Viewed

@@ -45,7 +45,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
@@ -501,10 +500,7 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()

datahub/ingestion/source/sql_queries.py CHANGED Viewed

@@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         description="The default schema to use for unqualified table names",
         default=None,
     )
-    default_dialect: Optional[str] = Field(
+    override_dialect: Optional[str] = Field(
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
         default=None,
     )
@@ -181,7 +181,7 @@ class SqlQueriesSource(Source):
             schema_resolver=self.schema_resolver,
             default_db=self.config.default_db,
             default_schema=self.config.default_schema,
-            default_dialect=self.config.default_dialect,
+            override_dialect=self.config.override_dialect,
         )
         if result.debug_info.table_error:
             logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")

datahub/ingestion/source/superset.py CHANGED Viewed

@@ -658,6 +658,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         if datasource_id:
             dataset_info = self.get_dataset_info(datasource_id).get("result", {})
             dataset_column_info = dataset_info.get("columns", [])
+            dataset_metric_info = dataset_info.get("metrics", [])
             for column in dataset_column_info:
                 col_name = column.get("column_name", "")
@@ -671,6 +672,17 @@ class SupersetSource(StatefulIngestionSourceBase):
                     continue
                 dataset_columns.append((col_name, col_type, col_description))
+            for metric in dataset_metric_info:
+                metric_name = metric.get("metric_name", "")
+                metric_type = metric.get("metric_type", "")
+                metric_description = metric.get("description", "")
+                if metric_name == "" or metric_type == "":
+                    logger.info(f"could not construct metric lineage for {metric}")
+                    continue
+                dataset_columns.append((metric_name, metric_type, metric_description))
         else:
             # if no datasource id, cannot build cll, just return
             logger.warning(
@@ -972,19 +984,44 @@ class SupersetSource(StatefulIngestionSourceBase):
             schema_fields.append(field)
         return schema_fields
+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
     def gen_schema_metadata(
         self,
         dataset_response: dict,
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=self.gen_schema_fields(column_data),
+            fields=column_fields + metric_fields,
         )
         return schema_metadata
@@ -1049,6 +1086,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
         fine_grained_lineages: List[FineGrainedLineageClass] = []
         for column in columns:
@@ -1067,6 +1106,22 @@ class SupersetSource(StatefulIngestionSourceBase):
                 )
             )
+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(

datahub/ingestion/source/tableau/tableau.py CHANGED Viewed

@@ -149,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     ChartInfoClass,
     ChartUsageStatisticsClass,
     DashboardInfoClass,
@@ -529,6 +528,14 @@ class TableauConfig(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )
+    emit_all_published_datasources: bool = Field(
+        default=False,
+        description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
+    )
+    emit_all_embedded_datasources: bool = Field(
+        default=False,
+        description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
+    )
     env: str = Field(
         default=builder.DEFAULT_ENV,
@@ -2180,32 +2187,32 @@ class TableauSiteSource:
                     else []
                 )
-                # The Tableau SQL parser much worse than our sqlglot based parser,
-                # so relying on metadata parsed by Tableau from SQL queries can be
-                # less accurate. This option allows us to ignore Tableau's parser and
-                # only use our own.
-                if self.config.force_extraction_of_lineage_from_custom_sql_queries:
-                    logger.debug("Extracting TLL & CLL from custom sql (forced)")
+                tableau_table_list = csql.get(c.TABLES, [])
+                if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
+                    not tableau_table_list
+                    and self.config.extract_lineage_from_unsupported_custom_sql_queries
+                ):
+                    if not tableau_table_list:
+                        # custom sql tables may contain unsupported sql, causing incomplete lineage
+                        # we extract the lineage from the raw queries
+                        logger.debug(
+                            "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
+                        )
+                    else:
+                        # The Tableau SQL parser is much worse than our sqlglot based parser,
+                        # so relying on metadata parsed by Tableau from SQL queries can be
+                        # less accurate. This option allows us to ignore Tableau's parser and
+                        # only use our own.
+                        logger.debug("Parsing TLL & CLL from custom sql (forced)")
                     yield from self._create_lineage_from_unsupported_csql(
                         csql_urn, csql, columns
                     )
                 else:
-                    tables = csql.get(c.TABLES, [])
-                    if tables:
-                        # lineage from custom sql -> datasets/tables #
-                        yield from self._create_lineage_to_upstream_tables(
-                            csql_urn, tables, datasource
-                        )
-                    elif (
-                        self.config.extract_lineage_from_unsupported_custom_sql_queries
-                    ):
-                        logger.debug("Extracting TLL & CLL from custom sql")
-                        # custom sql tables may contain unsupported sql, causing incomplete lineage
-                        # we extract the lineage from the raw queries
-                        yield from self._create_lineage_from_unsupported_csql(
-                            csql_urn, csql, columns
-                        )
+                    # lineage from custom sql -> datasets/tables #
+                    yield from self._create_lineage_to_upstream_tables(
+                        csql_urn, tableau_table_list, datasource
+                    )
             #  Schema Metadata
             schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
@@ -2243,7 +2250,6 @@ class TableauSiteSource:
             yield self.get_metadata_change_event(dataset_snapshot)
             yield self.get_metadata_change_proposal(
                 dataset_snapshot.urn,
-                aspect_name=c.SUB_TYPES,
                 aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
             )
@@ -2408,7 +2414,6 @@ class TableauSiteSource:
             upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
             yield self.get_metadata_change_proposal(
                 csql_urn,
-                aspect_name=c.UPSTREAM_LINEAGE,
                 aspect=upstream_lineage,
             )
             self.report.num_tables_with_upstream_lineage += 1
@@ -2594,7 +2599,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2640,14 +2644,10 @@ class TableauSiteSource:
     def get_metadata_change_proposal(
         self,
         urn: str,
-        aspect_name: str,
         aspect: Union["UpstreamLineage", "SubTypesClass"],
     ) -> MetadataWorkUnit:
         return MetadataChangeProposalWrapper(
-            entityType=c.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=urn,
-            aspectName=aspect_name,
             aspect=aspect,
         ).as_workunit()
@@ -2755,7 +2755,6 @@ class TableauSiteSource:
                 )
                 yield self.get_metadata_change_proposal(
                     datasource_urn,
-                    aspect_name=c.UPSTREAM_LINEAGE,
                     aspect=upstream_lineage,
                 )
                 self.report.num_tables_with_upstream_lineage += 1
@@ -2774,7 +2773,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(
                 typeNames=(
                     ["Embedded Data Source"]
@@ -2860,7 +2858,11 @@ class TableauSiteSource:
         return datasource
     def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_published_datasources
+            else {c.ID_WITH_IN: self.datasource_ids_being_used}
+        )
         for datasource in self.get_connection_objects(
             query=published_datasource_graphql_query,
@@ -3553,7 +3555,11 @@ class TableauSiteSource:
         return browse_paths
     def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_embedded_datasources
+            else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        )
         for datasource in self.get_connection_objects(
             query=embedded_datasource_graphql_query,

datahub/ingestion/source/tableau/tableau_constant.py CHANGED Viewed

@@ -50,7 +50,6 @@ TABLES = "tables"
 DESCRIPTION = "description"
 SQL = "SQL"
 QUERY = "query"
-SUB_TYPES = "subTypes"
 VIEW = "view"
 CUSTOM_SQL = "Custom SQL"
 REMOTE_TYPE = "remoteType"
@@ -58,7 +57,6 @@ UNKNOWN = "UNKNOWN"
 PUBLISHED_DATA_SOURCE = "PublishedDatasource"
 LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
-UPSTREAM_LINEAGE = "upstreamLineage"
 OWNER = "owner"
 USERNAME = "username"
 HAS_EXTRACTS = "hasExtracts"

datahub/ingestion/source/unity/proxy.py CHANGED Viewed

@@ -507,9 +507,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     def _execute_sql_query(self, query: str) -> List[List[str]]:
         """Execute SQL query using databricks-sql connector for better performance"""
         try:
-            with connect(
-                **self._sql_connection_params
-            ) as connection, connection.cursor() as cursor:
+            with (
+                connect(**self._sql_connection_params) as connection,
+                connection.cursor() as cursor,
+            ):
                 cursor.execute(query)
                 return cursor.fetchall()

datahub/ingestion/source/unity/source.py CHANGED Viewed

@@ -56,6 +56,7 @@ from datahub.ingestion.source.aws.s3_util import (
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -152,7 +153,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @capability(SourceCapability.USAGE_STATS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.CATALOG,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.OWNERSHIP, "Supported via the `include_ownership` config")
 @capability(
     SourceCapability.DATA_PROFILING, "Supported via the `profiling.enabled` config"
@@ -768,10 +776,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
-        schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
-            schema.catalog.name
-        ).get(f"{schema.catalog.name}.{schema.name}", [])
-        if schema_tags:
+        schema_tags = []
+        if self.config.include_tags:
+            schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
+                schema.catalog.name
+            ).get(f"{schema.catalog.name}.{schema.name}", [])
             logger.debug(f"Schema tags for {schema.name}: {schema_tags}")
             # Generate platform resources for schema tags
             yield from self.gen_platform_resources(schema_tags)
@@ -809,10 +818,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
         domain_urn = self._gen_domain_urn(catalog.name)
-        catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog.name).get(
-            catalog.name, []
-        )
-        if catalog_tags:
+        catalog_tags = []
+        if self.config.include_tags:
+            catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(
+                catalog.name
+            ).get(catalog.name, [])
             logger.debug(f"Schema tags for {catalog.name}: {catalog_tags}")
             # Generate platform resources for schema tags
             yield from self.gen_platform_resources(catalog_tags)

datahub/integrations/assertion/snowflake/compiler.py CHANGED Viewed

@@ -84,9 +84,10 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
         dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME
         dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME
-        with (dmf_definitions_path).open("w") as definitions, (
-            dmf_associations_path
-        ).open("w") as associations:
+        with (
+            (dmf_definitions_path).open("w") as definitions,
+            (dmf_associations_path).open("w") as associations,
+        ):
             for assertion_spec in assertion_config_spec.assertions:
                 result.report.num_processed += 1
                 try:

acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

Potentially problematic release.

acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl