acryl-datahub 0.15.0.5rc7__py3-none-any.whl → 0.15.0.5rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (38)
  1. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/METADATA +2493 -2463
  2. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/RECORD +38 -35
  3. datahub/_version.py +1 -1
  4. datahub/cli/iceberg_cli.py +707 -0
  5. datahub/entrypoints.py +21 -0
  6. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  7. datahub/ingestion/glossary/classification_mixin.py +6 -0
  8. datahub/ingestion/glossary/classifier.py +3 -2
  9. datahub/ingestion/source/aws/glue.py +3 -2
  10. datahub/ingestion/source/identity/azure_ad.py +6 -14
  11. datahub/ingestion/source/mode.py +2 -4
  12. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  13. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  14. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  15. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  16. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -36
  17. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  18. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  19. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  20. datahub/ingestion/source/sql/mssql/source.py +17 -0
  21. datahub/ingestion/source/tableau/tableau.py +14 -12
  22. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  23. datahub/metadata/_schema_classes.py +160 -2
  24. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  25. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  26. datahub/metadata/schema.avsc +96 -7
  27. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  28. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  29. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  30. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  31. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  32. datahub/metadata/schemas/MetadataChangeEvent.avsc +5 -5
  33. datahub/specific/dashboard.py +43 -1
  34. datahub/upgrade/upgrade.py +13 -5
  35. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/LICENSE +0 -0
  36. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/WHEEL +0 -0
  37. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/entry_points.txt +0 -0
  38. {acryl_datahub-0.15.0.5rc7.dist-info → acryl_datahub-0.15.0.5rc9.dist-info}/top_level.txt +0 -0
datahub/entrypoints.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+import multiprocessing
 import os
 import platform
 import sys
@@ -183,6 +184,18 @@ datahub.add_command(datacontract)
 datahub.add_command(assertions)
 datahub.add_command(container)

+try:
+    from datahub.cli.iceberg_cli import iceberg
+
+    datahub.add_command(iceberg)
+except ImportError as e:
+    logger.debug(f"Failed to load datahub iceberg command: {e}")
+    datahub.add_command(
+        make_shim_command(
+            "iceberg", "run `pip install 'acryl-datahub[iceberg-catalog]'`"
+        )
+    )
+
 try:
     from datahub.cli.lite_cli import lite

@@ -205,6 +218,14 @@ except ImportError as e:


 def main(**kwargs):
+    # We use threads in a variety of places within our CLI. The multiprocessing
+    # "fork" start method is not safe to use with threads.
+    # MacOS and Windows already default to "spawn", and Linux will as well starting in Python 3.14.
+    # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+    # Eventually it may make sense to use "forkserver" as the default where available,
+    # but we can revisit that in the future.
+    multiprocessing.set_start_method("spawn", force=True)
+
     # This wrapper prevents click from suppressing errors.
     try:
         sys.exit(datahub(standalone_mode=False, **kwargs))
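The main() change above forces the "spawn" start method because the CLI already uses threads internally, and forking a threaded parent process can deadlock the child. A minimal, standalone sketch of the same pattern (the toy worker function and background thread are illustrative, not part of the package):

import multiprocessing
import threading


def worker(x: int) -> int:
    # Must be defined at module level so "spawn" workers can import it by reference.
    return x * x


def main() -> None:
    # Force "spawn" before any pools or processes are created, mirroring the CLI change.
    multiprocessing.set_start_method("spawn", force=True)

    # A background thread in the parent; under "fork" this is the situation that can
    # leave a child process with copies of locks held by other threads.
    t = threading.Thread(target=lambda: None)
    t.start()

    with multiprocessing.Pool(processes=2) as pool:
        print(pool.map(worker, range(4)))  # [0, 1, 4, 9]
    t.join()


if __name__ == "__main__":
    main()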
datahub/ingestion/api/incremental_lineage_helper.py CHANGED
@@ -102,6 +102,10 @@ def convert_dashboard_info_to_patch(
     if aspect.datasets:
         patch_builder.add_datasets(aspect.datasets)

+    if aspect.dashboards:
+        for dashboard in aspect.dashboards:
+            patch_builder.add_dashboard(dashboard)
+
     if aspect.access:
         patch_builder.set_access(aspect.access)

datahub/ingestion/glossary/classification_mixin.py CHANGED
@@ -1,5 +1,6 @@
 import concurrent.futures
 import logging
+import multiprocessing
 from dataclasses import dataclass, field
 from functools import partial
 from math import ceil
@@ -182,6 +183,11 @@ class ClassificationHandler:

         with concurrent.futures.ProcessPoolExecutor(
             max_workers=self.config.classification.max_workers,
+            # The fork start method, which is the default on Linux for Python < 3.14, is not
+            # safe when the main process uses threads. The default start method on windows/macOS is
+            # already spawn, and will be changed to spawn for Linux in Python 3.14.
+            # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+            mp_context=multiprocessing.get_context("spawn"),
         ) as executor:
             column_info_proposal_futures = [
                 executor.submit(
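The classification change applies the same reasoning locally: instead of changing the global start method, it hands ProcessPoolExecutor an explicit "spawn" context. A self-contained sketch of that pattern, with a stand-in classify function:

import concurrent.futures
import multiprocessing


def classify(value: str) -> str:
    # Stand-in for the real per-column classification work.
    return value.upper()


if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor(
        max_workers=4,
        # "spawn" starts each worker from a clean interpreter instead of forking
        # a (possibly multi-threaded) parent process.
        mp_context=multiprocessing.get_context("spawn"),
    ) as executor:
        futures = [executor.submit(classify, v) for v in ["email", "ssn", "phone"]]
        print([f.result() for f in futures])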
datahub/ingestion/glossary/classifier.py CHANGED
@@ -1,3 +1,4 @@
+import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
@@ -37,8 +38,8 @@ class ClassificationConfig(ConfigModel):
     )

     max_workers: int = Field(
-        default=1,
-        description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
+        default=(os.cpu_count() or 4),
+        description="Number of worker processes to use for classification. Set to 1 to disable.",
     )

     table_pattern: AllowDenyPattern = Field(
datahub/ingestion/source/aws/glue.py CHANGED
@@ -113,6 +113,7 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -220,7 +221,7 @@ class GlueSourceConfig(
 class GlueSourceReport(StaleEntityRemovalSourceReport):
     catalog_id: Optional[str] = None
     tables_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
     databases: EntityFilterReport = EntityFilterReport.field(type="database")

     num_job_script_location_missing: int = 0
@@ -746,7 +747,7 @@ class GlueSource(StatefulIngestionSourceBase):
                 for tables in self.get_tables_from_database(database):
                     all_tables.append(tables)
             except Exception as e:
-                self.report.failure(
+                self.report.warning(
                     message="Failed to get tables from database",
                     context=database["Name"],
                     exc=e,
datahub/ingestion/source/identity/azure_ad.py CHANGED
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter, Retry

 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mce_builder import make_group_urn, make_user_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -51,6 +52,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         description="regex patterns for groups to include in ingestion.",
     )

-    # If enabled, report will contain names of filtered users and groups.
-    filtered_tracking: bool = Field(
-        default=True,
-        description="If enabled, report will contain names of filtered users and groups.",
-    )
+    _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")

     # Optional: Whether to mask sensitive information from workunit ID's. On by default.
     mask_group_id: bool = Field(
@@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclass
 class AzureADSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
-    filtered_tracking: bool = field(default=True, repr=False)
-    filtered_count: int = field(default=0)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
-        self.filtered_count += 1
-        if self.filtered_tracking:
-            self.filtered.append(name)
+        self.filtered.append(name)


 # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API
@@ -266,9 +260,7 @@ class AzureADSource(StatefulIngestionSourceBase):
     def __init__(self, config: AzureADConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.config = config
-        self.report = AzureADSourceReport(
-            filtered_tracking=self.config.filtered_tracking
-        )
+        self.report = AzureADSourceReport()
         session = requests.Session()
         retries = Retry(
             total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
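This file (and mode.py below) retires a config option via datahub's pydantic_removed_field helper, whose implementation is not shown in this diff. A rough, hypothetical sketch of how such a shim can be built with pydantic v1-style validators (not the actual datahub code): register a pre root validator that drops the removed key and warns.

import warnings

import pydantic  # assumes pydantic v1-style validators


def removed_field(field_name: str) -> classmethod:
    def _drop_removed(cls, values: dict) -> dict:
        if field_name in values:
            warnings.warn(f"The '{field_name}' option was removed and is now ignored.")
            values.pop(field_name)
        return values

    # Give each generated validator a unique name so several removed fields
    # on one model do not shadow each other.
    _drop_removed.__name__ = f"_drop_removed_{field_name}"
    return pydantic.root_validator(pre=True, allow_reuse=True)(_drop_removed)


class ExampleConfig(pydantic.BaseModel):
    workspace: str

    _drop_filtered_tracking = removed_field("filtered_tracking")


config = ExampleConfig.parse_obj({"workspace": "acme", "filtered_tracking": True})
print(config)  # the removed key is accepted but discarded with a warning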
datahub/ingestion/source/mode.py CHANGED
@@ -24,6 +24,7 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
@@ -155,10 +156,7 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
     workspace: str = Field(
         description="The Mode workspace name. Find it in Settings > Workspace > Details."
     )
-    default_schema: str = Field(
-        default="public",
-        description="Default schema to use when schema is not provided in an SQL query",
-    )
+    _default_schema = pydantic_removed_field("default_schema")

     space_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(
datahub/ingestion/source/snowflake/snowflake_config.py CHANGED
@@ -249,6 +249,12 @@ class SnowflakeV2Config(
         description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
     )

+    structured_properties_template_cache_invalidation_interval: int = Field(
+        hidden_from_docs=True,
+        default=60,
+        description="Interval in seconds to invalidate the structured properties template cache.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",
@@ -302,6 +308,13 @@ class SnowflakeV2Config(
         " assertions CLI in snowflake",
     )

+    pushdown_deny_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
+        "Only applicable if `use_queries_v2` is enabled.",
+    )
+
     @validator("convert_urns_to_lowercase")
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
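Both new Snowflake options are ordinary recipe fields. An illustrative fragment in Python dict form (hypothetical usernames; required connection settings such as account_id and credentials are omitted):

snowflake_config_fragment = {
    # Skip lineage/usage/queries attribution for noisy service accounts.
    # Only applies when use_queries_v2 is enabled.
    "pushdown_deny_usernames": ["FIVETRAN_SVC_USER", "AIRFLOW_SVC_USER"],
    # Hidden option: how long to wait for the structured-properties template cache
    # to invalidate when extract_tags_as_structured_properties is enabled.
    "structured_properties_template_cache_invalidation_interval": 60,
}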
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED
@@ -159,6 +159,17 @@ class SnowflakeQuery:
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""

+    @staticmethod
+    def get_all_tags():
+        return """
+        SELECT tag_database as "TAG_DATABASE",
+        tag_schema AS "TAG_SCHEMA",
+        tag_name AS "TAG_NAME",
+        FROM snowflake.account_usage.tag_references
+        GROUP BY TAG_DATABASE , TAG_SCHEMA, tag_name
+        ORDER BY TAG_DATABASE, TAG_SCHEMA, TAG_NAME ASC;
+        """
+
     @staticmethod
     def get_all_tags_on_object_with_propagation(
         db_name: str, quoted_identifier: str, domain: str
datahub/ingestion/source/snowflake/snowflake_report.py CHANGED
@@ -114,6 +114,7 @@ class SnowflakeV2Report(
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
     num_secure_views_missing_definition: int = 0
+    num_structured_property_templates_created: int = 0

     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None

datahub/ingestion/source/snowflake/snowflake_schema.py CHANGED
@@ -285,6 +285,23 @@ class SnowflakeDataDictionary(SupportsAsObj):

         return secure_view_definitions

+    def get_all_tags(self) -> List[SnowflakeTag]:
+        cur = self.connection.query(
+            SnowflakeQuery.get_all_tags(),
+        )
+
+        tags = [
+            SnowflakeTag(
+                database=tag["TAG_DATABASE"],
+                schema=tag["TAG_SCHEMA"],
+                name=tag["TAG_NAME"],
+                value="",
+            )
+            for tag in cur
+        ]
+
+        return tags
+
     @serialized_lru_cache(maxsize=1)
     def get_tables_for_database(
         self, db_name: str
datahub/ingestion/source/snowflake/snowflake_schema_gen.py CHANGED
@@ -1,10 +1,10 @@
 import itertools
 import logging
+import time
 from typing import Dict, Iterable, List, Optional, Union

 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import (
-    get_sys_time,
     make_data_platform_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
@@ -74,7 +74,6 @@ from datahub.ingestion.source_report.ingestion_stage import (
     PROFILING,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
-    AuditStamp,
     GlobalTags,
     Status,
     SubTypes,
@@ -101,15 +100,8 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     StringType,
     TimeType,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.structured import (
-    StructuredPropertyDefinition,
-)
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
 from datahub.metadata.urns import (
-    ContainerUrn,
-    DatasetUrn,
-    DataTypeUrn,
-    EntityTypeUrn,
     SchemaFieldUrn,
     StructuredPropertyUrn,
 )
@@ -191,7 +183,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.domain_registry: Optional[DomainRegistry] = domain_registry
         self.classification_handler = ClassificationHandler(self.config, self.report)
         self.tag_extractor = SnowflakeTagExtractor(
-            config, self.data_dictionary, self.report
+            config, self.data_dictionary, self.report, identifiers
         )
         self.profiler: Optional[SnowflakeProfiler] = profiler
         self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
@@ -217,6 +209,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         return self.identifiers.snowflake_identifier(identifier)

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.extract_tags_as_structured_properties:
+            logger.info("Creating structured property templates for tags")
+            yield from self.tag_extractor.create_structured_property_templates()
+            # We have to wait until cache invalidates to make sure the structured property template is available
+            logger.info(
+                f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+            )
+            time.sleep(
+                self.config.structured_properties_template_cache_invalidation_interval
+            )
         self.databases = []
         for database in self.get_databases() or []:
             self.report.report_entity_scanned(database.name, "database")
@@ -698,6 +700,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
         use_sp = self.config.extract_tags_as_structured_properties
+
         identifier = (
             self.snowflake_identifier(tag.structured_property_identifier())
             if use_sp
@@ -708,10 +711,11 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             return

         self.report.report_tag_processed(identifier)
+
         if use_sp:
-            yield from self.gen_tag_as_structured_property_workunits(tag)
-        else:
-            yield from self.gen_tag_workunits(tag)
+            return
+
+        yield from self.gen_tag_workunits(tag)

     def _format_tags_as_structured_properties(
         self, tags: List[SnowflakeTag]
@@ -732,6 +736,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         if table.tags:
             for tag in table.tags:
                 yield from self._process_tag(tag)
+
         for column_name in table.column_tags:
             for tag in table.column_tags[column_name]:
                 yield from self._process_tag(tag)
@@ -903,29 +908,6 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             entityUrn=tag_urn, aspect=tag_properties_aspect
         ).as_workunit()

-    def gen_tag_as_structured_property_workunits(
-        self, tag: SnowflakeTag
-    ) -> Iterable[MetadataWorkUnit]:
-        identifier = self.snowflake_identifier(tag.structured_property_identifier())
-        urn = StructuredPropertyUrn(identifier).urn()
-        aspect = StructuredPropertyDefinition(
-            qualifiedName=identifier,
-            displayName=tag.name,
-            valueType=DataTypeUrn("datahub.string").urn(),
-            entityTypes=[
-                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
-                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
-                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
-            ],
-            lastModified=AuditStamp(
-                time=get_sys_time(), actor="urn:li:corpuser:datahub"
-            ),
-        )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=urn,
-            aspect=aspect,
-        ).as_workunit()
-
     def gen_column_tags_as_structured_properties(
         self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
     ) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/snowflake/snowflake_tag.py CHANGED
@@ -1,6 +1,9 @@
 import logging
-from typing import Dict, List, Optional
+from typing import Dict, Iterable, List, Optional

+from datahub.emitter.mce_builder import get_sys_time
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     SnowflakeV2Config,
@@ -12,7 +15,22 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeTag,
     _SnowflakeTagCache,
 )
-from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
+from datahub.ingestion.source.snowflake.snowflake_utils import (
+    SnowflakeCommonMixin,
+    SnowflakeIdentifierBuilder,
+)
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
+from datahub.metadata.com.linkedin.pegasus2avro.structured import (
+    StructuredPropertyDefinition,
+)
+from datahub.metadata.urns import (
+    ContainerUrn,
+    DatasetUrn,
+    DataTypeUrn,
+    EntityTypeUrn,
+    SchemaFieldUrn,
+    StructuredPropertyUrn,
+)

 logger: logging.Logger = logging.getLogger(__name__)

@@ -23,11 +41,12 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         config: SnowflakeV2Config,
         data_dictionary: SnowflakeDataDictionary,
         report: SnowflakeV2Report,
+        snowflake_identifiers: SnowflakeIdentifierBuilder,
     ) -> None:
         self.config = config
         self.data_dictionary = data_dictionary
         self.report = report
-
+        self.snowflake_identifiers = snowflake_identifiers
         self.tag_cache: Dict[str, _SnowflakeTagCache] = {}

     def _get_tags_on_object_without_propagation(
@@ -59,6 +78,41 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
             raise ValueError(f"Unknown domain {domain}")
         return tags

+    def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
+        for tag in self.data_dictionary.get_all_tags():
+            if not self.config.structured_property_pattern.allowed(
+                tag.tag_identifier()
+            ):
+                continue
+            if self.config.extract_tags_as_structured_properties:
+                self.report.num_structured_property_templates_created += 1
+                yield from self.gen_tag_as_structured_property_workunits(tag)
+
+    def gen_tag_as_structured_property_workunits(
+        self, tag: SnowflakeTag
+    ) -> Iterable[MetadataWorkUnit]:
+        identifier = self.snowflake_identifiers.snowflake_identifier(
+            tag.structured_property_identifier()
+        )
+        urn = StructuredPropertyUrn(identifier).urn()
+        aspect = StructuredPropertyDefinition(
+            qualifiedName=identifier,
+            displayName=tag.name,
+            valueType=DataTypeUrn("datahub.string").urn(),
+            entityTypes=[
+                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
+            ],
+            lastModified=AuditStamp(
+                time=get_sys_time(), actor="urn:li:corpuser:datahub"
+            ),
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            aspect=aspect,
+        ).as_workunit()
+
     def _get_tags_on_object_with_propagation(
         self,
         domain: str,
datahub/ingestion/source/snowflake/snowflake_v2.py CHANGED
@@ -567,6 +567,7 @@ class SnowflakeV2Source(
                 include_queries=self.config.include_queries,
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
+                pushdown_deny_usernames=self.config.pushdown_deny_usernames,
             ),
             structured_report=self.report,
             filters=self.filters,
datahub/ingestion/source/sql/mssql/job_models.py CHANGED
@@ -7,7 +7,12 @@ from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
 )
+from datahub.emitter.mcp_builder import (
+    DatabaseKey,
+    SchemaKey,
+)
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataFlowInfoClass,
     DataJobInfoClass,
     DataJobInputOutputClass,
@@ -171,11 +176,7 @@ class MSSQLDataJob:
             flow_id=self.entity.flow.formatted_name,
             job_id=self.entity.formatted_name,
             cluster=self.entity.flow.cluster,
-            platform_instance=(
-                self.entity.flow.platform_instance
-                if self.entity.flow.platform_instance
-                else None
-            ),
+            platform_instance=self.entity.flow.platform_instance,
         )

     def add_property(
@@ -222,6 +223,26 @@ class MSSQLDataJob:
             )
         return None

+    @property
+    def as_container_aspect(self) -> ContainerClass:
+        key_args = dict(
+            platform=self.entity.flow.orchestrator,
+            instance=self.entity.flow.platform_instance,
+            env=self.entity.flow.env,
+            database=self.entity.flow.db,
+        )
+        container_key = (
+            SchemaKey(
+                schema=self.entity.schema,
+                **key_args,
+            )
+            if isinstance(self.entity, StoredProcedure)
+            else DatabaseKey(
+                **key_args,
+            )
+        )
+        return ContainerClass(container=container_key.as_urn())
+

 @dataclass
 class MSSQLDataFlow:
@@ -244,9 +265,7 @@ class MSSQLDataFlow:
             orchestrator=self.entity.orchestrator,
             flow_id=self.entity.formatted_name,
             cluster=self.entity.cluster,
-            platform_instance=(
-                self.entity.platform_instance if self.entity.platform_instance else None
-            ),
+            platform_instance=self.entity.platform_instance,
         )

     @property
@@ -267,3 +286,13 @@ class MSSQLDataFlow:
             ),
         )
         return None
+
+    @property
+    def as_container_aspect(self) -> ContainerClass:
+        databaseKey = DatabaseKey(
+            platform=self.entity.orchestrator,
+            instance=self.entity.platform_instance,
+            env=self.entity.env,
+            database=self.entity.db,
+        )
+        return ContainerClass(container=databaseKey.as_urn())
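The new as_container_aspect properties attach MSSQL flows and jobs to database/schema containers. A small sketch with illustrative values showing what those keys resolve to:

from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
from datahub.metadata.schema_classes import ContainerClass

# Container keys are deterministic: the same field values always map to the same URN.
db_key = DatabaseKey(platform="mssql", instance=None, env="PROD", database="DemoData")
schema_key = SchemaKey(
    platform="mssql", instance=None, env="PROD", database="DemoData", schema="dbo"
)

print(db_key.as_urn())  # urn:li:container:<guid derived from the key fields>
print(ContainerClass(container=schema_key.as_urn()))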
datahub/ingestion/source/sql/mssql/source.py CHANGED
@@ -108,6 +108,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=True,
         description="Enable lineage extraction for stored procedures",
     )
+    include_containers_for_pipelines: bool = Field(
+        default=False,
+        description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
@@ -641,6 +645,12 @@ class SQLServerSource(SQLAlchemySource):
                 aspect=data_platform_instance_aspect,
             ).as_workunit()

+        if self.config.include_containers_for_pipelines:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_job.as_container_aspect,
+            ).as_workunit()
+
         if include_lineage:
             yield MetadataChangeProposalWrapper(
                 entityUrn=data_job.urn,
@@ -683,6 +693,13 @@ class SQLServerSource(SQLAlchemySource):
                 entityUrn=data_flow.urn,
                 aspect=data_platform_instance_aspect,
             ).as_workunit()
+
+        if self.config.include_containers_for_pipelines:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_flow.urn,
+                aspect=data_flow.as_container_aspect,
+            ).as_workunit()
+
         # TODO: Add SubType when it appear

     def get_inspectors(self) -> Iterable[Inspector]:
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -2428,10 +2428,12 @@ class TableauSiteSource:
             ]
         ],
     ) -> Optional["SqlParsingResult"]:
-        database_info = datasource.get(c.DATABASE) or {
-            c.NAME: c.UNKNOWN.lower(),
-            c.CONNECTION_TYPE: datasource.get(c.CONNECTION_TYPE),
-        }
+        database_field = datasource.get(c.DATABASE) or {}
+        database_id: Optional[str] = database_field.get(c.ID)
+        database_name: Optional[str] = database_field.get(c.NAME) or c.UNKNOWN.lower()
+        database_connection_type: Optional[str] = database_field.get(
+            c.CONNECTION_TYPE
+        ) or datasource.get(c.CONNECTION_TYPE)

         if (
             datasource.get(c.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False)
@@ -2440,10 +2442,7 @@ class TableauSiteSource:
             logger.debug(f"datasource {datasource_urn} is not created from custom sql")
             return None

-        if (
-            database_info.get(c.NAME) is None
-            or database_info.get(c.CONNECTION_TYPE) is None
-        ):
+        if database_connection_type is None:
             logger.debug(
                 f"database information is missing from datasource {datasource_urn}"
             )
@@ -2459,14 +2458,14 @@ class TableauSiteSource:

         logger.debug(f"Parsing sql={query}")

-        upstream_db = database_info.get(c.NAME)
+        upstream_db = database_name

         if func_overridden_info is not None:
             # Override the information as per configuration
             upstream_db, platform_instance, platform, _ = func_overridden_info(
-                database_info[c.CONNECTION_TYPE],
-                database_info.get(c.NAME),
-                database_info.get(c.ID),
+                database_connection_type,
+                database_name,
+                database_id,
                 self.config.platform_instance_map,
                 self.config.lineage_overrides,
                 self.config.database_hostname_to_platform_instance_map,
@@ -2534,6 +2533,9 @@ class TableauSiteSource:
             platform_instance=self.config.platform_instance,
             func_overridden_info=get_overridden_info,
         )
+        logger.debug(
+            f"_create_lineage_from_unsupported_csql parsed_result = {parsed_result}"
+        )

         if parsed_result is None:
             return
datahub/ingestion/source/tableau/tableau_common.py CHANGED
@@ -761,7 +761,7 @@ class TableauUpstreamReference:


 def get_overridden_info(
-    connection_type: Optional[str],
+    connection_type: str,
     upstream_db: Optional[str],
     upstream_db_id: Optional[str],
     platform_instance_map: Optional[Dict[str, str]],