acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (120)
  1. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
  2. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
  3. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  8. datahub/api/entities/dataset/dataset.py +9 -18
  9. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  10. datahub/api/graphql/operation.py +10 -6
  11. datahub/cli/docker_check.py +2 -2
  12. datahub/configuration/common.py +29 -1
  13. datahub/configuration/connection_resolver.py +5 -2
  14. datahub/configuration/import_resolver.py +7 -4
  15. datahub/configuration/pydantic_migration_helpers.py +0 -9
  16. datahub/configuration/source_common.py +3 -2
  17. datahub/configuration/validate_field_deprecation.py +5 -2
  18. datahub/configuration/validate_field_removal.py +5 -2
  19. datahub/configuration/validate_field_rename.py +6 -5
  20. datahub/configuration/validate_multiline_string.py +5 -2
  21. datahub/ingestion/autogenerated/capability_summary.json +45 -1
  22. datahub/ingestion/run/pipeline_config.py +2 -2
  23. datahub/ingestion/source/azure/azure_common.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  25. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  26. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
  27. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  28. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  29. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  30. datahub/ingestion/source/datahub/config.py +8 -9
  31. datahub/ingestion/source/dbt/dbt_common.py +65 -5
  32. datahub/ingestion/source/delta_lake/config.py +1 -1
  33. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  34. datahub/ingestion/source/feast.py +8 -10
  35. datahub/ingestion/source/fivetran/config.py +1 -1
  36. datahub/ingestion/source/gcs/gcs_source.py +19 -2
  37. datahub/ingestion/source/ge_data_profiler.py +15 -2
  38. datahub/ingestion/source/ge_profiling_config.py +26 -22
  39. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  40. datahub/ingestion/source/grafana/models.py +12 -14
  41. datahub/ingestion/source/hex/hex.py +6 -1
  42. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  43. datahub/ingestion/source/kafka_connect/common.py +2 -2
  44. datahub/ingestion/source/looker/looker_common.py +76 -75
  45. datahub/ingestion/source/looker/looker_config.py +15 -4
  46. datahub/ingestion/source/looker/looker_source.py +493 -547
  47. datahub/ingestion/source/looker/lookml_config.py +1 -1
  48. datahub/ingestion/source/looker/lookml_source.py +46 -88
  49. datahub/ingestion/source/metabase.py +9 -2
  50. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  51. datahub/ingestion/source/metadata/lineage.py +1 -1
  52. datahub/ingestion/source/mode.py +13 -5
  53. datahub/ingestion/source/nifi.py +1 -1
  54. datahub/ingestion/source/powerbi/config.py +14 -21
  55. datahub/ingestion/source/preset.py +1 -1
  56. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  57. datahub/ingestion/source/redash.py +1 -1
  58. datahub/ingestion/source/redshift/config.py +6 -3
  59. datahub/ingestion/source/redshift/query.py +23 -19
  60. datahub/ingestion/source/s3/source.py +26 -24
  61. datahub/ingestion/source/salesforce.py +13 -9
  62. datahub/ingestion/source/schema/json_schema.py +14 -14
  63. datahub/ingestion/source/sigma/data_classes.py +3 -0
  64. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  65. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  66. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  67. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  68. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  69. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  70. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  71. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  72. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  73. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  74. datahub/ingestion/source/sql/athena.py +2 -1
  75. datahub/ingestion/source/sql/clickhouse.py +12 -7
  76. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  77. datahub/ingestion/source/sql/druid.py +2 -2
  78. datahub/ingestion/source/sql/hive.py +4 -3
  79. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  80. datahub/ingestion/source/sql/mssql/source.py +2 -2
  81. datahub/ingestion/source/sql/mysql.py +2 -2
  82. datahub/ingestion/source/sql/oracle.py +3 -3
  83. datahub/ingestion/source/sql/presto.py +2 -1
  84. datahub/ingestion/source/sql/teradata.py +4 -4
  85. datahub/ingestion/source/sql/trino.py +2 -1
  86. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  87. datahub/ingestion/source/sql/vertica.py +1 -1
  88. datahub/ingestion/source/sql_queries.py +6 -6
  89. datahub/ingestion/source/state/checkpoint.py +5 -1
  90. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  91. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  92. datahub/ingestion/source/superset.py +122 -15
  93. datahub/ingestion/source/tableau/tableau.py +68 -14
  94. datahub/ingestion/source/tableau/tableau_common.py +5 -0
  95. datahub/ingestion/source/tableau/tableau_constant.py +1 -0
  96. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  97. datahub/ingestion/source/unity/config.py +7 -3
  98. datahub/ingestion/source/usage/usage_common.py +3 -3
  99. datahub/ingestion/source_config/pulsar.py +3 -1
  100. datahub/ingestion/transformer/set_browse_path.py +112 -0
  101. datahub/metadata/_internal_schema_classes.py +728 -528
  102. datahub/metadata/_urns/urn_defs.py +1702 -1702
  103. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  104. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  105. datahub/metadata/schema.avsc +17434 -17732
  106. datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
  107. datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
  108. datahub/metadata/schemas/LogicalParent.avsc +2 -1
  109. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  110. datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
  111. datahub/sdk/_shared.py +126 -0
  112. datahub/sdk/chart.py +87 -30
  113. datahub/sdk/dashboard.py +79 -34
  114. datahub/sdk/entity_client.py +11 -4
  115. datahub/sdk/lineage_client.py +3 -3
  116. datahub/sdk/search_filters.py +1 -7
  117. datahub/sql_parsing/split_statements.py +13 -0
  118. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
  119. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
  120. {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/superset.py

@@ -9,9 +9,10 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
  import dateutil.parser as dp
  import requests
  import sqlglot
- from pydantic import BaseModel
- from pydantic.class_validators import root_validator, validator
+ from pydantic import BaseModel, root_validator, validator
  from pydantic.fields import Field
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
 
  import datahub.emitter.mce_builder as builder
  from datahub.configuration.common import AllowDenyPattern
@@ -109,6 +110,12 @@ logger = logging.getLogger(__name__)
 
  PAGE_SIZE = 25
 
+ # Retry configuration constants
+ RETRY_MAX_TIMES = 3
+ RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+ RETRY_BACKOFF_FACTOR = 1
+ RETRY_ALLOWED_METHODS = ["GET"]
+
 
  chart_type_from_viz_type = {
  "line": ChartTypeClass.LINE,
@@ -282,6 +289,7 @@ def get_filter_name(filter_obj):
  )
  @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
  @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+ @capability(SourceCapability.TAGS, "Supported by default")
  class SupersetSource(StatefulIngestionSourceBase):
  """
  This plugin extracts the following:
@@ -327,6 +335,19 @@ class SupersetSource(StatefulIngestionSourceBase):
  logger.debug("Got access token from superset")
 
  requests_session = requests.Session()
+
+ # Configure retry strategy for transient failures
+ retry_strategy = Retry(
+ total=RETRY_MAX_TIMES,
+ status_forcelist=RETRY_STATUS_CODES,
+ backoff_factor=RETRY_BACKOFF_FACTOR,
+ allowed_methods=RETRY_ALLOWED_METHODS,
+ raise_on_status=False,
+ )
+ adapter = HTTPAdapter(max_retries=retry_strategy)
+ requests_session.mount("http://", adapter)
+ requests_session.mount("https://", adapter)
+
  requests_session.headers.update(
  {
  "Authorization": f"Bearer {self.access_token}",
@@ -359,8 +380,13 @@ class SupersetSource(StatefulIngestionSourceBase):
  )
 
  if response.status_code != 200:
- logger.warning(f"Failed to get {entity_type} data: {response.text}")
- continue
+ self.report.warning(
+ title="Failed to fetch data from Superset API",
+ message="Incomplete metadata extraction due to Superset API failure",
+ context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+ )
+ # we stop pagination for this entity type and we continue the overall ingestion
+ break
 
  payload = response.json()
  # Update total_items with the actual count from the response
@@ -521,6 +547,11 @@ class SupersetSource(StatefulIngestionSourceBase):
  )
  dashboard_snapshot.aspects.append(owners_info)
 
+ superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+ tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+ if tags:
+ dashboard_snapshot.aspects.append(tags)
+
  return dashboard_snapshot
 
  def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
@@ -919,6 +950,12 @@ class SupersetSource(StatefulIngestionSourceBase):
  lastModified=last_modified,
  )
  chart_snapshot.aspects.append(owners_info)
+
+ superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+ tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+ if tags:
+ chart_snapshot.aspects.append(tags)
+
  yield MetadataWorkUnit(
  id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
  )
@@ -1288,17 +1325,18 @@ class SupersetSource(StatefulIngestionSourceBase):
  externalUrl=dataset_url,
  lastModified=TimeStamp(time=modified_ts),
  )
- global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
-
- aspects_items: List[Any] = []
- aspects_items.extend(
- [
- self.gen_schema_metadata(dataset_response),
- dataset_info,
- upstream_lineage,
- global_tags,
- ]
- )
+
+ dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+ tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+ aspects_items: List[Any] = [
+ self.gen_schema_metadata(dataset_response),
+ dataset_info,
+ upstream_lineage,
+ ]
+
+ if tags:
+ aspects_items.append(tags)
 
  dataset_snapshot = DatasetSnapshot(
  urn=datasource_urn,
@@ -1320,6 +1358,75 @@ class SupersetSource(StatefulIngestionSourceBase):
 
  return dataset_snapshot
 
+ def _extract_and_map_tags(
+ self, raw_tags: List[Dict[str, Any]]
+ ) -> Optional[GlobalTagsClass]:
+ """Extract and map Superset tags to DataHub GlobalTagsClass.
+
+ Filters out system-generated tags (type != 1) and only processes user-defined tags
+ from the Superset API response.
+
+ Args:
+ raw_tags: List of tag dictionaries from Superset API
+
+ Returns:
+ GlobalTagsClass with user-defined tags, or None if no tags found
+ """
+ user_tags = [
+ tag.get("name", "")
+ for tag in raw_tags
+ if tag.get("type") == 1 and tag.get("name")
+ ]
+
+ if not user_tags:
+ return None
+
+ tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+ return GlobalTagsClass(
+ tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+ )
+
+ def _merge_tags_with_existing(
+ self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+ ) -> Optional[GlobalTagsClass]:
+ """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+ This method ensures that tags manually added via DataHub UI are not overwritten
+ during ingestion. It fetches existing tags from the graph and merges them with
+ new tags from the source system, avoiding duplicates.
+
+ Args:
+ entity_urn: URN of the entity to check for existing tags
+ new_tags: New tags to add as GlobalTagsClass object
+
+ Returns:
+ GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+ """
+ if not new_tags or not new_tags.tags:
+ return None
+
+ # Fetch existing tags from DataHub
+ existing_global_tags = None
+ if self.ctx.graph:
+ existing_global_tags = self.ctx.graph.get_aspect(
+ entity_urn=entity_urn, aspect_type=GlobalTagsClass
+ )
+
+ # Merge existing tags with new ones, avoiding duplicates
+ all_tags = []
+ existing_tag_urns = set()
+
+ if existing_global_tags and existing_global_tags.tags:
+ all_tags.extend(existing_global_tags.tags)
+ existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+ # Add new tags that don't already exist
+ for new_tag in new_tags.tags:
+ if new_tag.tag not in existing_tag_urns:
+ all_tags.append(new_tag)
+
+ return GlobalTagsClass(tags=all_tags) if all_tags else None
+
  def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
  dataset_name = ""
  try:
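
Note: the merge semantics of the new `_merge_tags_with_existing` helper can be illustrated in isolation. This is a rough sketch, not package code; only `GlobalTagsClass` and `TagAssociationClass` come from the diff above, and the tag URNs are made up:

    from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

    # Tags already on the entity in DataHub (e.g. added manually in the UI).
    existing = GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:pii")])

    # Tags freshly extracted from Superset for the same entity.
    incoming = GlobalTagsClass(
        tags=[
            TagAssociationClass(tag="urn:li:tag:pii"),      # already present -> skipped
            TagAssociationClass(tag="urn:li:tag:finance"),  # new -> appended
        ]
    )

    # Same dedup-by-URN merge the helper performs: existing tags are kept first.
    seen = {t.tag for t in existing.tags}
    merged = GlobalTagsClass(
        tags=existing.tags + [t for t in incoming.tags if t.tag not in seen]
    )
    assert [t.tag for t in merged.tags] == ["urn:li:tag:pii", "urn:li:tag:finance"]
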

datahub/ingestion/source/tableau/tableau.py

@@ -3,6 +3,7 @@ import logging
  import re
  import time
  from collections import OrderedDict, defaultdict
+ from copy import deepcopy
  from dataclasses import dataclass, field as dataclass_field
  from datetime import datetime, timedelta, timezone
  from functools import lru_cache
@@ -474,6 +475,13 @@ class TableauPageSizeConfig(ConfigModel):
  return self.database_table_page_size or self.page_size
 
 
+ _IngestHiddenAssetsOptionsType = Literal["worksheet", "dashboard"]
+ _IngestHiddenAssetsOptions: List[_IngestHiddenAssetsOptionsType] = [
+ "worksheet",
+ "dashboard",
+ ]
+
+
  class TableauConfig(
  DatasetLineageProviderConfigBase,
  StatefulIngestionConfigBase,
@@ -524,6 +532,10 @@ class TableauConfig(
  default=False,
  description="Ingest Owner from source. This will override Owner info entered from UI",
  )
+ use_email_as_username: bool = Field(
+ default=False,
+ description="Use email address instead of username for entity owners. Requires ingest_owner to be True.",
+ )
  ingest_tables_external: bool = Field(
  default=False,
  description="Ingest details for tables external to (not embedded in) tableau as entities.",
@@ -582,13 +594,13 @@ class TableauConfig(
  )
 
  extract_lineage_from_unsupported_custom_sql_queries: bool = Field(
- default=False,
- description="[Experimental] Whether to extract lineage from unsupported custom sql queries using SQL parsing",
+ default=True,
+ description="[Experimental] Extract lineage from Custom SQL queries using DataHub's SQL parser in cases where the Tableau Catalog API fails to return lineage for the query.",
  )
 
  force_extraction_of_lineage_from_custom_sql_queries: bool = Field(
  default=False,
- description="[Experimental] Force extraction of lineage from custom sql queries using SQL parsing, ignoring Tableau metadata",
+ description="[Experimental] Force extraction of lineage from Custom SQL queries using DataHub's SQL parser, even when the Tableau Catalog API returns lineage already.",
  )
 
  sql_parsing_disable_schema_awareness: bool = Field(
@@ -621,8 +633,8 @@ class TableauConfig(
  description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
  )
 
- ingest_hidden_assets: Union[List[Literal["worksheet", "dashboard"]], bool] = Field(
- default=["worksheet", "dashboard"],
+ ingest_hidden_assets: Union[List[_IngestHiddenAssetsOptionsType], bool] = Field(
+ _IngestHiddenAssetsOptions,
  description=(
  "When enabled, hidden worksheets and dashboards are ingested into Datahub."
  " If a dashboard or worksheet is hidden in Tableau the luid is blank."
@@ -644,6 +656,11 @@ class TableauConfig(
  # pre = True because we want to take some decision before pydantic initialize the configuration to default values
  @root_validator(pre=True)
  def projects_backward_compatibility(cls, values: Dict) -> Dict:
+ # In-place update of the input dict would cause state contamination. This was discovered through test failures
+ # in test_hex.py where the same dict is reused.
+ # So a copy is performed first.
+ values = deepcopy(values)
+
  projects = values.get("projects")
  project_pattern = values.get("project_pattern")
  project_path_pattern = values.get("project_path_pattern")
@@ -655,6 +672,7 @@ class TableauConfig(
  values["project_pattern"] = AllowDenyPattern(
  allow=[f"^{prj}$" for prj in projects]
  )
+ values.pop("projects")
  elif (project_pattern or project_path_pattern) and projects:
  raise ValueError(
  "projects is deprecated. Please use project_path_pattern only."
@@ -666,7 +684,7 @@ class TableauConfig(
 
  return values
 
- @root_validator()
+ @root_validator(skip_on_failure=True)
  def validate_config_values(cls, values: Dict) -> Dict:
  tags_for_hidden_assets = values.get("tags_for_hidden_assets")
  ingest_tags = values.get("ingest_tags")
@@ -678,6 +696,14 @@ class TableauConfig(
  raise ValueError(
  "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
  )
+
+ use_email_as_username = values.get("use_email_as_username")
+ ingest_owner = values.get("ingest_owner")
+ if use_email_as_username and not ingest_owner:
+ raise ValueError(
+ "use_email_as_username requires ingest_owner to be enabled."
+ )
+
  return values
 
 
@@ -839,6 +865,9 @@ class TableauSourceReport(
  default_factory=(lambda: defaultdict(int))
  )
 
+ # Owner extraction statistics
+ num_email_fallback_to_username: int = 0
+
 
  def report_user_role(report: TableauSourceReport, server: Server) -> None:
  title: str = "Insufficient Permissions"
@@ -2716,13 +2745,12 @@ class TableauSiteSource:
  dataset_snapshot.aspects.append(browse_paths)
 
  # Ownership
- owner = (
- self._get_ownership(datasource_info[c.OWNER][c.USERNAME])
- if datasource_info
- and datasource_info.get(c.OWNER)
- and datasource_info[c.OWNER].get(c.USERNAME)
+ owner_identifier = (
+ self._get_owner_identifier(datasource_info[c.OWNER])
+ if datasource_info and datasource_info.get(c.OWNER)
  else None
  )
+ owner = self._get_ownership(owner_identifier) if owner_identifier else None
  if owner is not None:
  dataset_snapshot.aspects.append(owner)
 
@@ -3127,7 +3155,7 @@ class TableauSiteSource:
 
  creator: Optional[str] = None
  if workbook is not None and workbook.get(c.OWNER) is not None:
- creator = workbook[c.OWNER].get(c.USERNAME)
+ creator = self._get_owner_identifier(workbook[c.OWNER])
  created_at = sheet.get(c.CREATED_AT, datetime.now())
  updated_at = sheet.get(c.UPDATED_AT, datetime.now())
  last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3276,7 +3304,7 @@ class TableauSiteSource:
 
  def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
  workbook_container_key = self.gen_workbook_key(workbook[c.ID])
- creator = workbook.get(c.OWNER, {}).get(c.USERNAME)
+ creator = self._get_owner_identifier(workbook.get(c.OWNER, {}))
 
  owner_urn = (
  builder.make_user_urn(creator)
@@ -3458,7 +3486,7 @@ class TableauSiteSource:
 
  creator: Optional[str] = None
  if workbook is not None and workbook.get(c.OWNER) is not None:
- creator = workbook[c.OWNER].get(c.USERNAME)
+ creator = self._get_owner_identifier(workbook[c.OWNER])
  created_at = dashboard.get(c.CREATED_AT, datetime.now())
  updated_at = dashboard.get(c.UPDATED_AT, datetime.now())
  last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3605,6 +3633,20 @@ class TableauSiteSource:
  )
  return last_modified
 
+ def _get_owner_identifier(self, owner_dict: dict) -> Optional[str]:
+ """Extract owner identifier (email or username) based on configuration."""
+ if not owner_dict:
+ return None
+
+ if self.config.use_email_as_username:
+ email = owner_dict.get(c.EMAIL)
+ if email:
+ return email
+ # Fall back to username if email is not available
+ self.report.num_email_fallback_to_username += 1
+
+ return owner_dict.get(c.USERNAME)
+
  @lru_cache(maxsize=None)
  def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
  if self.config.ingest_owner and user:
@@ -3828,3 +3870,15 @@ class TableauSiteSource:
  self.report.emit_upstream_tables_timer[self.site_content_url] = (
  timer.elapsed_seconds(digits=2)
  )
+
+ # Log owner extraction statistics if there were fallbacks
+ if (
+ self.config.use_email_as_username
+ and self.config.ingest_owner
+ and self.report.num_email_fallback_to_username > 0
+ ):
+ logger.info(
+ f"Owner extraction summary for site '{self.site_content_url}': "
+ f"{self.report.num_email_fallback_to_username} entities fell back from email to username "
+ f"(email was not available)"
+ )
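
Note: a rough sketch of how the new `use_email_as_username` option might be used from Python. Only `ingest_owner` and `use_email_as_username` are taken from the diff above; the connection fields and values are placeholders and should be checked against the Tableau source documentation:

    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "tableau",
                "config": {
                    "connect_uri": "https://tableau.example.com",  # placeholder
                    "site": "acme",                                # placeholder
                    "token_name": "ingestion-token",               # placeholder
                    "token_value": "${TABLEAU_TOKEN}",             # placeholder
                    "ingest_owner": True,           # required by the validator above
                    "use_email_as_username": True,  # new option in this release
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},     # placeholder
            },
        }
    )
    pipeline.run()
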

datahub/ingestion/source/tableau/tableau_common.py

@@ -65,6 +65,7 @@ workbook_graphql_query = """
  projectName
  owner {
  username
+ email
  }
  description
  uri
@@ -107,6 +108,7 @@ sheet_graphql_query = """
  luid
  owner {
  username
+ email
  }
  }
  datasourceFields {
@@ -185,6 +187,7 @@ dashboard_graphql_query = """
  luid
  owner {
  username
+ email
  }
  }
  }
@@ -268,6 +271,7 @@ embedded_datasource_graphql_query = """
  luid
  owner {
  username
+ email
  }
  }
  }
@@ -424,6 +428,7 @@ published_datasource_graphql_query = """
  }
  owner {
  username
+ email
  }
  description
  uri

datahub/ingestion/source/tableau/tableau_constant.py

@@ -59,6 +59,7 @@ LUID = "luid"
  EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
  OWNER = "owner"
  USERNAME = "username"
+ EMAIL = "email"
  HAS_EXTRACTS = "hasExtracts"
  EXTRACT_LAST_REFRESH_TIME = "extractLastRefreshTime"
  EXTRACT_LAST_INCREMENTAL_UPDATE_TIME = "extractLastIncrementalUpdateTime"

datahub/ingestion/source/tableau/tableau_server_wrapper.py

@@ -1,4 +1,5 @@
  from dataclasses import dataclass
+ from typing import Optional
 
  from tableauserverclient import Server, UserItem
 
@@ -10,6 +11,7 @@ class UserInfo:
  user_name: str
  site_role: str
  site_id: str
+ email: Optional[str] = None
 
  def has_site_administrator_explorer_privileges(self):
  return self.site_role in [
@@ -34,4 +36,5 @@ class UserInfo:
  user_name=user.name,
  site_role=user.site_role,
  site_id=server.site_id,
+ email=user.email,
  )

datahub/ingestion/source/unity/config.py

@@ -8,7 +8,12 @@ import pydantic
  from pydantic import Field
  from typing_extensions import Literal
 
- from datahub.configuration.common import AllowDenyPattern, ConfigEnum, ConfigModel
+ from datahub.configuration.common import (
+ AllowDenyPattern,
+ ConfigEnum,
+ ConfigModel,
+ HiddenFromDocs,
+ )
  from datahub.configuration.source_common import (
  DatasetSourceConfigMixin,
  LowerCaseDatasetUrnConfigMixin,
@@ -285,10 +290,9 @@ class UnityCatalogSourceConfig(
  description="Limit the number of columns to get column level lineage. ",
  )
 
- lineage_max_workers: int = pydantic.Field(
+ lineage_max_workers: HiddenFromDocs[int] = pydantic.Field(
  default=5 * (os.cpu_count() or 4),
  description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.",
- hidden_from_docs=True,
  )
 
  databricks_api_page_size: int = pydantic.Field(

datahub/ingestion/source/usage/usage_common.py

@@ -18,7 +18,7 @@ import pydantic
  from pydantic.fields import Field
 
  import datahub.emitter.mce_builder as builder
- from datahub.configuration.common import AllowDenyPattern
+ from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
  from datahub.configuration.time_window_config import (
  BaseTimeWindowConfig,
  BucketDuration,
@@ -194,13 +194,13 @@ class GenericAggregatedDataset(Generic[ResourceType]):
 
 
  class BaseUsageConfig(BaseTimeWindowConfig):
- queries_character_limit: int = Field(
+ queries_character_limit: HiddenFromDocs[int] = Field(
+ # Hidden since we don't want to encourage people to break elasticsearch.
  default=DEFAULT_QUERIES_CHARACTER_LIMIT,
  description=(
  "Total character limit for all queries in a single usage aspect."
  " Queries will be truncated to length `queries_character_limit / top_n_queries`."
  ),
- hidden_from_docs=True, # Don't want to encourage people to break elasticsearch
  )
 
  top_n_queries: pydantic.PositiveInt = Field(

datahub/ingestion/source_config/pulsar.py

@@ -2,6 +2,7 @@ import re
  from typing import Dict, List, Optional, Union
  from urllib.parse import urlparse
 
+ import pydantic
  from pydantic import Field, validator
 
  from datahub.configuration.common import AllowDenyPattern
@@ -121,7 +122,8 @@ class PulsarSourceConfig(
  )
  return client_secret
 
- @validator("web_service_url")
+ @pydantic.field_validator("web_service_url", mode="after")
+ @classmethod
  def web_service_url_scheme_host_port(cls, val: str) -> str:
  # Tokenize the web url
  url = urlparse(val)
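
Note: the change above moves the validator to pydantic v2 style. A minimal standalone sketch of the same pattern (assuming pydantic v2 is installed; the model and validation logic here are hypothetical, not the package's own):

    from urllib.parse import urlparse

    import pydantic


    class ExampleConfig(pydantic.BaseModel):
        web_service_url: str

        # field_validator replaces the legacy @validator decorator, and the
        # validator must now be declared as a classmethod.
        @pydantic.field_validator("web_service_url", mode="after")
        @classmethod
        def web_service_url_scheme_host_port(cls, val: str) -> str:
            url = urlparse(val)
            if url.scheme not in ("http", "https") or not url.netloc:
                raise ValueError(f"Not a valid web service URL: {val}")
            return val


    print(ExampleConfig(web_service_url="http://localhost:8080").web_service_url)
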

datahub/ingestion/transformer/set_browse_path.py (new file)

@@ -0,0 +1,112 @@
+ import re
+ from collections import defaultdict
+ from typing import Dict, List, Optional, cast
+
+ from datahub.configuration.common import (
+ TransformerSemanticsConfigModel,
+ )
+ from datahub.emitter.mce_builder import Aspect
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.transformer.base_transformer import (
+ BaseTransformer,
+ SingleAspectTransformer,
+ )
+ from datahub.metadata.schema_classes import (
+ BrowsePathEntryClass,
+ BrowsePathsV2Class,
+ )
+ from datahub.utilities.urns.urn import guess_entity_type
+
+
+ class SetBrowsePathTransformerConfig(TransformerSemanticsConfigModel):
+ path: List[str]
+
+
+ class SetBrowsePathTransformer(BaseTransformer, SingleAspectTransformer):
+ ctx: PipelineContext
+ config: SetBrowsePathTransformerConfig
+
+ def __init__(self, config: SetBrowsePathTransformerConfig, ctx: PipelineContext):
+ super().__init__()
+ self.ctx = ctx
+ self.config = config
+
+ def aspect_name(self) -> str:
+ return "browsePathsV2"
+
+ def entity_types(self) -> List[str]:
+ # This is an arbitrary list, might be adjusted if it makes sense. It might be reasonable to make it configurable
+ return ["dataset", "dataJob", "dataFlow", "chart", "dashboard", "container"]
+
+ @classmethod
+ def create(
+ cls, config_dict: dict, ctx: PipelineContext
+ ) -> "SetBrowsePathTransformer":
+ config = SetBrowsePathTransformerConfig.parse_obj(config_dict)
+ return cls(config, ctx)
+
+ @staticmethod
+ def _build_model(existing_browse_paths: BrowsePathsV2Class) -> Dict[str, List[str]]:
+ template_vars: Dict[str, List[str]] = {}
+ model: Dict[str, List[str]] = defaultdict(list)
+ for entry in existing_browse_paths.path or []:
+ if entry.urn:
+ entity_type = guess_entity_type(entry.urn)
+ model[entity_type].append(entry.urn)
+
+ for entity_type, urns in model.items():
+ template_vars[f"{entity_type}[*]"] = urns
+ for i, urn in enumerate(urns):
+ template_vars[f"{entity_type}[{i}]"] = [urn]
+
+ return template_vars
+
+ @classmethod
+ def _expand_nodes(
+ cls, templates: List[str], template_vars: Dict[str, List[str]]
+ ) -> BrowsePathsV2Class:
+ expanded_nodes: List[str] = []
+ for node in templates:
+ resolved_nodes = cls._resolve_template_to_nodes(node, template_vars)
+ expanded_nodes.extend(resolved_nodes)
+
+ processed_entries: List[BrowsePathEntryClass] = []
+ for node in expanded_nodes:
+ if not node or node.isspace():
+ continue
+ processed_entries.append(
+ BrowsePathEntryClass(
+ id=node, urn=node if node.startswith("urn:") else None
+ )
+ )
+ return BrowsePathsV2Class(path=processed_entries)
+
+ def transform_aspect(
+ self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
+ ) -> Optional[Aspect]:
+ template_vars: Dict[str, List[str]] = {}
+ if aspect is not None:
+ assert isinstance(aspect, BrowsePathsV2Class)
+ template_vars = self._build_model(aspect)
+ new_browse_paths: BrowsePathsV2Class = self._expand_nodes(
+ self.config.path, template_vars
+ )
+ if aspect is not None and not self.config.replace_existing:
+ for node in aspect.path:
+ new_browse_paths.path.append(node)
+
+ return cast(Aspect, new_browse_paths)
+
+ @staticmethod
+ def _resolve_template_to_nodes(
+ template_str: str, template_vars: Dict[str, List[str]]
+ ) -> List[str]:
+ # This mechanism can be made simpler (match against known variables only) or more complex (e.g. by using a
+ # proper templating engine, like jinja).
+ template_str = template_str.strip()
+ var_pattern = re.findall(r"^\$([a-zA-Z]+\[[0-9*]+]$)", template_str)
+
+ if not var_pattern:
+ return [template_str]
+
+ return template_vars.get(var_pattern[0], [])
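
Note: the new transformer resolves `$<entityType>[<index or *>]` templates against the entity's existing browsePathsV2 entries. A rough, hypothetical invocation (the recipe name under which the transformer is registered is not shown in this diff, so the classes are used directly; the URNs are made up):

    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.transformer.set_browse_path import (
        SetBrowsePathTransformer,
        SetBrowsePathTransformerConfig,
    )
    from datahub.metadata.schema_classes import BrowsePathEntryClass, BrowsePathsV2Class

    # Prepend a fixed folder, then keep the entity's first container entry.
    config = SetBrowsePathTransformerConfig(
        path=["Curated", "$container[0]"],
        replace_existing=True,
    )
    transformer = SetBrowsePathTransformer(config, PipelineContext(run_id="demo"))

    existing = BrowsePathsV2Class(
        path=[BrowsePathEntryClass(id="urn:li:container:abc", urn="urn:li:container:abc")]
    )
    new_aspect = transformer.transform_aspect(
        entity_urn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
        aspect_name="browsePathsV2",
        aspect=existing,  # type: ignore[arg-type]
    )
    # Resulting path ids: ["Curated", "urn:li:container:abc"]
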