acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/autogenerated/capability_summary.json +45 -1
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/dbt/dbt_common.py +65 -5
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +19 -2
- datahub/ingestion/source/ge_data_profiler.py +15 -2
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +76 -75
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_source.py +493 -547
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +46 -88
- datahub/ingestion/source/metabase.py +9 -2
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/s3/source.py +26 -24
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +122 -15
- datahub/ingestion/source/tableau/tableau.py +68 -14
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/metadata/_internal_schema_classes.py +728 -528
- datahub/metadata/_urns/urn_defs.py +1702 -1702
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/schema.avsc +17434 -17732
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/LogicalParent.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/_shared.py +126 -0
- datahub/sdk/chart.py +87 -30
- datahub/sdk/dashboard.py +79 -34
- datahub/sdk/entity_client.py +11 -4
- datahub/sdk/lineage_client.py +3 -3
- datahub/sdk/search_filters.py +1 -7
- datahub/sql_parsing/split_statements.py +13 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/superset.py
+++ b/datahub/ingestion/source/superset.py
@@ -9,9 +9,10 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 import dateutil.parser as dp
 import requests
 import sqlglot
-from pydantic import BaseModel
-from pydantic.class_validators import root_validator, validator
+from pydantic import BaseModel, root_validator, validator
 from pydantic.fields import Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
@@ -109,6 +110,12 @@ logger = logging.getLogger(__name__)
 
 PAGE_SIZE = 25
 
+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+
 
 chart_type_from_viz_type = {
     "line": ChartTypeClass.LINE,
@@ -282,6 +289,7 @@ def get_filter_name(filter_obj):
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+@capability(SourceCapability.TAGS, "Supported by default")
 class SupersetSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -327,6 +335,19 @@ class SupersetSource(StatefulIngestionSourceBase):
        logger.debug("Got access token from superset")

        requests_session = requests.Session()
+
+       # Configure retry strategy for transient failures
+       retry_strategy = Retry(
+           total=RETRY_MAX_TIMES,
+           status_forcelist=RETRY_STATUS_CODES,
+           backoff_factor=RETRY_BACKOFF_FACTOR,
+           allowed_methods=RETRY_ALLOWED_METHODS,
+           raise_on_status=False,
+       )
+       adapter = HTTPAdapter(max_retries=retry_strategy)
+       requests_session.mount("http://", adapter)
+       requests_session.mount("https://", adapter)
+
        requests_session.headers.update(
            {
                "Authorization": f"Bearer {self.access_token}",
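
The hunks above give the Superset source a shared retry policy: urllib3's Retry is mounted on the ingestion session so GET calls that fail with 429 or common 5xx responses are retried up to three times with exponential backoff instead of failing the run on the first transient error. As a rough standalone sketch of the same pattern (the endpoint URL below is a placeholder, not part of the source):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # Same policy as the diff: up to 3 retries of GETs on transient statuses,
    # exponential backoff, and no exception raised once retries are exhausted.
    retry_strategy = Retry(
        total=3,
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=1,
        allowed_methods=["GET"],
        raise_on_status=False,
    )

    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Placeholder endpoint for illustration only.
    response = session.get("https://superset.example.com/api/v1/dashboard/", timeout=10)
    print(response.status_code)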
@@ -359,8 +380,13 @@ class SupersetSource(StatefulIngestionSourceBase):
        )

        if response.status_code != 200:
-
-
+           self.report.warning(
+               title="Failed to fetch data from Superset API",
+               message="Incomplete metadata extraction due to Superset API failure",
+               context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+           )
+           # we stop pagination for this entity type and we continue the overall ingestion
+           break

        payload = response.json()
        # Update total_items with the actual count from the response
@@ -521,6 +547,11 @@ class SupersetSource(StatefulIngestionSourceBase):
        )
        dashboard_snapshot.aspects.append(owners_info)

+       superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+       tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+       if tags:
+           dashboard_snapshot.aspects.append(tags)
+
        return dashboard_snapshot

    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
@@ -919,6 +950,12 @@ class SupersetSource(StatefulIngestionSourceBase):
            lastModified=last_modified,
        )
        chart_snapshot.aspects.append(owners_info)
+
+       superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+       tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+       if tags:
+           chart_snapshot.aspects.append(tags)
+
        yield MetadataWorkUnit(
            id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
        )
@@ -1288,17 +1325,18 @@ class SupersetSource(StatefulIngestionSourceBase):
            externalUrl=dataset_url,
            lastModified=TimeStamp(time=modified_ts),
        )
-
-
-
-
-
-
-
-
-
-
-
+
+       dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+       tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+       aspects_items: List[Any] = [
+           self.gen_schema_metadata(dataset_response),
+           dataset_info,
+           upstream_lineage,
+       ]
+
+       if tags:
+           aspects_items.append(tags)

        dataset_snapshot = DatasetSnapshot(
            urn=datasource_urn,
@@ -1320,6 +1358,75 @@ class SupersetSource(StatefulIngestionSourceBase):

        return dataset_snapshot

+   def _extract_and_map_tags(
+       self, raw_tags: List[Dict[str, Any]]
+   ) -> Optional[GlobalTagsClass]:
+       """Extract and map Superset tags to DataHub GlobalTagsClass.
+
+       Filters out system-generated tags (type != 1) and only processes user-defined tags
+       from the Superset API response.
+
+       Args:
+           raw_tags: List of tag dictionaries from Superset API
+
+       Returns:
+           GlobalTagsClass with user-defined tags, or None if no tags found
+       """
+       user_tags = [
+           tag.get("name", "")
+           for tag in raw_tags
+           if tag.get("type") == 1 and tag.get("name")
+       ]
+
+       if not user_tags:
+           return None
+
+       tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+       return GlobalTagsClass(
+           tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+       )
+
+   def _merge_tags_with_existing(
+       self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+   ) -> Optional[GlobalTagsClass]:
+       """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+       This method ensures that tags manually added via DataHub UI are not overwritten
+       during ingestion. It fetches existing tags from the graph and merges them with
+       new tags from the source system, avoiding duplicates.
+
+       Args:
+           entity_urn: URN of the entity to check for existing tags
+           new_tags: New tags to add as GlobalTagsClass object
+
+       Returns:
+           GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+       """
+       if not new_tags or not new_tags.tags:
+           return None
+
+       # Fetch existing tags from DataHub
+       existing_global_tags = None
+       if self.ctx.graph:
+           existing_global_tags = self.ctx.graph.get_aspect(
+               entity_urn=entity_urn, aspect_type=GlobalTagsClass
+           )
+
+       # Merge existing tags with new ones, avoiding duplicates
+       all_tags = []
+       existing_tag_urns = set()
+
+       if existing_global_tags and existing_global_tags.tags:
+           all_tags.extend(existing_global_tags.tags)
+           existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+       # Add new tags that don't already exist
+       for new_tag in new_tags.tags:
+           if new_tag.tag not in existing_tag_urns:
+               all_tags.append(new_tag)
+
+       return GlobalTagsClass(tags=all_tags) if all_tags else None
+
    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
        dataset_name = ""
        try:
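
The two new helpers above implement the source's TAGS capability: `_extract_and_map_tags` keeps only user-defined Superset tags (type == 1), and `_merge_tags_with_existing` folds them into whatever tags the entity already carries in DataHub so UI-added tags survive re-ingestion. A rough, self-contained sketch of that behaviour (the sample tag payloads and pre-existing tag URNs below are made up for illustration):

    # Sample payloads shaped like the "tags" field of Superset API responses.
    raw_tags = [
        {"name": "finance", "type": 1},       # user-defined tag -> kept
        {"name": "owner:admin", "type": 3},   # system-generated tag -> skipped
        {"name": "certified", "type": 1},
    ]

    user_tags = [t["name"] for t in raw_tags if t.get("type") == 1 and t.get("name")]
    new_tag_urns = [f"urn:li:tag:{name}" for name in user_tags]

    # Merge step: tags already on the entity in DataHub are preserved, and only
    # source tags that are not present yet are appended.
    existing_tag_urns = ["urn:li:tag:certified", "urn:li:tag:pii"]
    merged = existing_tag_urns + [u for u in new_tag_urns if u not in existing_tag_urns]

    print(merged)
    # ['urn:li:tag:certified', 'urn:li:tag:pii', 'urn:li:tag:finance']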
--- a/datahub/ingestion/source/tableau/tableau.py
+++ b/datahub/ingestion/source/tableau/tableau.py
@@ -3,6 +3,7 @@ import logging
 import re
 import time
 from collections import OrderedDict, defaultdict
+from copy import deepcopy
 from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime, timedelta, timezone
 from functools import lru_cache
@@ -474,6 +475,13 @@ class TableauPageSizeConfig(ConfigModel):
        return self.database_table_page_size or self.page_size


+_IngestHiddenAssetsOptionsType = Literal["worksheet", "dashboard"]
+_IngestHiddenAssetsOptions: List[_IngestHiddenAssetsOptionsType] = [
+    "worksheet",
+    "dashboard",
+]
+
+
 class TableauConfig(
     DatasetLineageProviderConfigBase,
     StatefulIngestionConfigBase,
@@ -524,6 +532,10 @@ class TableauConfig(
        default=False,
        description="Ingest Owner from source. This will override Owner info entered from UI",
    )
+   use_email_as_username: bool = Field(
+       default=False,
+       description="Use email address instead of username for entity owners. Requires ingest_owner to be True.",
+   )
    ingest_tables_external: bool = Field(
        default=False,
        description="Ingest details for tables external to (not embedded in) tableau as entities.",
@@ -582,13 +594,13 @@
    )

    extract_lineage_from_unsupported_custom_sql_queries: bool = Field(
-       default=
-       description="[Experimental]
+       default=True,
+       description="[Experimental] Extract lineage from Custom SQL queries using DataHub's SQL parser in cases where the Tableau Catalog API fails to return lineage for the query.",
    )

    force_extraction_of_lineage_from_custom_sql_queries: bool = Field(
        default=False,
-       description="[Experimental] Force extraction of lineage from
+       description="[Experimental] Force extraction of lineage from Custom SQL queries using DataHub's SQL parser, even when the Tableau Catalog API returns lineage already.",
    )

    sql_parsing_disable_schema_awareness: bool = Field(
@@ -621,8 +633,8 @@
        description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
    )

-   ingest_hidden_assets: Union[List[
-
+   ingest_hidden_assets: Union[List[_IngestHiddenAssetsOptionsType], bool] = Field(
+       _IngestHiddenAssetsOptions,
        description=(
            "When enabled, hidden worksheets and dashboards are ingested into Datahub."
            " If a dashboard or worksheet is hidden in Tableau the luid is blank."
@@ -644,6 +656,11 @@
    # pre = True because we want to take some decision before pydantic initialize the configuration to default values
    @root_validator(pre=True)
    def projects_backward_compatibility(cls, values: Dict) -> Dict:
+       # In-place update of the input dict would cause state contamination. This was discovered through test failures
+       # in test_hex.py where the same dict is reused.
+       # So a copy is performed first.
+       values = deepcopy(values)
+
        projects = values.get("projects")
        project_pattern = values.get("project_pattern")
        project_path_pattern = values.get("project_path_pattern")
@@ -655,6 +672,7 @@
            values["project_pattern"] = AllowDenyPattern(
                allow=[f"^{prj}$" for prj in projects]
            )
+           values.pop("projects")
        elif (project_pattern or project_path_pattern) and projects:
            raise ValueError(
                "projects is deprecated. Please use project_path_pattern only."
@@ -666,7 +684,7 @@

        return values

-   @root_validator()
+   @root_validator(skip_on_failure=True)
    def validate_config_values(cls, values: Dict) -> Dict:
        tags_for_hidden_assets = values.get("tags_for_hidden_assets")
        ingest_tags = values.get("ingest_tags")
@@ -678,6 +696,14 @@
            raise ValueError(
                "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
            )
+
+       use_email_as_username = values.get("use_email_as_username")
+       ingest_owner = values.get("ingest_owner")
+       if use_email_as_username and not ingest_owner:
+           raise ValueError(
+               "use_email_as_username requires ingest_owner to be enabled."
+           )
+
        return values


@@ -839,6 +865,9 @@ class TableauSourceReport(
        default_factory=(lambda: defaultdict(int))
    )

+   # Owner extraction statistics
+   num_email_fallback_to_username: int = 0
+

def report_user_role(report: TableauSourceReport, server: Server) -> None:
    title: str = "Insufficient Permissions"
@@ -2716,13 +2745,12 @@ class TableauSiteSource:
        dataset_snapshot.aspects.append(browse_paths)

        # Ownership
-
-       self.
-       if datasource_info
-       and datasource_info.get(c.OWNER)
-       and datasource_info[c.OWNER].get(c.USERNAME)
+       owner_identifier = (
+           self._get_owner_identifier(datasource_info[c.OWNER])
+           if datasource_info and datasource_info.get(c.OWNER)
            else None
        )
+       owner = self._get_ownership(owner_identifier) if owner_identifier else None
        if owner is not None:
            dataset_snapshot.aspects.append(owner)

@@ -3127,7 +3155,7 @@

        creator: Optional[str] = None
        if workbook is not None and workbook.get(c.OWNER) is not None:
-           creator = workbook[c.OWNER]
+           creator = self._get_owner_identifier(workbook[c.OWNER])
        created_at = sheet.get(c.CREATED_AT, datetime.now())
        updated_at = sheet.get(c.UPDATED_AT, datetime.now())
        last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3276,7 +3304,7 @@

    def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
        workbook_container_key = self.gen_workbook_key(workbook[c.ID])
-       creator = workbook.get(c.OWNER, {})
+       creator = self._get_owner_identifier(workbook.get(c.OWNER, {}))

        owner_urn = (
            builder.make_user_urn(creator)
@@ -3458,7 +3486,7 @@

        creator: Optional[str] = None
        if workbook is not None and workbook.get(c.OWNER) is not None:
-           creator = workbook[c.OWNER]
+           creator = self._get_owner_identifier(workbook[c.OWNER])
        created_at = dashboard.get(c.CREATED_AT, datetime.now())
        updated_at = dashboard.get(c.UPDATED_AT, datetime.now())
        last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3605,6 +3633,20 @@
        )
        return last_modified

+   def _get_owner_identifier(self, owner_dict: dict) -> Optional[str]:
+       """Extract owner identifier (email or username) based on configuration."""
+       if not owner_dict:
+           return None
+
+       if self.config.use_email_as_username:
+           email = owner_dict.get(c.EMAIL)
+           if email:
+               return email
+           # Fall back to username if email is not available
+           self.report.num_email_fallback_to_username += 1
+
+       return owner_dict.get(c.USERNAME)
+
    @lru_cache(maxsize=None)
    def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
        if self.config.ingest_owner and user:
@@ -3828,3 +3870,15 @@
        self.report.emit_upstream_tables_timer[self.site_content_url] = (
            timer.elapsed_seconds(digits=2)
        )
+
+       # Log owner extraction statistics if there were fallbacks
+       if (
+           self.config.use_email_as_username
+           and self.config.ingest_owner
+           and self.report.num_email_fallback_to_username > 0
+       ):
+           logger.info(
+               f"Owner extraction summary for site '{self.site_content_url}': "
+               f"{self.report.num_email_fallback_to_username} entities fell back from email to username "
+               f"(email was not available)"
+           )
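
The Tableau changes above add an opt-in `use_email_as_username` switch: with it enabled (and `ingest_owner` on), owners are attributed by the email returned by the Metadata API, falling back to the username, and counting the fallback in the report, when no email is present. A small sketch of that resolution order, separate from the source, with made-up owner payloads:

    from typing import Optional

    def get_owner_identifier(owner: dict, use_email_as_username: bool) -> Optional[str]:
        # Mirrors the fallback order in the diff: prefer email when configured
        # and present, otherwise fall back to the username.
        if not owner:
            return None
        if use_email_as_username:
            email = owner.get("email")
            if email:
                return email
            # a missing email is what num_email_fallback_to_username counts
        return owner.get("username")

    print(get_owner_identifier({"username": "jdoe", "email": "jdoe@example.com"}, True))   # jdoe@example.com
    print(get_owner_identifier({"username": "jdoe"}, True))                                # jdoe
    print(get_owner_identifier({"username": "jdoe", "email": "jdoe@example.com"}, False))  # jdoe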
--- a/datahub/ingestion/source/tableau/tableau_common.py
+++ b/datahub/ingestion/source/tableau/tableau_common.py
@@ -65,6 +65,7 @@ workbook_graphql_query = """
    projectName
    owner {
      username
+     email
    }
    description
    uri
@@ -107,6 +108,7 @@ sheet_graphql_query = """
        luid
        owner {
          username
+         email
        }
      }
      datasourceFields {
@@ -185,6 +187,7 @@ dashboard_graphql_query = """
        luid
        owner {
          username
+         email
        }
      }
    }
@@ -268,6 +271,7 @@ embedded_datasource_graphql_query = """
        luid
        owner {
          username
+         email
        }
      }
    }
@@ -424,6 +428,7 @@ published_datasource_graphql_query = """
    }
    owner {
      username
+     email
    }
    description
    uri
--- a/datahub/ingestion/source/tableau/tableau_constant.py
+++ b/datahub/ingestion/source/tableau/tableau_constant.py
@@ -59,6 +59,7 @@ LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
 OWNER = "owner"
 USERNAME = "username"
+EMAIL = "email"
 HAS_EXTRACTS = "hasExtracts"
 EXTRACT_LAST_REFRESH_TIME = "extractLastRefreshTime"
 EXTRACT_LAST_INCREMENTAL_UPDATE_TIME = "extractLastIncrementalUpdateTime"
--- a/datahub/ingestion/source/tableau/tableau_server_wrapper.py
+++ b/datahub/ingestion/source/tableau/tableau_server_wrapper.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from typing import Optional
 
 from tableauserverclient import Server, UserItem
 
@@ -10,6 +11,7 @@ class UserInfo:
    user_name: str
    site_role: str
    site_id: str
+   email: Optional[str] = None

    def has_site_administrator_explorer_privileges(self):
        return self.site_role in [
@@ -34,4 +36,5 @@
            user_name=user.name,
            site_role=user.site_role,
            site_id=server.site_id,
+           email=user.email,
        )
--- a/datahub/ingestion/source/unity/config.py
+++ b/datahub/ingestion/source/unity/config.py
@@ -8,7 +8,12 @@ import pydantic
 from pydantic import Field
 from typing_extensions import Literal
 
-from datahub.configuration.common import
+from datahub.configuration.common import (
+    AllowDenyPattern,
+    ConfigEnum,
+    ConfigModel,
+    HiddenFromDocs,
+)
 from datahub.configuration.source_common import (
     DatasetSourceConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -285,10 +290,9 @@ class UnityCatalogSourceConfig(
        description="Limit the number of columns to get column level lineage. ",
    )

-   lineage_max_workers: int = pydantic.Field(
+   lineage_max_workers: HiddenFromDocs[int] = pydantic.Field(
        default=5 * (os.cpu_count() or 4),
        description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.",
-       hidden_from_docs=True,
    )

    databricks_api_page_size: int = pydantic.Field(
--- a/datahub/ingestion/source/usage/usage_common.py
+++ b/datahub/ingestion/source/usage/usage_common.py
@@ -18,7 +18,7 @@ import pydantic
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     BucketDuration,
@@ -194,13 +194,13 @@ class GenericAggregatedDataset(Generic[ResourceType]):


class BaseUsageConfig(BaseTimeWindowConfig):
-   queries_character_limit: int = Field(
+   queries_character_limit: HiddenFromDocs[int] = Field(
+       # Hidden since we don't want to encourage people to break elasticsearch.
        default=DEFAULT_QUERIES_CHARACTER_LIMIT,
        description=(
            "Total character limit for all queries in a single usage aspect."
            " Queries will be truncated to length `queries_character_limit / top_n_queries`."
        ),
-       hidden_from_docs=True,  # Don't want to encourage people to break elasticsearch
    )

    top_n_queries: pydantic.PositiveInt = Field(
--- a/datahub/ingestion/source_config/pulsar.py
+++ b/datahub/ingestion/source_config/pulsar.py
@@ -2,6 +2,7 @@ import re
 from typing import Dict, List, Optional, Union
 from urllib.parse import urlparse
 
+import pydantic
 from pydantic import Field, validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -121,7 +122,8 @@ class PulsarSourceConfig(
        )
        return client_secret

-   @
+   @pydantic.field_validator("web_service_url", mode="after")
+   @classmethod
    def web_service_url_scheme_host_port(cls, val: str) -> str:
        # Tokenize the web url
        url = urlparse(val)
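
The Pulsar change swaps the pydantic v1 `@validator`-style decorator for `pydantic.field_validator` with `mode="after"` and an explicit `@classmethod`, the pydantic v2 validator shape. A minimal standalone sketch of that shape (the model and field below are hypothetical, not the Pulsar config):

    import pydantic

    class ServiceConfig(pydantic.BaseModel):
        web_service_url: str

        @pydantic.field_validator("web_service_url", mode="after")
        @classmethod
        def strip_trailing_slash(cls, val: str) -> str:
            # Runs after standard field validation and returns the adjusted value.
            return val.rstrip("/")

    print(ServiceConfig(web_service_url="http://localhost:8080/").web_service_url)
    # http://localhost:8080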
--- /dev/null
+++ b/datahub/ingestion/transformer/set_browse_path.py
@@ -0,0 +1,112 @@
+import re
+from collections import defaultdict
+from typing import Dict, List, Optional, cast
+
+from datahub.configuration.common import (
+    TransformerSemanticsConfigModel,
+)
+from datahub.emitter.mce_builder import Aspect
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.transformer.base_transformer import (
+    BaseTransformer,
+    SingleAspectTransformer,
+)
+from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
+)
+from datahub.utilities.urns.urn import guess_entity_type
+
+
+class SetBrowsePathTransformerConfig(TransformerSemanticsConfigModel):
+    path: List[str]
+
+
+class SetBrowsePathTransformer(BaseTransformer, SingleAspectTransformer):
+    ctx: PipelineContext
+    config: SetBrowsePathTransformerConfig
+
+    def __init__(self, config: SetBrowsePathTransformerConfig, ctx: PipelineContext):
+        super().__init__()
+        self.ctx = ctx
+        self.config = config
+
+    def aspect_name(self) -> str:
+        return "browsePathsV2"
+
+    def entity_types(self) -> List[str]:
+        # This is an arbitrary list, might be adjusted if it makes sense. It might be reasonable to make it configurable
+        return ["dataset", "dataJob", "dataFlow", "chart", "dashboard", "container"]
+
+    @classmethod
+    def create(
+        cls, config_dict: dict, ctx: PipelineContext
+    ) -> "SetBrowsePathTransformer":
+        config = SetBrowsePathTransformerConfig.parse_obj(config_dict)
+        return cls(config, ctx)
+
+    @staticmethod
+    def _build_model(existing_browse_paths: BrowsePathsV2Class) -> Dict[str, List[str]]:
+        template_vars: Dict[str, List[str]] = {}
+        model: Dict[str, List[str]] = defaultdict(list)
+        for entry in existing_browse_paths.path or []:
+            if entry.urn:
+                entity_type = guess_entity_type(entry.urn)
+                model[entity_type].append(entry.urn)
+
+        for entity_type, urns in model.items():
+            template_vars[f"{entity_type}[*]"] = urns
+            for i, urn in enumerate(urns):
+                template_vars[f"{entity_type}[{i}]"] = [urn]
+
+        return template_vars
+
+    @classmethod
+    def _expand_nodes(
+        cls, templates: List[str], template_vars: Dict[str, List[str]]
+    ) -> BrowsePathsV2Class:
+        expanded_nodes: List[str] = []
+        for node in templates:
+            resolved_nodes = cls._resolve_template_to_nodes(node, template_vars)
+            expanded_nodes.extend(resolved_nodes)
+
+        processed_entries: List[BrowsePathEntryClass] = []
+        for node in expanded_nodes:
+            if not node or node.isspace():
+                continue
+            processed_entries.append(
+                BrowsePathEntryClass(
+                    id=node, urn=node if node.startswith("urn:") else None
+                )
+            )
+        return BrowsePathsV2Class(path=processed_entries)
+
+    def transform_aspect(
+        self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
+    ) -> Optional[Aspect]:
+        template_vars: Dict[str, List[str]] = {}
+        if aspect is not None:
+            assert isinstance(aspect, BrowsePathsV2Class)
+            template_vars = self._build_model(aspect)
+        new_browse_paths: BrowsePathsV2Class = self._expand_nodes(
+            self.config.path, template_vars
+        )
+        if aspect is not None and not self.config.replace_existing:
+            for node in aspect.path:
+                new_browse_paths.path.append(node)
+
+        return cast(Aspect, new_browse_paths)
+
+    @staticmethod
+    def _resolve_template_to_nodes(
+        template_str: str, template_vars: Dict[str, List[str]]
+    ) -> List[str]:
+        # This mechanism can be made simpler (match against known variables only) or more complex (e.g. by using a
+        # proper templating engine, like jinja).
+        template_str = template_str.strip()
+        var_pattern = re.findall(r"^\$([a-zA-Z]+\[[0-9*]+]$)", template_str)
+
+        if not var_pattern:
+            return [template_str]
+
+        return template_vars.get(var_pattern[0], [])