acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/tableau/tableau.py

@@ -2,9 +2,9 @@ import json
 import logging
 import re
 import time
-from collections import OrderedDict
-from dataclasses import dataclass
-from datetime import datetime
+from collections import OrderedDict, defaultdict
+from dataclasses import dataclass, field as dataclass_field
+from datetime import datetime, timedelta, timezone
 from functools import lru_cache
 from typing import (
     Any,

@@ -35,7 +35,10 @@ from tableauserverclient import (
     SiteItem,
     TableauAuth,
 )
-from tableauserverclient.server.endpoint.exceptions import
+from tableauserverclient.server.endpoint.exceptions import (
+    InternalServerError,
+    NonXMLResponseError,
+)
 from urllib3 import Retry

 import datahub.emitter.mce_builder as builder

@@ -49,6 +52,7 @@ from datahub.configuration.source_common import (
     DatasetSourceConfigMixin,
 )
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,

@@ -105,6 +109,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
     make_filter,
     make_fine_grained_lineage_class,
     make_upstream_class,
+    optimize_query_filter,
     published_datasource_graphql_query,
     query_metadata_cursor_based_pagination,
     sheet_graphql_query,

@@ -182,6 +187,20 @@ try:
 except ImportError:
     REAUTHENTICATE_ERRORS = (NonXMLResponseError,)

+RETRIABLE_ERROR_CODES = [
+    408,  # Request Timeout
+    429,  # Too Many Requests
+    500,  # Internal Server Error
+    502,  # Bad Gateway
+    503,  # Service Unavailable
+    504,  # Gateway Timeout
+]
+
+# From experience, this expiry time typically ranges from 50 minutes
+# to 2 hours but might as well be configurable. We will allow upto
+# 10 minutes of such expiry time
+REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)
+
 logger: logging.Logger = logging.getLogger(__name__)

 # Replace / with |

@@ -283,7 +302,7 @@ class TableauConnectionConfig(ConfigModel):
             max_retries=Retry(
                 total=self.max_retries,
                 backoff_factor=1,
-                status_forcelist=
+                status_forcelist=RETRIABLE_ERROR_CODES,
             )
         )
         server._session.mount("http://", adapter)

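Note on the retry setup above: urllib3's Retry with a status_forcelist retries any response whose HTTP status appears in the list, with exponential backoff between attempts. A minimal standalone sketch of the same pattern (illustrative only, not part of the diff; the session object and retry budget below are assumptions):

import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry

RETRIABLE_ERROR_CODES = [408, 429, 500, 502, 503, 504]

session = requests.Session()
adapter = HTTPAdapter(
    max_retries=Retry(
        total=5,  # overall retry budget across all failure kinds
        backoff_factor=1,  # exponential backoff between attempts
        status_forcelist=RETRIABLE_ERROR_CODES,  # retry only on these statuses
    )
)
# Mirrors server._session.mount("http://", adapter) in the hunk above.
session.mount("http://", adapter)
session.mount("https://", adapter)
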
@@ -380,11 +399,6 @@ class TableauConfig(
         description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
     )

-    fetch_size: int = Field(
-        default=250,
-        description="Specifies the number of records to retrieve in each batch during a query execution.",
-    )
-
     # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
     # returns warnings like this:
     # {

@@ -499,6 +513,10 @@ class TableauConfig(
         "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.",
     )

+    _fetch_size = pydantic_removed_field(
+        "fetch_size",
+    )
+
     # pre = True because we want to take some decision before pydantic initialize the configuration to default values
     @root_validator(pre=True)
     def projects_backward_compatibility(cls, values: Dict) -> Dict:

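The `_fetch_size = pydantic_removed_field("fetch_size")` line above keeps old recipes that still set `fetch_size` from failing validation. A removed-field shim of this kind typically registers a pre-validator that pops the obsolete key and warns; a minimal sketch under that assumption (pydantic v1 style, matching the `@root_validator` usage visible in this diff, and not DataHub's actual helper):

import warnings

from pydantic import BaseModel, root_validator


def removed_field(field_name: str):
    # Pre-validator: drop the obsolete key before field validation runs.
    def _strip(cls, values: dict) -> dict:
        if field_name in values:
            warnings.warn(f"Config field '{field_name}' was removed and is ignored.")
            values.pop(field_name)
        return values

    return root_validator(pre=True, allow_reuse=True)(_strip)


class ExampleConfig(BaseModel):
    page_size: int = 10
    _removed = removed_field("fetch_size")


# An old config that still sets fetch_size loads cleanly, with a warning:
print(ExampleConfig.parse_obj({"page_size": 5, "fetch_size": 250}))
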
@@ -618,6 +636,13 @@ class DatabaseTable:
         self.parsed_columns = parsed_columns


+@dataclass
+class SiteIdContentUrl:
+    site_id: str
+    site_content_url: str
+
+
+@dataclass
 class TableauSourceReport(StaleEntityRemovalSourceReport):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0

@@ -634,7 +659,14 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] =
+    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    last_authenticated_at: Optional[datetime] = None
+
+    num_expected_tableau_metadata_queries: int = 0
+    num_actual_tableau_metadata_queries: int = 0
+    tableau_server_error_stats: Dict[str, int] = dataclass_field(
+        default_factory=(lambda: defaultdict(int))
+    )


 def report_user_role(report: TableauSourceReport, server: Server) -> None:

@@ -645,7 +677,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
     # the site-role might be different on another site
     logged_in_user: UserInfo = UserInfo.from_server(server=server)

-    if not logged_in_user.
+    if not logged_in_user.has_site_administrator_explorer_privileges():
         report.warning(
             title=title,
             message=message,

@@ -705,6 +737,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         try:
             logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
             self.server = self.config.make_tableau_client(site_content_url)
+            self.report.last_authenticated_at = datetime.now(timezone.utc)
             report_user_role(report=self.report, server=self.server)
         # Note that we're not catching ConfigurationError, since we want that to throw.
         except ValueError as e:

@@ -770,7 +803,6 @@
                     config=self.config,
                     ctx=self.ctx,
                     site=site,
-                    site_id=site.id,
                     report=self.report,
                     server=self.server,
                     platform=self.platform,

@@ -789,8 +821,14 @@
             site_source = TableauSiteSource(
                 config=self.config,
                 ctx=self.ctx,
-                site=
-
+                site=(
+                    site
+                    if site
+                    else SiteIdContentUrl(
+                        site_id=self.server.site_id,
+                        site_content_url=self.config.site,
+                    )
+                ),
                 report=self.report,
                 server=self.server,
                 platform=self.platform,

@@ -823,8 +861,7 @@ class TableauSiteSource:
         self,
         config: TableauConfig,
         ctx: PipelineContext,
-        site:
-        site_id: Optional[str],
+        site: Union[SiteItem, SiteIdContentUrl],
         report: TableauSourceReport,
         server: Server,
         platform: str,

@@ -835,13 +872,18 @@ class TableauSiteSource:
         self.ctx: PipelineContext = ctx
         self.platform = platform

-        self.site: Optional[SiteItem] =
-        if
-        self.
+        self.site: Optional[SiteItem] = None
+        if isinstance(site, SiteItem):
+            self.site = site
+            assert site.id is not None, "Site ID is required"
+            self.site_id = site.id
+            self.site_content_url = site.content_url
+        elif isinstance(site, SiteIdContentUrl):
+            self.site = None
+            self.site_id = site.site_id
+            self.site_content_url = site.site_content_url
         else:
-
-        assert self.site.id is not None, "site_id is required when site is provided"
-        self.site_id = self.site.id
+            raise AssertionError("site or site id+content_url pair is required")

         self.database_tables: Dict[str, DatabaseTable] = {}
         self.tableau_stat_registry: Dict[str, UsageStat] = {}

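For reference, the new constructor contract means callers can build a TableauSiteSource without a full SiteItem from the REST API. A tiny illustrative sketch of the same Union dispatch (the SiteItem stand-in below is fabricated for the example):

from dataclasses import dataclass
from typing import Optional, Tuple, Union


@dataclass
class SiteIdContentUrl:
    site_id: str
    site_content_url: str


@dataclass
class FakeSiteItem:  # stand-in for tableauserverclient's SiteItem
    id: Optional[str]
    content_url: str


def resolve_site(site: Union[FakeSiteItem, SiteIdContentUrl]) -> Tuple[str, str]:
    # Mirrors the isinstance() dispatch in TableauSiteSource.__init__ above.
    if isinstance(site, FakeSiteItem):
        assert site.id is not None, "Site ID is required"
        return site.id, site.content_url
    elif isinstance(site, SiteIdContentUrl):
        return site.site_id, site.site_content_url
    raise AssertionError("site or site id+content_url pair is required")


print(resolve_site(SiteIdContentUrl(site_id="abc123", site_content_url="acme")))
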
@@ -895,17 +937,12 @@ class TableauSiteSource:
         # datasets also have the env in the browse path
         return f"/{self.config.env.lower()}{self.no_env_browse_prefix}"

-    def _re_authenticate(self):
-
-
-
-        self.server.
-
-    @property
-    def site_content_url(self) -> Optional[str]:
-        if self.site and self.site.content_url:
-            return self.site.content_url
-        return None
+    def _re_authenticate(self) -> None:
+        logger.info(f"Re-authenticating to Tableau site '{self.site_content_url}'")
+        # Sign-in again may not be enough because Tableau sometimes caches invalid sessions
+        # so we need to recreate the Tableau Server object
+        self.server = self.config.make_tableau_client(self.site_content_url)
+        self.report.last_authenticated_at = datetime.now(timezone.utc)


     def _populate_usage_stat_registry(self) -> None:
         if self.server is None:

@@ -1148,7 +1185,7 @@
         connection_type: str,
         query_filter: str,
         current_cursor: Optional[str],
-        fetch_size: int
+        fetch_size: int,
         retry_on_auth_error: bool = True,
         retries_remaining: Optional[int] = None,
     ) -> Tuple[dict, Optional[str], int]:

@@ -1171,6 +1208,7 @@
         )
         try:
             assert self.server is not None
+            self.report.num_actual_tableau_metadata_queries += 1
             query_data = query_metadata_cursor_based_pagination(
                 server=self.server,
                 main_query=query,

@@ -1180,24 +1218,56 @@
                 qry_filter=query_filter,
             )

-        except REAUTHENTICATE_ERRORS:
-
+        except REAUTHENTICATE_ERRORS as e:
+            self.report.tableau_server_error_stats[e.__class__.__name__] += 1
+            if not retry_on_auth_error or retries_remaining <= 0:
                 raise

-            #
-            #
-            #
-
+            # We have been getting some irregular authorization errors like below well before the expected expiry time
+            # - within few seconds of initial authentication . We'll retry without re-auth for such cases.
+            # <class 'tableauserverclient.server.endpoint.exceptions.NonXMLResponseError'>:
+            # b'{"timestamp":"xxx","status":401,"error":"Unauthorized","path":"/relationship-service-war/graphql"}'
+            if self.report.last_authenticated_at and (
+                datetime.now(timezone.utc) - self.report.last_authenticated_at
+                > REGULAR_AUTH_EXPIRY_PERIOD
+            ):
+                # If ingestion has been running for over 2 hours, the Tableau
+                # temporary credentials will expire. If this happens, this exception
+                # will be thrown, and we need to re-authenticate and retry.
+                self._re_authenticate()
+
             return self.get_connection_object_page(
                 query=query,
                 connection_type=connection_type,
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )
+
+        except InternalServerError as ise:
+            self.report.tableau_server_error_stats[InternalServerError.__name__] += 1
+            # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry.
+            # Extended with other retryable errors.
+            if ise.code in RETRIABLE_ERROR_CODES:
+                if retries_remaining <= 0:
+                    raise ise
+                logger.info(f"Retrying query due to error {ise.code}")
+                return self.get_connection_object_page(
+                    query=query,
+                    connection_type=connection_type,
+                    query_filter=query_filter,
+                    fetch_size=fetch_size,
+                    current_cursor=current_cursor,
+                    retry_on_auth_error=True,
+                    retries_remaining=retries_remaining - 1,
+                )
+            else:
+                raise ise
+
         except OSError:
+            self.report.tableau_server_error_stats[OSError.__name__] += 1
             # In tableauseverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04),
             # the request logic was changed to use threads.
             # https://github.com/tableau/server-client-python/commit/307d8a20a30f32c1ce615cca7c6a78b9b9bff081

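The hunk above implements a bounded-retry pattern: auth errors re-authenticate only when the session is old enough to have plausibly expired, retriable HTTP errors retry with a decremented budget, and everything else propagates. A condensed, self-contained sketch of that control flow (illustrative; the exception type and fetch function are stand-ins, not DataHub code):

from datetime import datetime, timedelta, timezone

REGULAR_AUTH_EXPIRY_PERIOD = timedelta(minutes=10)


class AuthError(Exception):  # stand-in for REAUTHENTICATE_ERRORS
    pass


def fetch_page(attempts: list, last_auth: datetime, retries_remaining: int = 3):
    try:
        attempts.append(1)
        if len(attempts) == 1:
            raise AuthError("401 Unauthorized")  # simulate one auth failure
        return "page"
    except AuthError:
        if retries_remaining <= 0:
            raise
        # Re-authenticate only if enough time has passed that the session
        # could genuinely have expired; otherwise just retry as-is.
        if datetime.now(timezone.utc) - last_auth > REGULAR_AUTH_EXPIRY_PERIOD:
            last_auth = datetime.now(timezone.utc)  # pretend we re-authenticated
        return fetch_page(attempts, last_auth, retries_remaining - 1)


stale_auth = datetime.now(timezone.utc) - timedelta(hours=2)
print(fetch_page([], stale_auth))  # -> "page" after one simulated auth failure
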
@@ -1212,7 +1282,7 @@
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining - 1,
             )

@@ -1300,7 +1370,7 @@
                 query_filter=query_filter,
                 fetch_size=fetch_size,
                 current_cursor=current_cursor,
-                retry_on_auth_error=
+                retry_on_auth_error=True,
                 retries_remaining=retries_remaining,
             )
         raise RuntimeError(f"Query {connection_type} error: {errors}")

@@ -1325,6 +1395,8 @@
         query_filter: dict = {},
         page_size_override: Optional[int] = None,
     ) -> Iterable[dict]:
+        query_filter = optimize_query_filter(query_filter)
+
         # Calls the get_connection_object_page function to get the objects,
         # and automatically handles pagination.
         page_size = page_size_override or self.config.page_size

@@ -1336,6 +1408,7 @@
         while has_next_page:
             filter_: str = make_filter(filter_page)

+            self.report.num_expected_tableau_metadata_queries += 1
             (
                 connection_objects,
                 current_cursor,

@@ -1345,7 +1418,11 @@
                 connection_type=connection_type,
                 query_filter=filter_,
                 current_cursor=current_cursor,
-
+                # `filter_page` contains metadata object IDs (e.g., Project IDs, Field IDs, Sheet IDs, etc.).
+                # The number of IDs is always less than or equal to page_size.
+                # If the IDs are primary keys, the number of metadata objects to load matches the number of records to return.
+                # In our case, mostly, the IDs are primary key, therefore, fetch_size is set equal to page_size.
+                fetch_size=page_size,
             )

             yield from connection_objects.get(c.NODES) or []

datahub/ingestion/source/tableau/tableau_common.py

@@ -1,3 +1,4 @@
+import copy
 import html
 import json
 import logging

@@ -35,6 +36,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
+from datahub.utilities.ordered_set import OrderedSet

 logger = logging.getLogger(__name__)

@@ -1000,3 +1002,19 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
     ]

     return filter_pages
+
+
+def optimize_query_filter(query_filter: dict) -> dict:
+    """
+    Duplicates in the filter cause duplicates in the result,
+    leading to entities/aspects being emitted multiple times unnecessarily
+    """
+    optimized_query = copy.deepcopy(query_filter)
+
+    if query_filter.get(c.ID_WITH_IN):
+        optimized_query[c.ID_WITH_IN] = list(OrderedSet(query_filter[c.ID_WITH_IN]))
+    if query_filter.get(c.PROJECT_NAME_WITH_IN):
+        optimized_query[c.PROJECT_NAME_WITH_IN] = list(
+            OrderedSet(query_filter[c.PROJECT_NAME_WITH_IN])
+        )
+    return optimized_query

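optimize_query_filter deduplicates the ID lists while preserving their order, so each entity is fetched and emitted once. A quick illustration of the effect in plain Python (dict.fromkeys stands in for DataHub's OrderedSet, which likewise keeps first-seen order; the constant value is an assumption):

import copy

ID_WITH_IN = "idWithin"  # assumed value of c.ID_WITH_IN, for illustration


def optimize_query_filter(query_filter: dict) -> dict:
    optimized = copy.deepcopy(query_filter)
    if query_filter.get(ID_WITH_IN):
        # dict.fromkeys dedupes while keeping first-seen order, like OrderedSet.
        optimized[ID_WITH_IN] = list(dict.fromkeys(query_filter[ID_WITH_IN]))
    return optimized


print(optimize_query_filter({ID_WITH_IN: ["a", "b", "a", "c", "b"]}))
# {'idWithin': ['a', 'b', 'c']}
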
datahub/ingestion/source/tableau/tableau_constant.py

@@ -82,4 +82,6 @@ PROJECT = "Project"
 SITE = "Site"
 IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
 SITE_PERMISSION = "sitePermission"
-
+ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"
+ROLE_SITE_ADMIN_CREATOR = "SiteAdministratorCreator"
+ROLE_SERVER_ADMIN = "ServerAdministrator"

datahub/ingestion/source/tableau/tableau_server_wrapper.py

@@ -11,8 +11,12 @@ class UserInfo:
     site_role: str
     site_id: str

-    def
-        return self.site_role
+    def has_site_administrator_explorer_privileges(self):
+        return self.site_role in [
+            c.ROLE_SITE_ADMIN_EXPLORER,
+            c.ROLE_SITE_ADMIN_CREATOR,
+            c.ROLE_SERVER_ADMIN,
+        ]

     @staticmethod
     def from_server(server: Server) -> "UserInfo":

datahub/ingestion/source/tableau/tableau_validation.py

@@ -28,7 +28,7 @@ def check_user_role(

     try:
         # TODO: Add check for `Enable Derived Permissions`
-        if not logged_in_user.
+        if not logged_in_user.has_site_administrator_explorer_privileges():
             capability_dict[c.SITE_PERMISSION] = CapabilityReport(
                 capable=False,
                 failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.",

datahub/ingestion/source/unity/proxy.py

@@ -4,7 +4,7 @@ Manage the communication with DataBricks Server and provide equivalent dataclass

 import dataclasses
 import logging
-from datetime import datetime
+from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Union, cast
 from unittest.mock import patch

@@ -27,6 +27,7 @@ from databricks.sdk.service.sql import (
 from databricks.sdk.service.workspace import ObjectType

 import datahub
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
     UnityCatalogProxyProfilingMixin,

@@ -211,16 +212,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             id=obj.object_id,
             path=obj.path,
             language=obj.language,
-            created_at=(
-
-                if obj.created_at
-                else None
-            ),
-            modified_at=(
-                datetime.fromtimestamp(obj.modified_at / 1000, tz=timezone.utc)
-                if obj.modified_at
-                else None
-            ),
+            created_at=parse_ts_millis(obj.created_at),
+            modified_at=parse_ts_millis(obj.modified_at),
         )

     def query_history(

@@ -452,17 +445,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             properties=obj.properties or {},
             owner=obj.owner,
             generation=obj.generation,
-            created_at=(
-                datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc)
-                if obj.created_at
-                else None
-            ),
+            created_at=(parse_ts_millis(obj.created_at) if obj.created_at else None),
             created_by=obj.created_by,
-            updated_at=(
-                datetime.fromtimestamp(obj.updated_at / 1000, tz=timezone.utc)
-                if obj.updated_at
-                else None
-            ),
+            updated_at=(parse_ts_millis(obj.updated_at) if obj.updated_at else None),
             updated_by=obj.updated_by,
             table_id=obj.table_id,
             comment=obj.comment,

@@ -500,12 +485,8 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             query_id=info.query_id,
             query_text=info.query_text,
             statement_type=info.statement_type,
-            start_time=
-
-            ),
-            end_time=datetime.fromtimestamp(
-                info.query_end_time_ms / 1000, tz=timezone.utc
-            ),
+            start_time=parse_ts_millis(info.query_start_time_ms),
+            end_time=parse_ts_millis(info.query_end_time_ms),
             user_id=info.user_id,
             user_name=info.user_name,
             executed_as_user_id=info.executed_as_user_id,

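The three hunks above replace repeated `datetime.fromtimestamp(x / 1000, tz=timezone.utc) if x else None` expressions with the new `parse_ts_millis` helper from `mce_builder` (which this release also touches, +20 -4). Judging from the code it replaces, the helper behaves like this sketch (its exact signature is an assumption):

from datetime import datetime, timezone
from typing import Optional


def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
    # Epoch milliseconds -> timezone-aware UTC datetime; None passes through,
    # matching the `if x else None` guards removed in the hunks above.
    if ts is None:
        return None
    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)


print(parse_ts_millis(0))     # 1970-01-01 00:00:00+00:00
print(parse_ts_millis(None))  # None
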
datahub/ingestion/source/usage/usage_common.py

@@ -54,6 +54,20 @@ def default_user_urn_builder(email: str) -> str:
     return builder.make_user_urn(email.split("@")[0])


+def extract_user_email(user: str) -> Optional[str]:
+    """Extracts user email from user input
+
+    >>> extract_user_email('urn:li:corpuser:abc@xyz.com')
+    'abc@xyz.com'
+    >>> extract_user_email('urn:li:corpuser:abc')
+    >>> extract_user_email('abc@xyz.com')
+    'abc@xyz.com'
+    """
+    if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")):
+        user = user.split(":")[-1]
+    return user if "@" in user else None
+
+
 def make_usage_workunit(
     bucket_start_time: datetime,
     resource: ResourceType,

@@ -104,7 +118,7 @@
         DatasetUserUsageCountsClass(
             user=user_urn_builder(user),
             count=count,
-            userEmail=user
+            userEmail=extract_user_email(user),
         )
         for user, count in user_freq
     ],

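extract_user_email normalizes whatever arrives in the usage rows, whether a bare email or a corpuser/corpGroup URN, down to an email address (or None when no "@" is present), and its doctests double as a spec. They can be exercised directly, for example:

import doctest

# Assuming the module path from this diff's file listing:
import datahub.ingestion.source.usage.usage_common as usage_common

results = doctest.testmod(usage_common, verbose=False)
print(f"{results.attempted} doctests run, {results.failed} failures")
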
datahub/ingestion/source_report/ingestion_stage.py

@@ -14,6 +14,8 @@ LINEAGE_EXTRACTION = "Lineage Extraction"
 USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion"
 USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats"
 USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation"
+EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage"
+VIEW_PARSING = "View Parsing"
 QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"

@@ -40,4 +42,5 @@ class IngestionStageReport:
         self._timer = PerfTimer()

         self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+        logger.info(f"Stage started: {self.ingestion_stage}")
         self._timer.start()