acryl-datahub 1.1.0.5rc7__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2463 -2465
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +58 -58
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +0 -7
- datahub/cli/cli_utils.py +73 -0
- datahub/cli/delete_cli.py +0 -6
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +148 -228
- datahub/cli/exists_cli.py +0 -4
- datahub/cli/get_cli.py +0 -4
- datahub/cli/ingest_cli.py +1 -20
- datahub/cli/put_cli.py +0 -6
- datahub/cli/quickstart_versioning.py +50 -5
- datahub/cli/specific/assertions_cli.py +0 -6
- datahub/cli/specific/datacontract_cli.py +0 -6
- datahub/cli/specific/dataproduct_cli.py +0 -22
- datahub/cli/specific/dataset_cli.py +0 -11
- datahub/cli/specific/forms_cli.py +0 -6
- datahub/cli/specific/group_cli.py +0 -4
- datahub/cli/specific/structuredproperties_cli.py +0 -7
- datahub/cli/specific/user_cli.py +0 -4
- datahub/cli/state_cli.py +0 -4
- datahub/cli/timeline_cli.py +0 -4
- datahub/entrypoints.py +4 -3
- datahub/ingestion/autogenerated/capability_summary.json +88 -23
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/graph/client.py +2 -2
- datahub/ingestion/run/pipeline.py +43 -0
- datahub/ingestion/source/bigquery_v2/bigquery.py +9 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dremio/dremio_source.py +1 -4
- datahub/ingestion/source/gcs/gcs_source.py +9 -1
- datahub/ingestion/source/identity/okta.py +0 -13
- datahub/ingestion/source/powerbi/powerbi.py +0 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/sigma/sigma.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +100 -58
- datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +1 -1
- datahub/ingestion/source/sql/hive_metastore.py +0 -10
- datahub/ingestion/source/sql/sql_common.py +4 -0
- datahub/ingestion/source/sql/vertica.py +0 -4
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/superset.py +56 -1
- datahub/ingestion/source/tableau/tableau.py +40 -34
- datahub/ingestion/source/tableau/tableau_constant.py +0 -2
- datahub/ingestion/source/unity/source.py +9 -1
- datahub/sdk/lineage_client.py +2 -2
- datahub/sql_parsing/sql_parsing_aggregator.py +21 -12
- datahub/sql_parsing/sqlglot_lineage.py +40 -15
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_config.py (+11 -0)

@@ -1,6 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, List, Optional, Set

 import pydantic

@@ -53,6 +54,11 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
 ]


+class QueryDedupStrategyType(Enum):
+    STANDARD = "STANDARD"
+    NONE = "NONE"
+
+
 class TagOption(StrEnum):
     with_lineage = "with_lineage"
     without_lineage = "without_lineage"

@@ -248,6 +254,11 @@ class SnowflakeV2Config(
         "This is useful if you have a large number of schemas and want to avoid bulk fetching the schema for each table/view.",
     )

+    query_dedup_strategy: QueryDedupStrategyType = Field(
+        default=QueryDedupStrategyType.STANDARD,
+        description=f"Experimental: Choose the strategy for query deduplication (default value is appropriate for most use-cases; make sure you understand performance implications before changing it). Allowed values are: {', '.join([s.name for s in QueryDedupStrategyType])}",
+    )
+
     _check_role_grants_removed = pydantic_removed_field("check_role_grants")
     _provision_role_removed = pydantic_removed_field("provision_role")

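query_dedup_strategy is an ordinary recipe option on the snowflake source. A minimal sketch of a run that opts out of deduplication (account and credential values are placeholders; only query_dedup_strategy is new in this release):

    # Hypothetical recipe: only query_dedup_strategy is introduced by this version.
    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "snowflake",
                "config": {
                    "account_id": "my_account",    # placeholder
                    "username": "datahub_reader",  # placeholder
                    "password": "********",        # placeholder
                    # New option: "STANDARD" (default) or "NONE"
                    "query_dedup_strategy": "NONE",
                },
            },
            "sink": {"type": "console"},
        }
    )
    pipeline.run()
    pipeline.raise_from_status()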

datahub/ingestion/source/snowflake/snowflake_queries.py (+100 -58)

@@ -28,6 +28,7 @@ from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     DEFAULT_TEMP_TABLES_PATTERNS,
+    QueryDedupStrategyType,
     SnowflakeFilterConfig,
     SnowflakeIdentifierConfig,
 )

@@ -118,6 +119,8 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
     include_query_usage_statistics: bool = True
     include_operations: bool = True

+    query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
+

 class SnowflakeQueriesSourceConfig(
     SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig

@@ -374,12 +377,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
     ) -> Iterable[
         Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
     ]:
-        query_log_query =
+        query_log_query = QueryLogQueryBuilder(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
-
+            dedup_strategy=self.config.query_dedup_strategy,
+        ).build_enriched_query_log_query()

         with self.structured_reporter.report_exc(
             "Error fetching query log from Snowflake"

@@ -710,63 +714,101 @@ class SnowflakeQueriesSource(Source):
         super().close()


-
-
-
-
-
-
-
-
-
-)
-
-
-
-
-
-
-
-
-
-
+class QueryLogQueryBuilder:
+    def __init__(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+        bucket_duration: BucketDuration,
+        deny_usernames: Optional[List[str]],
+        max_tables_per_query: int = 20,
+        dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
+    ):
+        self.start_time = start_time
+        self.end_time = end_time
+        self.start_time_millis = int(start_time.timestamp() * 1000)
+        self.end_time_millis = int(end_time.timestamp() * 1000)
+        self.max_tables_per_query = max_tables_per_query
+        self.dedup_strategy = dedup_strategy
+
+        self.users_filter = "TRUE"
+        if deny_usernames:
+            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
+            self.users_filter = f"user_name NOT IN ({user_not_in})"
+
+        self.time_bucket_size = bucket_duration.value
+        assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
+
+    def _query_fingerprinted_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            secondary_fingerprint_sql = """
+                CASE
+                    WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+                    -- Extract project id and hash it
+                    THEN CAST(HASH(
+                        REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                        REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+                    ) AS VARCHAR)
+                    ELSE NULL
+                END"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            secondary_fingerprint_sql = "NULL"
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
+        return f"""
+SELECT *,
+    -- TODO: Generate better fingerprints for each query by pushing down regex logic.
+    query_history.query_parameterized_hash as query_fingerprint,
+    -- Optional and additional hash to be used for query deduplication and final query identity
+    {secondary_fingerprint_sql} as query_secondary_fingerprint
+FROM
+    snowflake.account_usage.query_history
+WHERE
+    query_history.start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+    AND query_history.start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+    AND execution_status = 'SUCCESS'
+    AND {self.users_filter}"""
+
+    def _query_deduplicated_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            return f"""
+SELECT
+    *,
+    DATE_TRUNC(
+        {self.time_bucket_size},
+        CONVERT_TIMEZONE('UTC', start_time)
+    ) AS bucket_start_time,
+    COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
+FROM
+    fingerprinted_queries
+QUALIFY
+    ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            return f"""
+SELECT
+    *,
+    DATE_TRUNC(
+        {self.time_bucket_size},
+        CONVERT_TIMEZONE('UTC', start_time)
+    ) AS bucket_start_time,
+    1 AS query_count,
+FROM
+    fingerprinted_queries"""
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )

-
+    def build_enriched_query_log_query(self) -> str:
+        return f"""\
 WITH
 fingerprinted_queries as (
-
-    -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-    query_history.query_parameterized_hash as query_fingerprint,
-    -- Optional and additional hash to be used for query deduplication and final query identity
-    CASE
-        WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
-        -- Extract project id and hash it
-        THEN CAST(HASH(
-            REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
-            REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
-        ) AS VARCHAR)
-        ELSE NULL
-    END as query_secondary_fingerprint
-FROM
-    snowflake.account_usage.query_history
-WHERE
-    query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
-    AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
-    AND execution_status = 'SUCCESS'
-    AND {users_filter}
+{self._query_fingerprinted_queries()}
 )
 , deduplicated_queries as (
-
-    *,
-    DATE_TRUNC(
-        {time_bucket_size},
-        CONVERT_TIMEZONE('UTC', start_time)
-    ) AS bucket_start_time,
-    COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
-FROM
-    fingerprinted_queries
-QUALIFY
-    ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
+{self._query_deduplicated_queries()}
 )
 , raw_access_history AS (
     SELECT

@@ -780,9 +822,9 @@ fingerprinted_queries as (
 FROM
     snowflake.account_usage.access_history
 WHERE
-    query_start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
-    AND query_start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
-    AND {users_filter}
+    query_start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+    AND query_start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+    AND {self.users_filter}
     AND query_id IN (
         SELECT query_id FROM deduplicated_queries
     )

@@ -795,7 +837,7 @@ fingerprinted_queries as (
         query_start_time,
         ARRAY_SLICE(
             FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
-            0, {
+            0, {self.max_tables_per_query}
         ) as direct_objects_accessed,
         -- TODO: Drop the columns.baseSources subfield.
         FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified,

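The new QueryLogQueryBuilder is a plain class, so the SQL it generates can be inspected outside an ingestion run. A rough sketch (the BucketDuration import path is an assumption; the other names come from the modules changed above):

    # Sketch: render the enriched query-log SQL for both dedup strategies.
    from datetime import datetime, timezone

    from datahub.configuration.time_window_config import BucketDuration  # assumed path
    from datahub.ingestion.source.snowflake.snowflake_config import QueryDedupStrategyType
    from datahub.ingestion.source.snowflake.snowflake_queries import QueryLogQueryBuilder

    window = dict(
        start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),
        end_time=datetime(2024, 1, 2, tzinfo=timezone.utc),
        bucket_duration=BucketDuration.HOUR,
        deny_usernames=None,
    )

    for strategy in (QueryDedupStrategyType.STANDARD, QueryDedupStrategyType.NONE):
        builder = QueryLogQueryBuilder(dedup_strategy=strategy, **window)
        sql = builder.build_enriched_query_log_query()
        print(strategy.name, "->", len(sql.splitlines()), "lines of SQL")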

datahub/ingestion/source/snowflake/snowflake_v2.py (+11 -1)

@@ -32,6 +32,7 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
     SnowflakeEdition,

@@ -97,7 +98,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,

@@ -577,6 +585,7 @@ class SnowflakeV2Source(

         queries_extractor = SnowflakeQueriesExtractor(
             connection=self.connection,
+            # TODO: this should be its own section in main recipe
             config=SnowflakeQueriesExtractorConfig(
                 window=BaseTimeWindowConfig(
                     start_time=self.config.start_time,

@@ -591,6 +600,7 @@ class SnowflakeV2Source(
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
                 pushdown_deny_usernames=self.config.pushdown_deny_usernames,
+                query_dedup_strategy=self.config.query_dedup_strategy,
             ),
             structured_report=self.report,
             filters=self.filters,


datahub/ingestion/source/snowflake/stored_proc_lineage.py (+1 -1)

@@ -65,7 +65,7 @@ class StoredProcLineageTracker(Closeable):
         # { root_query_id -> StoredProcExecutionLineage }
         self._stored_proc_execution_lineage: FileBackedDict[
             StoredProcExecutionLineage
-        ] = FileBackedDict(shared_connection)
+        ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")

     def add_stored_proc_call(self, call: StoredProcCall) -> None:
         """Add a stored procedure call to track."""

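The added tablename matters because several FileBackedDict instances can share one SQLite-backed connection; giving this dict its own table keeps its rows separate from other state stored on the same connection. A hedged sketch of the pattern (FileBackedDict and ConnectionWrapper live in datahub.utilities.file_backed_collections; ConnectionWrapper is assumed to manage a temporary SQLite file when constructed without arguments):

    # Sketch: two dicts sharing one backing connection, separated by table name.
    from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict

    shared = ConnectionWrapper()  # assumption: no-arg constructor uses a temp file
    proc_lineage: FileBackedDict[str] = FileBackedDict(shared, tablename="stored_proc_lineage")
    other_state: FileBackedDict[str] = FileBackedDict(shared, tablename="other_state")

    proc_lineage["root-query-id"] = "lineage payload"
    other_state["root-query-id"] = "unrelated value"  # no collision: separate tables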

datahub/ingestion/source/sql/hive_metastore.py (+0 -10)

@@ -52,7 +52,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import Dataset
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
     ViewPropertiesClass,

@@ -601,10 +600,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             yield dpi_aspect

         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.table_subtype]),
         ).as_workunit()

@@ -810,10 +806,7 @@ class HiveMetastoreSource(SQLAlchemySource):

         # Add views subtype
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.view_subtype]),
         ).as_workunit()

@@ -824,10 +817,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             viewLogic=dataset.view_definition if dataset.view_definition else "",
         )
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="viewProperties",
             aspect=view_properties_aspect,
         ).as_workunit()

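The removed keyword arguments here (and the matching removals in vertica.py and tableau.py below) all rely on the same behavior: MetadataChangeProposalWrapper derives entityType from the URN and aspectName from the aspect, and changeType defaults to UPSERT, so spelling them out is redundant. A minimal sketch of the trimmed call style:

    # Sketch: entityType, changeType and aspectName are inferred automatically.
    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import SubTypesClass

    workunit = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
        aspect=SubTypesClass(typeNames=["Table"]),
    ).as_workunit()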

datahub/ingestion/source/sql/sql_common.py (+4 -0)

@@ -292,6 +292,10 @@ class ProfileMetadata:
     SourceCapability.CONTAINERS,
     "Enabled by default",
     supported=True,
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
 )
 @capability(
     SourceCapability.DESCRIPTIONS,


datahub/ingestion/source/sql/vertica.py (+0 -4)

@@ -45,7 +45,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,

@@ -501,10 +500,7 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()


datahub/ingestion/source/sql_queries.py (+2 -2)

@@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         description="The default schema to use for unqualified table names",
         default=None,
     )
-
+    override_dialect: Optional[str] = Field(
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
         default=None,
     )

@@ -181,7 +181,7 @@ class SqlQueriesSource(Source):
             schema_resolver=self.schema_resolver,
             default_db=self.config.default_db,
             default_schema=self.config.default_schema,
-
+            override_dialect=self.config.override_dialect,
         )
         if result.debug_info.table_error:
             logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")

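This hunk wires the source's override_dialect setting through to the lineage parser call, so a recipe can pin the dialect when automatic detection guesses wrong. A hedged config fragment (the sql-queries plugin name and the query_file/platform keys are assumptions based on the existing source; only the override_dialect wiring changes in this release):

    # Sketch: sql-queries source config pinning the parser dialect.
    sql_queries_source = {
        "type": "sql-queries",               # assumed plugin name
        "config": {
            "query_file": "./queries.json",   # placeholder path
            "platform": "snowflake",          # placeholder platform
            "override_dialect": "snowflake",  # forwarded to the sqlglot-based parser
        },
    }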

datahub/ingestion/source/superset.py (+56 -1)

@@ -658,6 +658,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         if datasource_id:
             dataset_info = self.get_dataset_info(datasource_id).get("result", {})
             dataset_column_info = dataset_info.get("columns", [])
+            dataset_metric_info = dataset_info.get("metrics", [])

             for column in dataset_column_info:
                 col_name = column.get("column_name", "")

@@ -671,6 +672,17 @@ class SupersetSource(StatefulIngestionSourceBase):
                     continue

                 dataset_columns.append((col_name, col_type, col_description))
+
+            for metric in dataset_metric_info:
+                metric_name = metric.get("metric_name", "")
+                metric_type = metric.get("metric_type", "")
+                metric_description = metric.get("description", "")
+
+                if metric_name == "" or metric_type == "":
+                    logger.info(f"could not construct metric lineage for {metric}")
+                    continue
+
+                dataset_columns.append((metric_name, metric_type, metric_description))
         else:
             # if no datasource id, cannot build cll, just return
             logger.warning(

@@ -972,19 +984,44 @@ class SupersetSource(StatefulIngestionSourceBase):
             schema_fields.append(field)
         return schema_fields

+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
     def gen_schema_metadata(
         self,
         dataset_response: dict,
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=
+            fields=column_fields + metric_fields,
         )
         return schema_metadata

@@ -1049,6 +1086,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
         fine_grained_lineages: List[FineGrainedLineageClass] = []

         for column in columns:

@@ -1067,6 +1106,22 @@ class SupersetSource(StatefulIngestionSourceBase):
                 )
             )

+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(


datahub/ingestion/source/tableau/tableau.py (+40 -34)

@@ -149,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     ChartInfoClass,
     ChartUsageStatisticsClass,
     DashboardInfoClass,

@@ -529,6 +528,14 @@ class TableauConfig(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )
+    emit_all_published_datasources: bool = Field(
+        default=False,
+        description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
+    )
+    emit_all_embedded_datasources: bool = Field(
+        default=False,
+        description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
+    )

     env: str = Field(
         default=builder.DEFAULT_ENV,

@@ -2180,32 +2187,32 @@ class TableauSiteSource:
             else []
         )

-
-
-
-
-
-
+        tableau_table_list = csql.get(c.TABLES, [])
+        if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
+            not tableau_table_list
+            and self.config.extract_lineage_from_unsupported_custom_sql_queries
+        ):
+            if not tableau_table_list:
+                # custom sql tables may contain unsupported sql, causing incomplete lineage
+                # we extract the lineage from the raw queries
+                logger.debug(
+                    "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
+                )
+            else:
+                # The Tableau SQL parser is much worse than our sqlglot based parser,
+                # so relying on metadata parsed by Tableau from SQL queries can be
+                # less accurate. This option allows us to ignore Tableau's parser and
+                # only use our own.
+                logger.debug("Parsing TLL & CLL from custom sql (forced)")
+
             yield from self._create_lineage_from_unsupported_csql(
                 csql_urn, csql, columns
             )
         else:
-
-
-
-
-            yield from self._create_lineage_to_upstream_tables(
-                csql_urn, tables, datasource
-            )
-            elif (
-                self.config.extract_lineage_from_unsupported_custom_sql_queries
-            ):
-                logger.debug("Extracting TLL & CLL from custom sql")
-                # custom sql tables may contain unsupported sql, causing incomplete lineage
-                # we extract the lineage from the raw queries
-                yield from self._create_lineage_from_unsupported_csql(
-                    csql_urn, csql, columns
-                )
+            # lineage from custom sql -> datasets/tables #
+            yield from self._create_lineage_to_upstream_tables(
+                csql_urn, tableau_table_list, datasource
+            )

         # Schema Metadata
         schema_metadata = self.get_schema_metadata_for_custom_sql(columns)

@@ -2243,7 +2250,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
         )

@@ -2408,7 +2414,6 @@ class TableauSiteSource:
         upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1

@@ -2594,7 +2599,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1

@@ -2640,14 +2644,10 @@ class TableauSiteSource:
     def get_metadata_change_proposal(
         self,
         urn: str,
-        aspect_name: str,
         aspect: Union["UpstreamLineage", "SubTypesClass"],
     ) -> MetadataWorkUnit:
         return MetadataChangeProposalWrapper(
-            entityType=c.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=urn,
-            aspectName=aspect_name,
             aspect=aspect,
         ).as_workunit()

@@ -2755,7 +2755,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             datasource_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1

@@ -2774,7 +2773,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(
                 typeNames=(
                     ["Embedded Data Source"]

@@ -2860,7 +2858,11 @@ class TableauSiteSource:
         return datasource

     def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter =
+        datasource_filter = (
+            {}
+            if self.config.emit_all_published_datasources
+            else {c.ID_WITH_IN: self.datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=published_datasource_graphql_query,

@@ -3553,7 +3555,11 @@ class TableauSiteSource:
         return browse_paths

     def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter =
+        datasource_filter = (
+            {}
+            if self.config.emit_all_embedded_datasources
+            else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=embedded_datasource_graphql_query,

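The two new flags let a Tableau recipe request every published and embedded data source instead of only those reachable from ingested workbooks. A hedged config fragment (connection fields are placeholders):

    # Sketch: Tableau source config exercising the two new boolean flags.
    tableau_source = {
        "type": "tableau",
        "config": {
            "connect_uri": "https://tableau.example.com",  # placeholder
            "site": "my_site",                             # placeholder
            "token_name": "datahub",                       # placeholder
            "token_value": "********",                     # placeholder
            # New in this version; both default to False.
            "emit_all_published_datasources": True,
            "emit_all_embedded_datasources": True,
        },
    }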

datahub/ingestion/source/tableau/tableau_constant.py (+0 -2)

@@ -50,7 +50,6 @@ TABLES = "tables"
 DESCRIPTION = "description"
 SQL = "SQL"
 QUERY = "query"
-SUB_TYPES = "subTypes"
 VIEW = "view"
 CUSTOM_SQL = "Custom SQL"
 REMOTE_TYPE = "remoteType"

@@ -58,7 +57,6 @@ UNKNOWN = "UNKNOWN"
 PUBLISHED_DATA_SOURCE = "PublishedDatasource"
 LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
-UPSTREAM_LINEAGE = "upstreamLineage"
 OWNER = "owner"
 USERNAME = "username"
 HAS_EXTRACTS = "hasExtracts"