acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/datashares.py (new file)
@@ -0,0 +1,236 @@
+from typing import Dict, Iterable, List, Optional, Union
+
+from pydantic import BaseModel
+
+from datahub.api.entities.platformresource.platform_resource import (
+    ElasticPlatformResourceQuery,
+    PlatformResource,
+    PlatformResourceKey,
+    PlatformResourceSearchFields,
+)
+from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.redshift.config import RedshiftConfig
+from datahub.ingestion.source.redshift.redshift_schema import (
+    InboundDatashare,
+    OutboundDatashare,
+    PartialInboundDatashare,
+    RedshiftTable,
+    RedshiftView,
+)
+from datahub.ingestion.source.redshift.report import RedshiftReport
+from datahub.sql_parsing.sql_parsing_aggregator import KnownLineageMapping
+from datahub.utilities.search_utils import LogicalOperator
+
+
+class OutboundSharePlatformResource(BaseModel):
+    namespace: str
+    platform_instance: Optional[str]
+    env: str
+    source_database: str
+    share_name: str
+
+    def get_key(self) -> str:
+        return f"{self.namespace}.{self.share_name}"
+
+
+PLATFORM_RESOURCE_TYPE = "OUTBOUND_DATASHARE"
+
+
+class RedshiftDatasharesHelper:
+    """
+    Redshift datashares lineage generation relies on the PlatformResource entity
+    to identify the producer namespace and its platform_instance and env.
+
+    Ingestion of any database in a namespace will
+    A. generate a PlatformResource entity for all outbound shares in the namespace.
+    B. generate lineage with upstream tables from another namespace, if the database
+       is created from an inbound share.
+    """
+
+    def __init__(
+        self,
+        config: RedshiftConfig,
+        report: RedshiftReport,
+        graph: Optional[DataHubGraph],
+    ):
+        self.platform = "redshift"
+        self.config = config
+        self.report = report
+        self.graph = graph
+
+    def to_platform_resource(
+        self, shares: List[OutboundDatashare]
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        if not shares:
+            self.report.outbound_shares_count = 0
+            return
+
+        self.report.outbound_shares_count = len(shares)
+        # The producer namespace will be the current namespace for all
+        # outbound data shares
+
+        for share in shares:
+            producer_namespace = share.producer_namespace
+            try:
+                platform_resource_key = PlatformResourceKey(
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    resource_type=PLATFORM_RESOURCE_TYPE,
+                    primary_key=share.get_key(),
+                )
+
+                value = OutboundSharePlatformResource(
+                    namespace=producer_namespace,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                    source_database=share.source_database,
+                    share_name=share.share_name,
+                )
+
+                platform_resource = PlatformResource.create(
+                    key=platform_resource_key,
+                    value=value,
+                    secondary_keys=[share.share_name, share.producer_namespace],
+                )
+
+                yield from platform_resource.to_mcps()
+
+            except Exception as exc:
+                self.report.warning(
+                    title="Downstream lineage to outbound datashare may not work",
+                    message="Failed to generate platform resource for outbound datashares",
+                    context=f"Namespace {share.producer_namespace} Share {share.share_name}",
+                    exc=exc,
+                )
+
+    def generate_lineage(
+        self,
+        share: Union[InboundDatashare, PartialInboundDatashare],
+        tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]],
+    ) -> Iterable[KnownLineageMapping]:
+        upstream_share = self.find_upstream_share(share)
+
+        if not upstream_share:
+            return
+
+        for schema in tables:
+            for table in tables[schema]:
+                dataset_urn = self.gen_dataset_urn(
+                    f"{share.consumer_database}.{schema}.{table.name}",
+                    self.config.platform_instance,
+                    self.config.env,
+                )
+
+                upstream_dataset_urn = self.gen_dataset_urn(
+                    f"{upstream_share.source_database}.{schema}.{table.name}",
+                    upstream_share.platform_instance,
+                    upstream_share.env,
+                )
+
+                yield KnownLineageMapping(
+                    upstream_urn=upstream_dataset_urn, downstream_urn=dataset_urn
+                )
+
+    def find_upstream_share(
+        self, share: Union[InboundDatashare, PartialInboundDatashare]
+    ) -> Optional[OutboundSharePlatformResource]:
+        if not self.graph:
+            self.report.warning(
+                title="Upstream lineage of inbound datashare will be missing",
+                message="Missing datahub graph. Either use the datahub-rest sink or "
+                "set the top-level datahub_api config in the recipe",
+            )
+        else:
+            resources = self.get_platform_resources(self.graph, share)
+
+            if len(resources) == 0 or (
+                not any(
+                    [
+                        resource.resource_info is not None
+                        and resource.resource_info.resource_type
+                        == PLATFORM_RESOURCE_TYPE
+                        for resource in resources
+                    ]
+                )
+            ):
+                self.report.info(
+                    title="Upstream lineage of inbound datashare will be missing",
+                    message="Missing platform resource for share. "
+                    "Setup redshift ingestion for namespace if not already done. If ingestion is setup, "
+                    "check whether ingestion user has ALTER/SHARE permission to share.",
+                    context=share.get_description(),
+                )
+            else:
+                # Ideally we should get only one resource, as the primary key is
+                # namespace+share and the type is "OUTBOUND_DATASHARE"
+                for resource in resources:
+                    try:
+                        assert (
+                            resource.resource_info is not None
+                            and resource.resource_info.value is not None
+                        )
+                        return resource.resource_info.value.as_pydantic_object(
+                            OutboundSharePlatformResource, True
+                        )
+                    except Exception as e:
+                        self.report.warning(
+                            title="Upstream lineage of inbound datashare will be missing",
+                            message="Failed to parse platform resource for outbound datashare",
+                            context=share.get_description(),
+                            exc=e,
+                        )
+
+        return None
+
+    def get_platform_resources(
+        self,
+        graph: DataHubGraph,
+        share: Union[InboundDatashare, PartialInboundDatashare],
+    ) -> List[PlatformResource]:
+        # NOTE: ideally we receive InboundDatashare and not PartialInboundDatashare.
+        # However, due to the varchar(128) type of the database table that captures
+        # datashare options, we may receive only partial information about the inbound share.
+        # The alternate option to get InboundDatashare using svv_datashares requires superuser.
+        if isinstance(share, PartialInboundDatashare):
+            return list(
+                PlatformResource.search_by_filters(
+                    graph,
+                    ElasticPlatformResourceQuery.create_from()
+                    .group(LogicalOperator.AND)
+                    .add_field_match(
+                        PlatformResourceSearchFields.RESOURCE_TYPE,
+                        PLATFORM_RESOURCE_TYPE,
+                    )
+                    .add_field_match(
+                        PlatformResourceSearchFields.PLATFORM, self.platform
+                    )
+                    .add_field_match(
+                        PlatformResourceSearchFields.SECONDARY_KEYS,
+                        share.share_name,
+                    )
+                    .add_wildcard(
+                        PlatformResourceSearchFields.SECONDARY_KEYS.field_name,
+                        f"{share.producer_namespace_prefix}*",
+                    )
+                    .end(),
+                )
+            )
+        return list(
+            PlatformResource.search_by_key(
+                graph, key=share.get_key(), primary=True, is_exact=True
+            )
+        )
+
+    # TODO: Refactor and move to a new RedshiftIdentifierBuilder class
+    def gen_dataset_urn(
+        self, datahub_dataset_name: str, platform_instance: Optional[str], env: str
+    ) -> str:
+        return make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=datahub_dataset_name,
+            platform_instance=platform_instance,
+            env=env,
+        )
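The new file above is the producer/consumer glue for datashare lineage: to_platform_resource writes an entry under the primary key <producer_namespace>.<share_name>, and find_upstream_share rebuilds that same key from the consumer's inbound share to look it up. A minimal, self-contained sketch of that key handshake (not the DataHub PlatformResource API; the store and values are illustrative):

from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class OutboundShare:
    producer_namespace: str
    share_name: str
    source_database: str

    def get_key(self) -> str:
        # Same key shape as OutboundDatashare.get_key() used above
        return f"{self.producer_namespace}.{self.share_name}"

# Stand-in for the PlatformResource store, keyed by primary key
resource_store: Dict[str, OutboundShare] = {}

# Producer-side ingestion registers its outbound shares
share = OutboundShare("ns-1234", "sales_share", "sales_db")
resource_store[share.get_key()] = share

# Consumer-side ingestion resolves an inbound share to its producer
def find_upstream(producer_namespace: str, share_name: str) -> Optional[OutboundShare]:
    return resource_store.get(f"{producer_namespace}.{share_name}")

upstream = find_upstream("ns-1234", "sales_share")
assert upstream is not None and upstream.source_database == "sales_db"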
datahub/ingestion/source/redshift/lineage.py
@@ -813,9 +813,13 @@ class RedshiftLineageExtractor:
             )
 
             tablename = table.name
-            if
+            if (
+                table.is_external_table()
+                and schema.is_external_schema()
+                and schema.external_platform
+            ):
                 # external_db_params = schema.option
-                upstream_platform = schema.
+                upstream_platform = schema.external_platform.lower()
                 catalog_upstream = UpstreamClass(
                     mce_builder.make_dataset_urn_with_platform_instance(
                         upstream_platform,
datahub/ingestion/source/redshift/lineage_v2.py
@@ -401,11 +401,14 @@ class RedshiftSqlLineageV2(Closeable):
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
             for table in tables:
-
-
-
+                schema = db_schemas[self.database][schema_name]
+                if (
+                    table.is_external_table()
+                    and schema.is_external_schema()
+                    and schema.external_platform
+                ):
                     # external_db_params = schema.option
-                    upstream_platform = schema.
+                    upstream_platform = schema.external_platform.lower()
 
                     table_urn = mce_builder.make_dataset_urn_with_platform_instance(
                         self.platform,
@@ -413,14 +416,26 @@ class RedshiftSqlLineageV2(Closeable):
                         platform_instance=self.config.platform_instance,
                         env=self.config.env,
                     )
-
-
-
-
+                    if upstream_platform == self.platform:
+                        upstream_schema = schema.get_upstream_schema_name() or "public"
+                        upstream_dataset_name = (
+                            f"{schema.external_database}.{upstream_schema}.{table.name}"
+                        )
+                        upstream_platform_instance = self.config.platform_instance
+                    else:
+                        upstream_dataset_name = (
+                            f"{schema.external_database}.{table.name}"
+                        )
+                        upstream_platform_instance = (
                             self.config.platform_instance_map.get(upstream_platform)
                             if self.config.platform_instance_map
                             else None
-                    )
+                        )
+
+                    upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
+                        upstream_platform,
+                        upstream_dataset_name,
+                        platform_instance=upstream_platform_instance,
                         env=self.config.env,
                     )
 
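The second hunk encodes a naming rule for upstreams of external tables: when the external schema points back at another Redshift database (a datashare), the upstream keeps a three-part name with the schema defaulting to "public"; for other platforms such as Glue, the upstream is <external_database>.<table>. A small sketch of just that rule (simplified; the parameters stand in for the RedshiftSchema attributes used above):

from typing import Optional

def upstream_dataset_name(
    external_database: str,
    table_name: str,
    upstream_platform: str,
    upstream_schema: Optional[str] = None,
) -> str:
    # Mirrors the branch added to RedshiftSqlLineageV2 (illustrative only)
    if upstream_platform == "redshift":
        return f"{external_database}.{upstream_schema or 'public'}.{table_name}"
    return f"{external_database}.{table_name}"

assert upstream_dataset_name("shared_db", "orders", "redshift") == "shared_db.public.orders"
assert upstream_dataset_name("lake_db", "orders", "glue") == "lake_db.orders"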
datahub/ingestion/source/redshift/profile.py
@@ -48,7 +48,7 @@ class RedshiftProfiler(GenericProfiler):
             if not self.config.schema_pattern.allowed(schema):
                 continue
             for table in tables[db].get(schema, {}):
-                if table.
+                if table.is_external_table() or self.report.is_shared_database:
                     if not self.config.profiling.profile_external_tables:
                         # Case 1: If user did not tell us to profile external tables, simply log this.
                         self.report.profiling_skipped_other[schema] += 1
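This one-line change widens the profiling guard: tables in a shared (datashare consumer) database are now treated like external tables and skipped unless profiling of external tables is enabled. A tiny sketch of the resulting decision (simplified; not the RedshiftProfiler API):

def should_profile(is_external: bool, is_shared_database: bool, profile_external_tables: bool) -> bool:
    if is_external or is_shared_database:
        return profile_external_tables
    return True

assert should_profile(False, False, profile_external_tables=False) is True
assert should_profile(False, True, profile_external_tables=False) is False  # shared DB now skipped
assert should_profile(True, False, profile_external_tables=True) is True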
datahub/ingestion/source/redshift/query.py
@@ -31,40 +31,64 @@ class RedshiftCommonQuery:
         AND (datname <> ('template1')::name)
     """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # NOTE: although the schema owner id is available in tables, we do not use it,
+    # as getting the username from the id requires access to pg_catalog.pg_user_info,
+    # which is available only to superusers.
+    # NOTE: Need a union here instead of using svv_all_schemas, in order to get
+    # external-platform-related lineage
+    # NOTE: Using a database_name filter for svv_redshift_schemas, as otherwise
+    # schemas from other shared databases also show up.
+    @staticmethod
+    def list_schemas(database: str) -> str:
+        return f"""
+            SELECT
+                schema_name,
+                schema_type,
+                schema_option,
+                cast(null as varchar(256)) as external_platform,
+                cast(null as varchar(256)) as external_database
+            FROM svv_redshift_schemas
+            WHERE database_name = '{database}'
+            AND schema_name != 'pg_catalog' and schema_name != 'information_schema'
+            UNION ALL
+            SELECT
+                schemaname as schema_name,
+                'external' as schema_type,
+                esoptions as schema_option,
+                CASE s.eskind
+                    WHEN '1' THEN 'GLUE'
+                    WHEN '2' THEN 'HIVE'
+                    WHEN '3' THEN 'POSTGRES'
+                    WHEN '4' THEN 'REDSHIFT'
+                    ELSE 'OTHER'
+                END as external_platform,
+                databasename as external_database
             FROM SVV_EXTERNAL_SCHEMAS as s
-            -- inner join pg_catalog.pg_user_info as i on i.usesysid = s.esowner
             ORDER BY SCHEMA_NAME;
         """
 
+    @staticmethod
+    def get_database_details(database):
+        return f"""\
+            select
+                database_name,
+                database_type,
+                database_options
+            from svv_redshift_databases
+            where database_name='{database}';"""
+
+    # NOTE: although the table owner id is available in tables, we do not use it,
+    # as getting the username from the id requires access to pg_catalog.pg_user_info,
+    # which is available only to superusers.
+    # NOTE: Tables from a shared database are not available in pg_catalog.pg_class
     @staticmethod
     def list_tables(
+        database: str,
         skip_external_tables: bool = False,
+        is_shared_database: bool = False,
     ) -> str:
+        # NOTE: it looks like the description is available only in pg_description,
+        # so this remains the preferred way
         tables_query = """
 SELECT CASE c.relkind
        WHEN 'r' THEN 'TABLE'
@@ -83,8 +107,6 @@ SELECT schemaname as schema_name,
        WHEN 8 THEN 'ALL'
        END AS "diststyle",
        c.relowner AS "owner_id",
-       -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need
-       -- u.usename AS "owner_name",
        null as "owner_name",
        TRIM(TRAILING ';' FROM pg_catalog.pg_get_viewdef (c.oid,TRUE)) AS "view_definition",
        pg_catalog.array_to_string(c.relacl,'\n') AS "privileges",
@@ -98,12 +120,12 @@ SELECT schemaname as schema_name,
   LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
   LEFT JOIN pg_class_info as ci on c.oid = ci.reloid
   LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid
-  -- JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner
   WHERE c.relkind IN ('r','v','m','S','f')
   AND n.nspname !~ '^pg_'
   AND n.nspname != 'information_schema'
 """
-
+
+        external_tables_query = f"""
 SELECT 'EXTERNAL_TABLE' as tabletype,
        NULL AS "schema_oid",
        schemaname AS "schema",
@@ -122,16 +144,70 @@ SELECT schemaname as schema_name,
        serde_parameters,
        NULL as table_description
 FROM pg_catalog.svv_external_tables
+WHERE redshift_database_name='{database}'
+ORDER BY "schema",
+         "relname"
+"""
+        shared_database_tables_query = f"""
+SELECT table_type as tabletype,
+       NULL AS "schema_oid",
+       schema_name AS "schema",
+       NULL AS "rel_oid",
+       table_name AS "relname",
+       NULL as "creation_time",
+       NULL AS "diststyle",
+       table_owner AS "owner_id",
+       NULL AS "owner_name",
+       NULL AS "view_definition",
+       table_acl AS "privileges",
+       NULL as "location",
+       NULL as parameters,
+       NULL as input_format,
+       NULL As output_format,
+       NULL as serde_parameters,
+       NULL as table_description
+FROM svv_redshift_tables
+WHERE database_name='{database}'
 ORDER BY "schema",
          "relname"
 """
-        if
+        if is_shared_database:
+            return shared_database_tables_query
+        elif skip_external_tables:
             return tables_query
         else:
             return f"{tables_query} UNION {external_tables_query}"
 
-
-        list_columns
+    @staticmethod
+    def list_columns(
+        database_name: str, schema_name: str, is_shared_database: bool = False
+    ) -> str:
+        if is_shared_database:
+            return f"""
+            SELECT
+              schema_name as "schema",
+              table_name as "table_name",
+              column_name as "name",
+              encoding as "encode",
+              -- Spectrum represents data types differently.
+              -- Standardize, so we can infer types.
+              data_type AS "type",
+              distkey as "distkey",
+              sortkey as "sortkey",
+              (case when is_nullable = 'no' then TRUE else FALSE end) as "notnull",
+              null as "comment",
+              null as "adsrc",
+              ordinal_position as "attnum",
+              data_type AS "format_type",
+              column_default as "default",
+              null as "schema_oid",
+              null as "table_oid"
+            FROM SVV_REDSHIFT_COLUMNS
+            WHERE 1 and schema = '{schema_name}'
+            AND database_name = '{database_name}'
+            ORDER BY "schema", "table_name", "attnum"
+            """
+        return f"""
 SELECT
   n.nspname as "schema",
   c.relname as "table_name",
@@ -206,6 +282,7 @@ SELECT schemaname as schema_name,
             null as "table_oid"
             FROM SVV_EXTERNAL_COLUMNS
             WHERE 1 and schema = '{schema_name}'
+            AND redshift_database_name = '{database_name}'
             ORDER BY "schema", "table_name", "attnum"
             """
 
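Based on the signatures shown in these hunks, the query builders are pure string constructors, so the new routing flags can be exercised without a Redshift connection; a short usage sketch:

from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

# Shared (datashare consumer) databases read from svv_redshift_tables...
sql = RedshiftCommonQuery.list_tables(database="consumer_db", is_shared_database=True)
assert "svv_redshift_tables" in sql

# ...while regular databases union pg_catalog tables with svv_external_tables.
sql = RedshiftCommonQuery.list_tables(database="dev", skip_external_tables=False)
assert "svv_external_tables" in sql

# Column listing is likewise routed by the is_shared_database flag.
sql = RedshiftCommonQuery.list_columns("dev", "analytics", is_shared_database=True)
assert "SVV_REDSHIFT_COLUMNS" in sql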
datahub/ingestion/source/redshift/query.py (continued)
@@ -362,6 +439,29 @@ ORDER BY target_schema, target_table, filename
     ) -> str:
         raise NotImplementedError
 
+    @staticmethod
+    def list_outbound_datashares() -> str:
+        return """SELECT \
+            share_type, \
+            share_name, \
+            trim(producer_namespace) as producer_namespace, \
+            source_database \
+        FROM svv_datashares
+        WHERE share_type='OUTBOUND'\
+        """
+
+    @staticmethod
+    def get_inbound_datashare(database: str) -> str:
+        return f"""SELECT \
+            share_type, \
+            share_name, \
+            trim(producer_namespace) as producer_namespace, \
+            consumer_database \
+        FROM svv_datashares
+        WHERE share_type='INBOUND'
+        AND consumer_database= '{database}'\
+        """
+
 
 class RedshiftProvisionedQuery(RedshiftCommonQuery):
     @staticmethod
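These two queries return one row per datashare from svv_datashares. A hedged sketch of how they might be consumed to drive the RedshiftDatasharesHelper flow shown earlier (the cursor and row handling are illustrative; the actual wiring lives in redshift_schema.py and redshift.py):

from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

def fetch_outbound_shares(cursor):
    # cursor: any DB-API cursor connected to the producer Redshift cluster
    cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
    for _share_type, share_name, producer_namespace, source_database in cursor.fetchall():
        # Each row carries exactly the fields OutboundSharePlatformResource stores
        yield {
            "share_name": share_name,
            "producer_namespace": producer_namespace,
            "source_database": source_database,
        }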
|