acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (204) hide show
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,236 @@
1
+ from typing import Dict, Iterable, List, Optional, Union
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from datahub.api.entities.platformresource.platform_resource import (
6
+ ElasticPlatformResourceQuery,
7
+ PlatformResource,
8
+ PlatformResourceKey,
9
+ PlatformResourceSearchFields,
10
+ )
11
+ from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
12
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
13
+ from datahub.ingestion.graph.client import DataHubGraph
14
+ from datahub.ingestion.source.redshift.config import RedshiftConfig
15
+ from datahub.ingestion.source.redshift.redshift_schema import (
16
+ InboundDatashare,
17
+ OutboundDatashare,
18
+ PartialInboundDatashare,
19
+ RedshiftTable,
20
+ RedshiftView,
21
+ )
22
+ from datahub.ingestion.source.redshift.report import RedshiftReport
23
+ from datahub.sql_parsing.sql_parsing_aggregator import KnownLineageMapping
24
+ from datahub.utilities.search_utils import LogicalOperator
25
+
26
+
27
class OutboundSharePlatformResource(BaseModel):
    """Value payload stored on a PlatformResource entity for a Redshift
    outbound datashare.

    Records the producer namespace together with the platform_instance and
    env it was ingested under, plus the share's source database, so that a
    consuming namespace can later resolve upstream dataset URNs.
    """

    namespace: str
    platform_instance: Optional[str]
    env: str
    source_database: str
    share_name: str

    def get_key(self) -> str:
        """Primary key for the platform resource: ``<namespace>.<share_name>``."""
        return ".".join((self.namespace, self.share_name))
36
+
37
+
38
# resource_type tag used on PlatformResource entities that describe Redshift
# outbound datashares; searched for again when resolving inbound shares.
PLATFORM_RESOURCE_TYPE = "OUTBOUND_DATASHARE"
39
+
40
+
41
class RedshiftDatasharesHelper:
    """
    Redshift datashares lineage generation relies on PlatformResource entities
    to identify the producer namespace and its platform_instance and env.

    Ingestion of any database in a namespace will
    A. generate a PlatformResource entity for all outbound shares in the namespace.
    B. generate lineage with upstream tables from another namespace, if the database
       is created from an inbound share.
    """

    def __init__(
        self,
        config: RedshiftConfig,
        report: RedshiftReport,
        graph: Optional[DataHubGraph],
    ):
        self.platform = "redshift"
        self.config = config
        self.report = report
        self.graph = graph

    def to_platform_resource(
        self, shares: List[OutboundDatashare]
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """Emit PlatformResource MCPs for every outbound datashare.

        Each share becomes one PlatformResource keyed by
        ``<producer_namespace>.<share_name>``. A failure for one share is
        reported as a warning and does not abort the remaining shares.
        """
        if not shares:
            self.report.outbound_shares_count = 0
            return

        self.report.outbound_shares_count = len(shares)

        # Producer namespace will be the current namespace for all
        # outbound data shares.
        for share in shares:
            try:
                platform_resource_key = PlatformResourceKey(
                    platform=self.platform,
                    platform_instance=self.config.platform_instance,
                    resource_type=PLATFORM_RESOURCE_TYPE,
                    primary_key=share.get_key(),
                )

                value = OutboundSharePlatformResource(
                    namespace=share.producer_namespace,
                    platform_instance=self.config.platform_instance,
                    env=self.config.env,
                    source_database=share.source_database,
                    share_name=share.share_name,
                )

                platform_resource = PlatformResource.create(
                    key=platform_resource_key,
                    value=value,
                    # Secondary keys allow lookup even when only partial
                    # inbound-share information is available to the consumer.
                    secondary_keys=[share.share_name, share.producer_namespace],
                )

                yield from platform_resource.to_mcps()

            except Exception as exc:
                self.report.warning(
                    title="Downstream lineage to outbound datashare may not work",
                    message="Failed to generate platform resource for outbound datashares",
                    context=f"Namespace {share.producer_namespace} Share {share.share_name}",
                    exc=exc,
                )

    def generate_lineage(
        self,
        share: Union[InboundDatashare, PartialInboundDatashare],
        tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]],
    ) -> Iterable[KnownLineageMapping]:
        """Yield table-level lineage mappings from the producer database of
        *share* to the local consumer database, for every table in *tables*
        (a mapping of schema name -> tables/views in that schema).

        Yields nothing when the producer's platform resource cannot be found.
        """
        upstream_share = self.find_upstream_share(share)

        if not upstream_share:
            return

        for schema, schema_tables in tables.items():
            for table in schema_tables:
                dataset_urn = self.gen_dataset_urn(
                    f"{share.consumer_database}.{schema}.{table.name}",
                    self.config.platform_instance,
                    self.config.env,
                )

                # Producer-side table lives under the same schema/table name
                # but in the share's source database and the producer's
                # platform_instance/env.
                upstream_dataset_urn = self.gen_dataset_urn(
                    f"{upstream_share.source_database}.{schema}.{table.name}",
                    upstream_share.platform_instance,
                    upstream_share.env,
                )

                yield KnownLineageMapping(
                    upstream_urn=upstream_dataset_urn, downstream_urn=dataset_urn
                )

    def find_upstream_share(
        self, share: Union[InboundDatashare, PartialInboundDatashare]
    ) -> Optional[OutboundSharePlatformResource]:
        """Resolve the producer-side platform resource for an inbound share.

        Returns None (after reporting) when no graph is configured, when no
        matching platform resource exists, or when parsing fails.
        """
        if not self.graph:
            self.report.warning(
                title="Upstream lineage of inbound datashare will be missing",
                message="Missing datahub graph. Either use the datahub-rest sink or "
                "set the top-level datahub_api config in the recipe",
            )
            return None

        resources = self.get_platform_resources(self.graph, share)

        # Only resources of the outbound-datashare type are relevant; other
        # resource types may match the search filters but must not be parsed.
        # (Previously every returned resource was parsed, emitting spurious
        # warnings for unrelated resource types.)
        candidates = [
            resource
            for resource in resources
            if resource.resource_info is not None
            and resource.resource_info.resource_type == PLATFORM_RESOURCE_TYPE
        ]

        if not candidates:
            self.report.info(
                title="Upstream lineage of inbound datashare will be missing",
                message="Missing platform resource for share. "
                "Setup redshift ingestion for namespace if not already done. If ingestion is setup, "
                "check whether ingestion user has ALTER/SHARE permission to share.",
                context=share.get_description(),
            )
            return None

        # Ideally we should get only one resource as primary key is namespace+share
        # and type is "OUTBOUND_DATASHARE"
        for resource in candidates:
            try:
                # Explicit check instead of `assert`: asserts are stripped
                # under `python -O` and this is a runtime validation.
                if resource.resource_info is None or resource.resource_info.value is None:
                    raise ValueError("Platform resource is missing resource_info value")
                return resource.resource_info.value.as_pydantic_object(
                    OutboundSharePlatformResource, True
                )
            except Exception as e:
                self.report.warning(
                    title="Upstream lineage of inbound datashare will be missing",
                    message="Failed to parse platform resource for outbound datashare",
                    context=share.get_description(),
                    exc=e,
                )

        return None

    def get_platform_resources(
        self,
        graph: DataHubGraph,
        share: Union[InboundDatashare, PartialInboundDatashare],
    ) -> List[PlatformResource]:
        """Search DataHub for platform resources describing *share*'s producer.

        A full InboundDatashare is looked up by exact primary key; a partial
        one falls back to a filtered search on secondary keys.
        """
        # NOTE: ideally we receive InboundDatashare and not PartialInboundDatashare.
        # However due to the varchar(128) type of the database table that captures
        # datashare options we may receive only partial information about an
        # inbound share. The alternate option to get InboundDatashare using
        # svv_datashares requires superuser.
        if isinstance(share, PartialInboundDatashare):
            return list(
                PlatformResource.search_by_filters(
                    graph,
                    ElasticPlatformResourceQuery.create_from()
                    .group(LogicalOperator.AND)
                    .add_field_match(
                        PlatformResourceSearchFields.RESOURCE_TYPE,
                        PLATFORM_RESOURCE_TYPE,
                    )
                    .add_field_match(
                        PlatformResourceSearchFields.PLATFORM, self.platform
                    )
                    .add_field_match(
                        PlatformResourceSearchFields.SECONDARY_KEYS,
                        share.share_name,
                    )
                    # Namespace is only known as a prefix, hence the wildcard.
                    .add_wildcard(
                        PlatformResourceSearchFields.SECONDARY_KEYS.field_name,
                        f"{share.producer_namespace_prefix}*",
                    )
                    .end(),
                )
            )
        return list(
            PlatformResource.search_by_key(
                graph, key=share.get_key(), primary=True, is_exact=True
            )
        )

    # TODO: Refactor and move to new RedshiftIdentifierBuilder class
    def gen_dataset_urn(
        self, datahub_dataset_name: str, platform_instance: Optional[str], env: str
    ) -> str:
        """Build a Redshift dataset URN for *datahub_dataset_name*."""
        return make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=datahub_dataset_name,
            platform_instance=platform_instance,
            env=env,
        )
@@ -813,9 +813,13 @@ class RedshiftLineageExtractor:
813
813
  )
814
814
 
815
815
  tablename = table.name
816
- if table.type == "EXTERNAL_TABLE":
816
+ if (
817
+ table.is_external_table()
818
+ and schema.is_external_schema()
819
+ and schema.external_platform
820
+ ):
817
821
  # external_db_params = schema.option
818
- upstream_platform = schema.type.lower()
822
+ upstream_platform = schema.external_platform.lower()
819
823
  catalog_upstream = UpstreamClass(
820
824
  mce_builder.make_dataset_urn_with_platform_instance(
821
825
  upstream_platform,
@@ -401,11 +401,14 @@ class RedshiftSqlLineageV2(Closeable):
401
401
  ) -> None:
402
402
  for schema_name, tables in all_tables[self.database].items():
403
403
  for table in tables:
404
- if table.type == "EXTERNAL_TABLE":
405
- schema = db_schemas[self.database][schema_name]
406
-
404
+ schema = db_schemas[self.database][schema_name]
405
+ if (
406
+ table.is_external_table()
407
+ and schema.is_external_schema()
408
+ and schema.external_platform
409
+ ):
407
410
  # external_db_params = schema.option
408
- upstream_platform = schema.type.lower()
411
+ upstream_platform = schema.external_platform.lower()
409
412
 
410
413
  table_urn = mce_builder.make_dataset_urn_with_platform_instance(
411
414
  self.platform,
@@ -413,14 +416,26 @@ class RedshiftSqlLineageV2(Closeable):
413
416
  platform_instance=self.config.platform_instance,
414
417
  env=self.config.env,
415
418
  )
416
- upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
417
- upstream_platform,
418
- f"{schema.external_database}.{table.name}",
419
- platform_instance=(
419
+ if upstream_platform == self.platform:
420
+ upstream_schema = schema.get_upstream_schema_name() or "public"
421
+ upstream_dataset_name = (
422
+ f"{schema.external_database}.{upstream_schema}.{table.name}"
423
+ )
424
+ upstream_platform_instance = self.config.platform_instance
425
+ else:
426
+ upstream_dataset_name = (
427
+ f"{schema.external_database}.{table.name}"
428
+ )
429
+ upstream_platform_instance = (
420
430
  self.config.platform_instance_map.get(upstream_platform)
421
431
  if self.config.platform_instance_map
422
432
  else None
423
- ),
433
+ )
434
+
435
+ upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
436
+ upstream_platform,
437
+ upstream_dataset_name,
438
+ platform_instance=upstream_platform_instance,
424
439
  env=self.config.env,
425
440
  )
426
441
 
@@ -48,7 +48,7 @@ class RedshiftProfiler(GenericProfiler):
48
48
  if not self.config.schema_pattern.allowed(schema):
49
49
  continue
50
50
  for table in tables[db].get(schema, {}):
51
- if table.type == "EXTERNAL_TABLE":
51
+ if table.is_external_table() or self.report.is_shared_database:
52
52
  if not self.config.profiling.profile_external_tables:
53
53
  # Case 1: If user did not tell us to profile external tables, simply log this.
54
54
  self.report.profiling_skipped_other[schema] += 1
@@ -31,40 +31,64 @@ class RedshiftCommonQuery:
31
31
  AND (datname <> ('template1')::name)
32
32
  """
33
33
 
34
- list_schemas: str = """SELECT distinct n.nspname AS "schema_name",
35
- 'local' as schema_type,
36
- null as schema_owner_name,
37
- '' as schema_option,
38
- null as external_database
39
- FROM pg_catalog.pg_class c
40
- LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
41
- JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner
42
- WHERE c.relkind IN ('r','v','m','S','f')
43
- AND n.nspname !~ '^pg_'
44
- AND n.nspname != 'information_schema'
45
- UNION ALL
46
- SELECT schemaname as schema_name,
47
- CASE s.eskind
48
- WHEN '1' THEN 'GLUE'
49
- WHEN '2' THEN 'HIVE'
50
- WHEN '3' THEN 'POSTGRES'
51
- WHEN '4' THEN 'REDSHIFT'
52
- ELSE 'OTHER'
53
- END as schema_type,
54
- -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need
55
- -- usename as schema_owner_name,
56
- null as schema_owner_name,
57
- esoptions as schema_option,
58
- databasename as external_database
34
    # NOTE: although schema owner id is available in tables, we do not use it
    # as getting username from id requires access to pg_catalog.pg_user_info
    # which is available only to superusers.
    # NOTE: Need union here instead of using svv_all_schemas, in order to get
    # external platform related lineage
    # NOTE: Using database_name filter for svv_redshift_schemas, as otherwise
    # schemas from other shared databases also show up.
    @staticmethod
    def list_schemas(database: str) -> str:
        """Return SQL listing local schemas of *database* (from
        svv_redshift_schemas) unioned with external schemas (from
        SVV_EXTERNAL_SCHEMAS), including the external platform/database
        needed for lineage.

        NOTE(review): *database* is interpolated directly into the SQL —
        presumably a configured database name, not untrusted input; verify
        against callers.
        """
        return f"""
            SELECT
                schema_name,
                schema_type,
                schema_option,
                cast(null as varchar(256)) as external_platform,
                cast(null as varchar(256)) as external_database
            FROM svv_redshift_schemas
            WHERE database_name = '{database}'
            AND schema_name != 'pg_catalog' and schema_name != 'information_schema'
            UNION ALL
            SELECT
                schemaname as schema_name,
                'external' as schema_type,
                esoptions as schema_option,
                CASE s.eskind
                    WHEN '1' THEN 'GLUE'
                    WHEN '2' THEN 'HIVE'
                    WHEN '3' THEN 'POSTGRES'
                    WHEN '4' THEN 'REDSHIFT'
                    ELSE 'OTHER'
                END as external_platform,
                databasename as external_database
            FROM SVV_EXTERNAL_SCHEMAS as s
            ORDER BY SCHEMA_NAME;
        """
63
69
 
70
+ @staticmethod
71
+ def get_database_details(database):
72
+ return f"""\
73
+ select
74
+ database_name,
75
+ database_type,
76
+ database_options
77
+ from svv_redshift_databases
78
+ where database_name='{database}';"""
79
+
80
+ # NOTE: although table owner id is available in tables, we do not use it
81
+ # as getting username from id requires access to pg_catalog.pg_user_info
82
+ # which is available only to superusers.
83
+ # NOTE: Tables from shared database are not available in pg_catalog.pg_class
64
84
  @staticmethod
65
85
  def list_tables(
86
+ database: str,
66
87
  skip_external_tables: bool = False,
88
+ is_shared_database: bool = False,
67
89
  ) -> str:
90
+ # NOTE: it looks like description is available only in pg_description
91
+ # So this remains preferrred way
68
92
  tables_query = """
69
93
  SELECT CASE c.relkind
70
94
  WHEN 'r' THEN 'TABLE'
@@ -83,8 +107,6 @@ SELECT schemaname as schema_name,
83
107
  WHEN 8 THEN 'ALL'
84
108
  END AS "diststyle",
85
109
  c.relowner AS "owner_id",
86
- -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need
87
- -- u.usename AS "owner_name",
88
110
  null as "owner_name",
89
111
  TRIM(TRAILING ';' FROM pg_catalog.pg_get_viewdef (c.oid,TRUE)) AS "view_definition",
90
112
  pg_catalog.array_to_string(c.relacl,'\n') AS "privileges",
@@ -98,12 +120,12 @@ SELECT schemaname as schema_name,
98
120
  LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
99
121
  LEFT JOIN pg_class_info as ci on c.oid = ci.reloid
100
122
  LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid
101
- -- JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner
102
123
  WHERE c.relkind IN ('r','v','m','S','f')
103
124
  AND n.nspname !~ '^pg_'
104
125
  AND n.nspname != 'information_schema'
105
126
  """
106
- external_tables_query = """
127
+
128
+ external_tables_query = f"""
107
129
  SELECT 'EXTERNAL_TABLE' as tabletype,
108
130
  NULL AS "schema_oid",
109
131
  schemaname AS "schema",
@@ -122,16 +144,70 @@ SELECT schemaname as schema_name,
122
144
  serde_parameters,
123
145
  NULL as table_description
124
146
  FROM pg_catalog.svv_external_tables
147
+ WHERE redshift_database_name='{database}'
148
+ ORDER BY "schema",
149
+ "relname"
150
+ """
151
+ shared_database_tables_query = f"""
152
+ SELECT table_type as tabletype,
153
+ NULL AS "schema_oid",
154
+ schema_name AS "schema",
155
+ NULL AS "rel_oid",
156
+ table_name AS "relname",
157
+ NULL as "creation_time",
158
+ NULL AS "diststyle",
159
+ table_owner AS "owner_id",
160
+ NULL AS "owner_name",
161
+ NULL AS "view_definition",
162
+ table_acl AS "privileges",
163
+ NULL as "location",
164
+ NULL as parameters,
165
+ NULL as input_format,
166
+ NULL As output_format,
167
+ NULL as serde_parameters,
168
+ NULL as table_description
169
+ FROM svv_redshift_tables
170
+ WHERE database_name='{database}'
125
171
  ORDER BY "schema",
126
172
  "relname"
127
173
  """
128
- if skip_external_tables:
174
+ if is_shared_database:
175
+ return shared_database_tables_query
176
+ elif skip_external_tables:
129
177
  return tables_query
130
178
  else:
131
179
  return f"{tables_query} UNION {external_tables_query}"
132
180
 
133
- # Why is this unused. Is this a bug?
134
- list_columns: str = """
181
+ @staticmethod
182
+ def list_columns(
183
+ database_name: str, schema_name: str, is_shared_database: bool = False
184
+ ) -> str:
185
+ if is_shared_database:
186
+ return f"""
187
+ SELECT
188
+ schema_name as "schema",
189
+ table_name as "table_name",
190
+ column_name as "name",
191
+ encoding as "encode",
192
+ -- Spectrum represents data types differently.
193
+ -- Standardize, so we can infer types.
194
+ data_type AS "type",
195
+ distkey as "distkey",
196
+ sortkey as "sortkey",
197
+ (case when is_nullable = 'no' then TRUE else FALSE end) as "notnull",
198
+ null as "comment",
199
+ null as "adsrc",
200
+ ordinal_position as "attnum",
201
+ data_type AS "format_type",
202
+ column_default as "default",
203
+ null as "schema_oid",
204
+ null as "table_oid"
205
+ FROM SVV_REDSHIFT_COLUMNS
206
+ WHERE 1 and schema = '{schema_name}'
207
+ AND database_name = '{database_name}'
208
+ ORDER BY "schema", "table_name", "attnum"
209
+ """
210
+ return f"""
135
211
  SELECT
136
212
  n.nspname as "schema",
137
213
  c.relname as "table_name",
@@ -206,6 +282,7 @@ SELECT schemaname as schema_name,
206
282
  null as "table_oid"
207
283
  FROM SVV_EXTERNAL_COLUMNS
208
284
  WHERE 1 and schema = '{schema_name}'
285
+ AND redshift_database_name = '{database_name}'
209
286
  ORDER BY "schema", "table_name", "attnum"
210
287
  """
211
288
 
@@ -362,6 +439,29 @@ ORDER BY target_schema, target_table, filename
362
439
  ) -> str:
363
440
  raise NotImplementedError
364
441
 
442
+ @staticmethod
443
+ def list_outbound_datashares() -> str:
444
+ return """SELECT \
445
+ share_type, \
446
+ share_name, \
447
+ trim(producer_namespace) as producer_namespace, \
448
+ source_database \
449
+ FROM svv_datashares
450
+ WHERE share_type='OUTBOUND'\
451
+ """
452
+
453
+ @staticmethod
454
+ def get_inbound_datashare(database: str) -> str:
455
+ return f"""SELECT \
456
+ share_type, \
457
+ share_name, \
458
+ trim(producer_namespace) as producer_namespace, \
459
+ consumer_database \
460
+ FROM svv_datashares
461
+ WHERE share_type='INBOUND'
462
+ AND consumer_database= '{database}'\
463
+ """
464
+
365
465
 
366
466
  class RedshiftProvisionedQuery(RedshiftCommonQuery):
367
467
  @staticmethod