acryl-datahub 1.0.0rc10__py3-none-any.whl → 1.0.0rc12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)
  1. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc12.dist-info}/METADATA +2513 -2513
  2. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc12.dist-info}/RECORD +29 -28
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +4 -3
  5. datahub/cli/check_cli.py +72 -19
  6. datahub/ingestion/source/iceberg/iceberg_common.py +40 -1
  7. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  8. datahub/ingestion/source/redshift/config.py +4 -0
  9. datahub/ingestion/source/redshift/datashares.py +236 -0
  10. datahub/ingestion/source/redshift/lineage.py +6 -2
  11. datahub/ingestion/source/redshift/lineage_v2.py +7 -4
  12. datahub/ingestion/source/redshift/profile.py +1 -1
  13. datahub/ingestion/source/redshift/query.py +125 -33
  14. datahub/ingestion/source/redshift/redshift.py +41 -72
  15. datahub/ingestion/source/redshift/redshift_schema.py +166 -6
  16. datahub/ingestion/source/redshift/report.py +3 -0
  17. datahub/ingestion/source/sql/oracle.py +93 -63
  18. datahub/metadata/_schema_classes.py +5 -5
  19. datahub/metadata/schema.avsc +2 -1
  20. datahub/metadata/schemas/DomainKey.avsc +2 -1
  21. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  22. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  23. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  24. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  25. datahub/sql_parsing/sql_parsing_common.py +7 -0
  26. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc12.dist-info}/LICENSE +0 -0
  27. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc12.dist-info}/WHEEL +0 -0
  28. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc12.dist-info}/entry_points.txt +0 -0
  29. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc12.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/datashares.py (new file)
@@ -0,0 +1,236 @@
+ from typing import Dict, Iterable, List, Optional, Union
+
+ from pydantic import BaseModel
+
+ from datahub.api.entities.platformresource.platform_resource import (
+     ElasticPlatformResourceQuery,
+     PlatformResource,
+     PlatformResourceKey,
+     PlatformResourceSearchFields,
+ )
+ from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.graph.client import DataHubGraph
+ from datahub.ingestion.source.redshift.config import RedshiftConfig
+ from datahub.ingestion.source.redshift.redshift_schema import (
+     InboundDatashare,
+     OutboundDatashare,
+     PartialInboundDatashare,
+     RedshiftTable,
+     RedshiftView,
+ )
+ from datahub.ingestion.source.redshift.report import RedshiftReport
+ from datahub.sql_parsing.sql_parsing_aggregator import KnownLineageMapping
+ from datahub.utilities.search_utils import LogicalOperator
+
+
+ class OutboundSharePlatformResource(BaseModel):
+     namespace: str
+     platform_instance: Optional[str]
+     env: str
+     source_database: str
+     share_name: str
+
+     def get_key(self) -> str:
+         return f"{self.namespace}.{self.share_name}"
+
+
+ PLATFORM_RESOURCE_TYPE = "OUTBOUND_DATASHARE"
+
+
+ class RedshiftDatasharesHelper:
+     """
+     Redshift datashares lineage generation relies on PlatformResource entity
+     to identify the producer namespace and its platform_instance and env
+
+     Ingestion of any database in namespace will
+     A. generate PlatformResource entity for all outbound shares in namespace.
+     B. generate lineage with upstream tables from another namespace, if the database
+        is created from an inbound share
+
+     """
+
+     def __init__(
+         self,
+         config: RedshiftConfig,
+         report: RedshiftReport,
+         graph: Optional[DataHubGraph],
+     ):
+         self.platform = "redshift"
+         self.config = config
+         self.report = report
+         self.graph = graph
+
+     def to_platform_resource(
+         self, shares: List[OutboundDatashare]
+     ) -> Iterable[MetadataChangeProposalWrapper]:
+         if not shares:
+             self.report.outbound_shares_count = 0
+             return
+
+         self.report.outbound_shares_count = len(shares)
+         # Producer namespace will be current namespace for all
+         # outbound data shares
+
+         for share in shares:
+             producer_namespace = share.producer_namespace
+             try:
+                 platform_resource_key = PlatformResourceKey(
+                     platform=self.platform,
+                     platform_instance=self.config.platform_instance,
+                     resource_type=PLATFORM_RESOURCE_TYPE,
+                     primary_key=share.get_key(),
+                 )
+
+                 value = OutboundSharePlatformResource(
+                     namespace=producer_namespace,
+                     platform_instance=self.config.platform_instance,
+                     env=self.config.env,
+                     source_database=share.source_database,
+                     share_name=share.share_name,
+                 )
+
+                 platform_resource = PlatformResource.create(
+                     key=platform_resource_key,
+                     value=value,
+                     secondary_keys=[share.share_name, share.producer_namespace],
+                 )
+
+                 yield from platform_resource.to_mcps()
+
+             except Exception as exc:
+                 self.report.warning(
+                     title="Downstream lineage to outbound datashare may not work",
+                     message="Failed to generate platform resource for outbound datashares",
+                     context=f"Namespace {share.producer_namespace} Share {share.share_name}",
+                     exc=exc,
+                 )
+
+     def generate_lineage(
+         self,
+         share: Union[InboundDatashare, PartialInboundDatashare],
+         tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]],
+     ) -> Iterable[KnownLineageMapping]:
+         upstream_share = self.find_upstream_share(share)
+
+         if not upstream_share:
+             return
+
+         for schema in tables:
+             for table in tables[schema]:
+                 dataset_urn = self.gen_dataset_urn(
+                     f"{share.consumer_database}.{schema}.{table.name}",
+                     self.config.platform_instance,
+                     self.config.env,
+                 )
+
+                 upstream_dataset_urn = self.gen_dataset_urn(
+                     f"{upstream_share.source_database}.{schema}.{table.name}",
+                     upstream_share.platform_instance,
+                     upstream_share.env,
+                 )
+
+                 yield KnownLineageMapping(
+                     upstream_urn=upstream_dataset_urn, downstream_urn=dataset_urn
+                 )
+
+     def find_upstream_share(
+         self, share: Union[InboundDatashare, PartialInboundDatashare]
+     ) -> Optional[OutboundSharePlatformResource]:
+         if not self.graph:
+             self.report.warning(
+                 title="Upstream lineage of inbound datashare will be missing",
+                 message="Missing datahub graph. Either use the datahub-rest sink or "
+                 "set the top-level datahub_api config in the recipe",
+             )
+         else:
+             resources = self.get_platform_resources(self.graph, share)
+
+             if len(resources) == 0 or (
+                 not any(
+                     [
+                         resource.resource_info is not None
+                         and resource.resource_info.resource_type
+                         == PLATFORM_RESOURCE_TYPE
+                         for resource in resources
+                     ]
+                 )
+             ):
+                 self.report.info(
+                     title="Upstream lineage of inbound datashare will be missing",
+                     message="Missing platform resource for share. "
+                     "Setup redshift ingestion for namespace if not already done. If ingestion is setup, "
+                     "check whether ingestion user has ALTER/SHARE permission to share.",
+                     context=share.get_description(),
+                 )
+             else:
+                 # Ideally we should get only one resource as primary key is namespace+share
+                 # and type is "OUTBOUND_DATASHARE"
+                 for resource in resources:
+                     try:
+                         assert (
+                             resource.resource_info is not None
+                             and resource.resource_info.value is not None
+                         )
+                         return resource.resource_info.value.as_pydantic_object(
+                             OutboundSharePlatformResource, True
+                         )
+                     except Exception as e:
+                         self.report.warning(
+                             title="Upstream lineage of inbound datashare will be missing",
+                             message="Failed to parse platform resource for outbound datashare",
+                             context=share.get_description(),
+                             exc=e,
+                         )
+
+         return None
+
+     def get_platform_resources(
+         self,
+         graph: DataHubGraph,
+         share: Union[InboundDatashare, PartialInboundDatashare],
+     ) -> List[PlatformResource]:
+         # NOTE: ideally we receive InboundDatashare and not PartialInboundDatashare.
+         # however due to varchar(128) type of database table that captures datashare options
+         # we may receive only partial information about inbound share
+         # Alternate option to get InboundDatashare using svv_datashares requires superuser
+         if isinstance(share, PartialInboundDatashare):
+             return list(
+                 PlatformResource.search_by_filters(
+                     graph,
+                     ElasticPlatformResourceQuery.create_from()
+                     .group(LogicalOperator.AND)
+                     .add_field_match(
+                         PlatformResourceSearchFields.RESOURCE_TYPE,
+                         PLATFORM_RESOURCE_TYPE,
+                     )
+                     .add_field_match(
+                         PlatformResourceSearchFields.PLATFORM, self.platform
+                     )
+                     .add_field_match(
+                         PlatformResourceSearchFields.SECONDARY_KEYS,
+                         share.share_name,
+                     )
+                     .add_wildcard(
+                         PlatformResourceSearchFields.SECONDARY_KEYS.field_name,
+                         f"{share.producer_namespace_prefix}*",
+                     )
+                     .end(),
+                 )
+             )
+         return list(
+             PlatformResource.search_by_key(
+                 graph, key=share.get_key(), primary=True, is_exact=True
+             )
+         )
+
+     # TODO: Refactor and move to new RedshiftIdentifierBuilder class
+     def gen_dataset_urn(
+         self, datahub_dataset_name: str, platform_instance: Optional[str], env: str
+     ) -> str:
+         return make_dataset_urn_with_platform_instance(
+             platform=self.platform,
+             name=datahub_dataset_name,
+             platform_instance=platform_instance,
+             env=env,
+         )
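
The producer and consumer halves of this helper meet through the PlatformResource keyed by `<producer_namespace>.<share_name>`. Below is a minimal sketch (not part of the diff) of how the two halves are wired together, assuming `outbound_shares`, `inbound_share`, and `tables_by_schema` have already been fetched from the cluster; the `RedshiftConfig` arguments are illustrative only:

```python
from datahub.ingestion.source.redshift.config import RedshiftConfig
from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
from datahub.ingestion.source.redshift.report import RedshiftReport

config = RedshiftConfig(host_port="my-cluster:5439", database="consumer_db")
helper = RedshiftDatasharesHelper(
    config=config,
    report=RedshiftReport(),
    graph=None,  # without a DataHubGraph, find_upstream_share only logs a warning
)

# Producer side: every outbound share becomes a PlatformResource MCP, so that
# consumers in other namespaces can later resolve the upstream database.
for mcp in helper.to_platform_resource(list(outbound_shares)):
    ...  # hand MCPs to an emitter/sink

# Consumer side: each table in the shared database yields one lineage edge
# from <source_database>.<schema>.<table> to <consumer_database>.<schema>.<table>.
for mapping in helper.generate_lineage(inbound_share, tables_by_schema):
    print(mapping.upstream_urn, "->", mapping.downstream_urn)
```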
datahub/ingestion/source/redshift/lineage.py
@@ -813,9 +813,13 @@ class RedshiftLineageExtractor:
          )

          tablename = table.name
-         if table.type == "EXTERNAL_TABLE":
+         if (
+             table.is_external_table
+             and schema.is_external_schema
+             and schema.external_platform
+         ):
              # external_db_params = schema.option
-             upstream_platform = schema.type.lower()
+             upstream_platform = schema.external_platform.lower()
              catalog_upstream = UpstreamClass(
                  mce_builder.make_dataset_urn_with_platform_instance(
                      upstream_platform,
datahub/ingestion/source/redshift/lineage_v2.py
@@ -401,11 +401,14 @@ class RedshiftSqlLineageV2(Closeable):
      ) -> None:
          for schema_name, tables in all_tables[self.database].items():
              for table in tables:
-                 if table.type == "EXTERNAL_TABLE":
-                     schema = db_schemas[self.database][schema_name]
-
+                 schema = db_schemas[self.database][schema_name]
+                 if (
+                     table.is_external_table
+                     and schema.is_external_schema
+                     and schema.external_platform
+                 ):
                      # external_db_params = schema.option
-                     upstream_platform = schema.type.lower()
+                     upstream_platform = schema.external_platform.lower()

                      table_urn = mce_builder.make_dataset_urn_with_platform_instance(
                          self.platform,
datahub/ingestion/source/redshift/profile.py
@@ -48,7 +48,7 @@ class RedshiftProfiler(GenericProfiler):
                  if not self.config.schema_pattern.allowed(schema):
                      continue
                  for table in tables[db].get(schema, {}):
-                     if table.type == "EXTERNAL_TABLE":
+                     if table.is_external_table:
                          if not self.config.profiling.profile_external_tables:
                              # Case 1: If user did not tell us to profile external tables, simply log this.
                              self.report.profiling_skipped_other[schema] += 1
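
Across all three files above, inline `table.type == "EXTERNAL_TABLE"` string checks give way to properties on the schema/table models. The real definitions live in the `redshift_schema.py` portion of this diff (+166 lines, not shown here); the following is a hypothetical reconstruction inferred from the call sites, with the property bodies as assumptions:

```python
# Hypothetical reconstruction; the actual definitions are in the
# redshift_schema.py changes that this diff does not display.
from dataclasses import dataclass
from typing import Optional


@dataclass
class RedshiftTable:  # assumed shape, simplified
    name: str
    type: Optional[str] = None

    @property
    def is_external_table(self) -> bool:
        # Call sites previously compared table.type == "EXTERNAL_TABLE" inline.
        return self.type == "EXTERNAL_TABLE"


@dataclass
class RedshiftSchema:  # assumed shape, simplified
    name: str
    type: str
    external_platform: Optional[str] = None  # GLUE/HIVE/POSTGRES/REDSHIFT/OTHER

    @property
    def is_external_schema(self) -> bool:
        # The reworked list_schemas() query labels external schemas 'external'
        # and moves the platform name into a separate external_platform column.
        return self.type == "external"
```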
datahub/ingestion/source/redshift/query.py
@@ -31,40 +31,62 @@ class RedshiftCommonQuery:
      AND (datname <> ('template1')::name)
      """

-     list_schemas: str = """SELECT distinct n.nspname AS "schema_name",
-         'local' as schema_type,
-         null as schema_owner_name,
-         '' as schema_option,
-         null as external_database
-     FROM pg_catalog.pg_class c
-     LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
-     JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner
-     WHERE c.relkind IN ('r','v','m','S','f')
-     AND n.nspname !~ '^pg_'
-     AND n.nspname != 'information_schema'
-     UNION ALL
-     SELECT schemaname as schema_name,
-         CASE s.eskind
-             WHEN '1' THEN 'GLUE'
-             WHEN '2' THEN 'HIVE'
-             WHEN '3' THEN 'POSTGRES'
-             WHEN '4' THEN 'REDSHIFT'
-             ELSE 'OTHER'
-         END as schema_type,
-         -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need
-         -- usename as schema_owner_name,
-         null as schema_owner_name,
-         esoptions as schema_option,
-         databasename as external_database
+     # NOTE: although schema owner id is available in tables, we do not use it
+     # as getting username from id requires access to pg_catalog.pg_user_info
+     # which is available only to superusers.
+     # NOTE: Need union here instead of using svv_all_schemas, in order to get
+     # external platform related lineage
+     # NOTE: Using database_name filter for svv_redshift_schemas, as otherwise
+     # schemas from other shared databases also show up.
+     @staticmethod
+     def list_schemas(database: str) -> str:
+         return f"""
+             SELECT
+                 schema_name,
+                 schema_type,
+                 schema_option,
+                 cast(null as varchar(256)) as external_platform,
+                 cast(null as varchar(256)) as external_database
+             FROM svv_redshift_schemas
+             WHERE database_name = '{database}'
+                 AND schema_name != 'pg_catalog' and schema_name != 'information_schema'
+             UNION ALL
+             SELECT
+                 schemaname as schema_name,
+                 'external' as schema_type,
+                 esoptions as schema_option,
+                 CASE s.eskind
+                     WHEN '1' THEN 'GLUE'
+                     WHEN '2' THEN 'HIVE'
+                     WHEN '3' THEN 'POSTGRES'
+                     WHEN '4' THEN 'REDSHIFT'
+                     ELSE 'OTHER'
+                 END as external_platform,
+                 databasename as external_database
      FROM SVV_EXTERNAL_SCHEMAS as s
-     -- inner join pg_catalog.pg_user_info as i on i.usesysid = s.esowner
      ORDER BY SCHEMA_NAME;
      """

+     @staticmethod
+     def get_database_details(database):
+         return f"""\
+             select
+                 database_name,
+                 database_type,
+                 database_options
+             from svv_redshift_databases
+             where database_name='{database}';"""
+
+     # NOTE: although table owner id is available in tables, we do not use it
+     # as getting username from id requires access to pg_catalog.pg_user_info
+     # which is available only to superusers.
+     # NOTE: Tables from shared database are not available in pg_catalog.pg_class
      @staticmethod
      def list_tables(
-         skip_external_tables: bool = False,
+         skip_external_tables: bool = False, is_shared_database: bool = False
      ) -> str:
+         # NOTE: it looks like description is available only in pg_description
+         # So this remains preferrred way
          tables_query = """
  SELECT CASE c.relkind
             WHEN 'r' THEN 'TABLE'
@@ -83,8 +105,6 @@ SELECT schemaname as schema_name,
             WHEN 8 THEN 'ALL'
         END AS "diststyle",
         c.relowner AS "owner_id",
-        -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need
-        -- u.usename AS "owner_name",
         null as "owner_name",
         TRIM(TRAILING ';' FROM pg_catalog.pg_get_viewdef (c.oid,TRUE)) AS "view_definition",
         pg_catalog.array_to_string(c.relacl,'\n') AS "privileges",
@@ -98,11 +118,11 @@ SELECT schemaname as schema_name,
  LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
  LEFT JOIN pg_class_info as ci on c.oid = ci.reloid
  LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid
- -- JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner
  WHERE c.relkind IN ('r','v','m','S','f')
  AND n.nspname !~ '^pg_'
  AND n.nspname != 'information_schema'
  """
+
  external_tables_query = """
  SELECT 'EXTERNAL_TABLE' as tabletype,
         NULL AS "schema_oid",
@@ -125,13 +145,62 @@ SELECT schemaname as schema_name,
  ORDER BY "schema",
           "relname"
  """
-         if skip_external_tables:
+         shared_database_tables_query = """
+         SELECT table_type as tabletype,
+                NULL AS "schema_oid",
+                schema_name AS "schema",
+                NULL AS "rel_oid",
+                table_name AS "relname",
+                NULL as "creation_time",
+                NULL AS "diststyle",
+                table_owner AS "owner_id",
+                NULL AS "owner_name",
+                NULL AS "view_definition",
+                table_acl AS "privileges",
+                NULL as "location",
+                NULL as parameters,
+                NULL as input_format,
+                NULL As output_format,
+                NULL as serde_parameters,
+                NULL as table_description
+         FROM svv_redshift_tables
+         ORDER BY "schema",
+                  "relname"
+         """
+         if is_shared_database:
+             return shared_database_tables_query
+         elif skip_external_tables:
              return tables_query
          else:
              return f"{tables_query} UNION {external_tables_query}"

-     # Why is this unused. Is this a bug?
-     list_columns: str = """
+     @staticmethod
+     def list_columns(is_shared_database: bool = False) -> str:
+         if is_shared_database:
+             return """
+             SELECT
+                 schema_name as "schema",
+                 table_name as "table_name",
+                 column_name as "name",
+                 encoding as "encode",
+                 -- Spectrum represents data types differently.
+                 -- Standardize, so we can infer types.
+                 data_type AS "type",
+                 distkey as "distkey",
+                 sortkey as "sortkey",
+                 (case when is_nullable = 'no' then TRUE else FALSE end) as "notnull",
+                 null as "comment",
+                 null as "adsrc",
+                 ordinal_position as "attnum",
+                 data_type AS "format_type",
+                 column_default as "default",
+                 null as "schema_oid",
+                 null as "table_oid"
+             FROM SVV_REDSHIFT_COLUMNS
+             WHERE 1 and schema = '{schema_name}'
+             ORDER BY "schema", "table_name", "attnum"
+             """
+         return """
  SELECT
    n.nspname as "schema",
    c.relname as "table_name",
@@ -362,6 +431,29 @@ ORDER BY target_schema, target_table, filename
      ) -> str:
          raise NotImplementedError

+     @staticmethod
+     def list_outbound_datashares() -> str:
+         return """SELECT \
+             share_type, \
+             share_name, \
+             trim(producer_namespace) as producer_namespace, \
+             source_database \
+         FROM svv_datashares
+         WHERE share_type='OUTBOUND'\
+         """
+
+     @staticmethod
+     def get_inbound_datashare(database: str) -> str:
+         return f"""SELECT \
+             share_type, \
+             share_name, \
+             trim(producer_namespace) as producer_namespace, \
+             consumer_database \
+         FROM svv_datashares
+         WHERE share_type='INBOUND'
+             AND consumer_database= '{database}'\
+         """
+

  class RedshiftProvisionedQuery(RedshiftCommonQuery):
      @staticmethod
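
The new `is_shared_database` switches mean callers now choose a query variant per database type instead of using a single static string. Here is a rough sketch of that dispatch, assuming a standard DB-API `cursor`; the real plumbing lives in `RedshiftDataDictionary`, which this diff threads the flag through:

```python
from datahub.ingestion.source.redshift.query import RedshiftCommonQuery


def fetch_tables_and_columns(cursor, schema_name: str, is_shared: bool, skip_external: bool):
    # Shared (datashare consumer) databases are invisible in pg_catalog, so
    # list_tables() swaps in the svv_redshift_tables variant for them.
    cursor.execute(
        RedshiftCommonQuery.list_tables(
            skip_external_tables=skip_external, is_shared_database=is_shared
        )
    )
    tables = cursor.fetchall()

    # Both list_columns() variants are treated as templates with a {schema_name}
    # placeholder; it is visible in the shared-database query above, and assumed
    # for the pg_catalog query, whose tail is truncated in this diff.
    columns_sql = RedshiftCommonQuery.list_columns(
        is_shared_database=is_shared
    ).format(schema_name=schema_name)
    cursor.execute(columns_sql)
    return tables, cursor.fetchall()
```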
datahub/ingestion/source/redshift/redshift.py
@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
      TestableSource,
      TestConnectionReport,
  )
- from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder
+ from datahub.ingestion.api.source_helpers import (
+     auto_workunit,
+     create_dataset_props_patch_builder,
+ )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.glossary.classification_mixin import (
      ClassificationHandler,
@@ -45,6 +48,7 @@ from datahub.ingestion.source.common.subtypes import (
      DatasetSubTypes,
  )
  from datahub.ingestion.source.redshift.config import RedshiftConfig
+ from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
  from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
  from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
  from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
@@ -52,6 +56,7 @@ from datahub.ingestion.source.redshift.profile import RedshiftProfiler
  from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
  from datahub.ingestion.source.redshift.redshift_schema import (
      RedshiftColumn,
+     RedshiftDatabase,
      RedshiftDataDictionary,
      RedshiftSchema,
      RedshiftTable,
@@ -150,76 +155,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
      - Table, row, and column statistics via optional SQL profiling
      - Table lineage
      - Usage statistics
-
-     ### Prerequisites
-
-     This source needs to access system tables that require extra permissions.
-     To grant these permissions, please alter your datahub Redshift user the following way:
-     ```sql
-     ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED;
-     GRANT SELECT ON pg_catalog.svv_table_info to datahub_user;
-     GRANT SELECT ON pg_catalog.svl_user_info to datahub_user;
-     ```
-
-     :::note
-
-     Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements.
-
-     :::
-
-     ### Lineage
-
-     There are multiple lineage collector implementations as Redshift does not support table lineage out of the box.
-
-     #### stl_scan_based
-     The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to
-     discover lineage between tables.
-     Pros:
-     - Fast
-     - Reliable
-
-     Cons:
-     - Does not work with Spectrum/external tables because those scans do not show up in stl_scan table.
-     - If a table is depending on a view then the view won't be listed as dependency. Instead the table will be connected with the view's dependencies.
-
-     #### sql_based
-     The sql_based based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries
-     and uses sql parsing to discover the dependencies.
-
-     Pros:
-     - Works with Spectrum tables
-     - Views are connected properly if a table depends on it
-
-     Cons:
-     - Slow.
-     - Less reliable as the query parser can fail on certain queries
-
-     #### mixed
-     Using both collector above and first applying the sql based and then the stl_scan based one.
-
-     Pros:
-     - Works with Spectrum tables
-     - Views are connected properly if a table depends on it
-     - A bit more reliable than the sql_based one only
-
-     Cons:
-     - Slow
-     - May be incorrect at times as the query parser can fail on certain queries
-
-     :::note
-
-     The redshift stl redshift tables which are used for getting data lineage retain at most seven days of log history, and sometimes closer to 2-5 days. This means you cannot extract lineage from queries issued outside that window.
-
-     :::
-
-     ### Profiling
-     Profiling runs sql queries on the redshift cluster to get statistics about the tables. To be able to do that, the user needs to have read access to the tables that should be profiled.
-
-     If you don't want to grant read access to the tables you can enable table level profiling which will get table statistics without reading the data.
-     ```yaml
-     profiling:
-         profile_table_level_only: true
-     ```
      """

      # TODO: Replace with standardized types in sql_types.py
@@ -330,6 +265,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
          self.config: RedshiftConfig = config
          self.report: RedshiftReport = RedshiftReport()
          self.classification_handler = ClassificationHandler(self.config, self.report)
+         self.datashares_helper = RedshiftDatasharesHelper(
+             self.config, self.report, self.ctx.graph
+         )
          self.platform = "redshift"
          self.domain_registry = None
          if self.config.domain:
@@ -361,6 +299,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
              is_serverless=self.config.is_serverless
          )

+         self.db: Optional[RedshiftDatabase] = None
          self.db_tables: Dict[str, Dict[str, List[RedshiftTable]]] = {}
          self.db_views: Dict[str, Dict[str, List[RedshiftView]]] = {}
          self.db_schemas: Dict[str, Dict[str, RedshiftSchema]] = {}
@@ -424,6 +363,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

          database = self.config.database
          logger.info(f"Processing db {database}")
+
+         self.db = self.data_dictionary.get_database_details(connection, database)
+         self.report.is_shared_database = (
+             self.db is not None and self.db.is_shared_database
+         )
          with self.report.new_stage(METADATA_EXTRACTION):
              self.db_tables[database] = defaultdict()
              self.db_views[database] = defaultdict()
@@ -563,7 +507,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

          schema_columns: Dict[str, Dict[str, List[RedshiftColumn]]] = {}
          schema_columns[schema.name] = self.data_dictionary.get_columns_for_schema(
-             conn=connection, schema=schema
+             conn=connection,
+             schema=schema,
+             is_shared_database=self.report.is_shared_database,
          )

          if self.config.include_tables:
@@ -887,6 +833,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
          tables, views = self.data_dictionary.get_tables_and_views(
              conn=connection,
              skip_external_tables=self.config.skip_external_tables,
+             is_shared_database=self.report.is_shared_database,
          )
          for schema in tables:
              if not is_schema_allowed(
@@ -1029,6 +976,28 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
          database: str,
          lineage_extractor: RedshiftSqlLineageV2,
      ) -> Iterable[MetadataWorkUnit]:
+         if self.config.include_share_lineage:
+             outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
+             yield from auto_workunit(
+                 self.datashares_helper.to_platform_resource(list(outbound_shares))
+             )
+
+             if self.db and self.db.is_shared_database:
+                 inbound_share = self.db.get_inbound_share()
+                 if inbound_share is None:
+                     self.report.warning(
+                         title="Upstream lineage of inbound datashare will be missing",
+                         message="Database options do not contain sufficient information",
+                         context=f"Database: {database}, Options {self.db.options}",
+                     )
+                 else:
+                     for known_lineage in self.datashares_helper.generate_lineage(
+                         inbound_share, self.get_all_tables()[database]
+                     ):
+                         lineage_extractor.aggregator.add(known_lineage)
+
+         # TODO: distinguish between definition level lineage and audit log based lineage
+         # definition level lineage should never be skipped
          if not self._should_ingest_lineage():
              return
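
To make the inbound branch above concrete: suppose the local database `sales_consumer` was created from a share whose platform resource records `source_database="sales"` on producer instance `producer_cluster`. For each table, `generate_lineage` pairs URNs like the following sketch; the database, schema, and table names are invented for illustration:

```python
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

# Upstream: built from the producer's OutboundSharePlatformResource,
# so platform_instance and env come from the resource, not local config.
upstream = make_dataset_urn_with_platform_instance(
    platform="redshift",
    name="sales.public.orders",            # <source_database>.<schema>.<table>
    platform_instance="producer_cluster",
    env="PROD",
)

# Downstream: the same schema and table under the local consumer database.
downstream = make_dataset_urn_with_platform_instance(
    platform="redshift",
    name="sales_consumer.public.orders",   # <consumer_database>.<schema>.<table>
    platform_instance=None,                # local config.platform_instance
    env="PROD",
)

# generate_lineage() yields one KnownLineageMapping(upstream_urn=upstream,
# downstream_urn=downstream) per table, which redshift.py feeds into the
# SQL parsing aggregator alongside query-derived lineage.
```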