acryl-datahub 1.0.0rc10__py3-none-any.whl → 1.0.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (28)
  1. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/METADATA +2416 -2416
  2. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/RECORD +28 -27
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +4 -3
  5. datahub/ingestion/source/iceberg/iceberg_common.py +40 -1
  6. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  7. datahub/ingestion/source/redshift/config.py +4 -0
  8. datahub/ingestion/source/redshift/datashares.py +236 -0
  9. datahub/ingestion/source/redshift/lineage.py +6 -2
  10. datahub/ingestion/source/redshift/lineage_v2.py +7 -4
  11. datahub/ingestion/source/redshift/profile.py +1 -1
  12. datahub/ingestion/source/redshift/query.py +125 -33
  13. datahub/ingestion/source/redshift/redshift.py +41 -72
  14. datahub/ingestion/source/redshift/redshift_schema.py +166 -6
  15. datahub/ingestion/source/redshift/report.py +3 -0
  16. datahub/ingestion/source/sql/oracle.py +93 -63
  17. datahub/metadata/_schema_classes.py +5 -5
  18. datahub/metadata/schema.avsc +2 -1
  19. datahub/metadata/schemas/DomainKey.avsc +2 -1
  20. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  21. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  22. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  23. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  24. datahub/sql_parsing/sql_parsing_common.py +7 -0
  25. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/LICENSE +0 -0
  26. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/WHEEL +0 -0
  27. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/entry_points.txt +0 -0
  28. {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/query.py

@@ -31,40 +31,62 @@ class RedshiftCommonQuery:
         AND (datname <> ('template1')::name)
     """
 
-    list_schemas: str = """SELECT distinct n.nspname AS "schema_name",
-        'local' as schema_type,
-        null as schema_owner_name,
-        '' as schema_option,
-        null as external_database
-    FROM pg_catalog.pg_class c
-    LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
-    JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner
-    WHERE c.relkind IN ('r','v','m','S','f')
-    AND n.nspname !~ '^pg_'
-    AND n.nspname != 'information_schema'
-    UNION ALL
-    SELECT schemaname as schema_name,
-        CASE s.eskind
-            WHEN '1' THEN 'GLUE'
-            WHEN '2' THEN 'HIVE'
-            WHEN '3' THEN 'POSTGRES'
-            WHEN '4' THEN 'REDSHIFT'
-            ELSE 'OTHER'
-        END as schema_type,
-        -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need
-        -- usename as schema_owner_name,
-        null as schema_owner_name,
-        esoptions as schema_option,
-        databasename as external_database
+    # NOTE: although schema owner id is available in tables, we do not use it
+    # as getting username from id requires access to pg_catalog.pg_user_info
+    # which is available only to superusers.
+    # NOTE: Need union here instead of using svv_all_schemas, in order to get
+    # external platform related lineage
+    # NOTE: Using database_name filter for svv_redshift_schemas, as otherwise
+    # schemas from other shared databases also show up.
+    @staticmethod
+    def list_schemas(database: str) -> str:
+        return f"""
+        SELECT
+            schema_name,
+            schema_type,
+            schema_option,
+            cast(null as varchar(256)) as external_platform,
+            cast(null as varchar(256)) as external_database
+        FROM svv_redshift_schemas
+        WHERE database_name = '{database}'
+        AND schema_name != 'pg_catalog' and schema_name != 'information_schema'
+        UNION ALL
+        SELECT
+            schemaname as schema_name,
+            'external' as schema_type,
+            esoptions as schema_option,
+            CASE s.eskind
+                WHEN '1' THEN 'GLUE'
+                WHEN '2' THEN 'HIVE'
+                WHEN '3' THEN 'POSTGRES'
+                WHEN '4' THEN 'REDSHIFT'
+                ELSE 'OTHER'
+            END as external_platform,
+            databasename as external_database
         FROM SVV_EXTERNAL_SCHEMAS as s
-        -- inner join pg_catalog.pg_user_info as i on i.usesysid = s.esowner
         ORDER BY SCHEMA_NAME;
         """
 
+    @staticmethod
+    def get_database_details(database):
+        return f"""\
+        select
+            database_name,
+            database_type,
+            database_options
+        from svv_redshift_databases
+        where database_name='{database}';"""
+
+    # NOTE: although table owner id is available in tables, we do not use it
+    # as getting username from id requires access to pg_catalog.pg_user_info
+    # which is available only to superusers.
+    # NOTE: Tables from shared database are not available in pg_catalog.pg_class
     @staticmethod
     def list_tables(
-        skip_external_tables: bool = False,
+        skip_external_tables: bool = False, is_shared_database: bool = False
     ) -> str:
+        # NOTE: it looks like description is available only in pg_description
+        # So this remains the preferred way
         tables_query = """
         SELECT CASE c.relkind
             WHEN 'r' THEN 'TABLE'
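Note the API change in this hunk: `list_schemas` was a class-level template string formatted with `database_name` at the call site, and is now a static method that interpolates the database itself (the matching call-site change appears in `redshift_schema.py` further down). A minimal sketch of the new invocation, assuming the rc11 wheel is installed and using a made-up database name `dev`:

```python
from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

# Renders the UNION of svv_redshift_schemas (filtered to one database, so
# schemas from other shared databases don't leak in) and SVV_EXTERNAL_SCHEMAS.
sql = RedshiftCommonQuery.list_schemas("dev")
assert "database_name = 'dev'" in sql
```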
@@ -83,8 +105,6 @@ SELECT schemaname as schema_name,
             WHEN 8 THEN 'ALL'
         END AS "diststyle",
         c.relowner AS "owner_id",
-        -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need
-        -- u.usename AS "owner_name",
         null as "owner_name",
         TRIM(TRAILING ';' FROM pg_catalog.pg_get_viewdef (c.oid,TRUE)) AS "view_definition",
         pg_catalog.array_to_string(c.relacl,'\n') AS "privileges",
@@ -98,11 +118,11 @@ SELECT schemaname as schema_name,
         LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
         LEFT JOIN pg_class_info as ci on c.oid = ci.reloid
         LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid
-        -- JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner
         WHERE c.relkind IN ('r','v','m','S','f')
         AND n.nspname !~ '^pg_'
         AND n.nspname != 'information_schema'
         """
+
         external_tables_query = """
         SELECT 'EXTERNAL_TABLE' as tabletype,
             NULL AS "schema_oid",
@@ -125,13 +145,62 @@ SELECT schemaname as schema_name,
         ORDER BY "schema",
             "relname"
         """
-        if skip_external_tables:
+        shared_database_tables_query = """
+        SELECT table_type as tabletype,
+            NULL AS "schema_oid",
+            schema_name AS "schema",
+            NULL AS "rel_oid",
+            table_name AS "relname",
+            NULL as "creation_time",
+            NULL AS "diststyle",
+            table_owner AS "owner_id",
+            NULL AS "owner_name",
+            NULL AS "view_definition",
+            table_acl AS "privileges",
+            NULL as "location",
+            NULL as parameters,
+            NULL as input_format,
+            NULL As output_format,
+            NULL as serde_parameters,
+            NULL as table_description
+        FROM svv_redshift_tables
+        ORDER BY "schema",
+            "relname"
+        """
+        if is_shared_database:
+            return shared_database_tables_query
+        elif skip_external_tables:
             return tables_query
         else:
             return f"{tables_query} UNION {external_tables_query}"
 
-    # Why is this unused. Is this a bug?
-    list_columns: str = """
+    @staticmethod
+    def list_columns(is_shared_database: bool = False) -> str:
+        if is_shared_database:
+            return """
+            SELECT
+                schema_name as "schema",
+                table_name as "table_name",
+                column_name as "name",
+                encoding as "encode",
+                -- Spectrum represents data types differently.
+                -- Standardize, so we can infer types.
+                data_type AS "type",
+                distkey as "distkey",
+                sortkey as "sortkey",
+                (case when is_nullable = 'no' then TRUE else FALSE end) as "notnull",
+                null as "comment",
+                null as "adsrc",
+                ordinal_position as "attnum",
+                data_type AS "format_type",
+                column_default as "default",
+                null as "schema_oid",
+                null as "table_oid"
+            FROM SVV_REDSHIFT_COLUMNS
+            WHERE 1 and schema = '{schema_name}'
+            ORDER BY "schema", "table_name", "attnum"
+            """
+        return """
         SELECT
             n.nspname as "schema",
             c.relname as "table_name",
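`list_columns` similarly moves from an unused class attribute to a static method, but the returned SQL still carries a `{schema_name}` placeholder that the caller fills in with `str.format` (see `get_columns_for_schema` in `redshift_schema.py` below). A small sketch of that two-step rendering, assuming rc11 and a made-up schema name `sales`:

```python
from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

# Step 1: pick the query variant; step 2: substitute the schema name,
# exactly as get_columns_for_schema does in redshift_schema.py.
template = RedshiftCommonQuery.list_columns(is_shared_database=True)
sql = template.format(schema_name="sales")
assert "schema = 'sales'" in sql  # SVV_REDSHIFT_COLUMNS variant
```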
@@ -362,6 +431,29 @@ ORDER BY target_schema, target_table, filename
     ) -> str:
         raise NotImplementedError
 
+    @staticmethod
+    def list_outbound_datashares() -> str:
+        return """SELECT \
+            share_type, \
+            share_name, \
+            trim(producer_namespace) as producer_namespace, \
+            source_database \
+            FROM svv_datashares
+            WHERE share_type='OUTBOUND'\
+        """
+
+    @staticmethod
+    def get_inbound_datashare(database: str) -> str:
+        return f"""SELECT \
+            share_type, \
+            share_name, \
+            trim(producer_namespace) as producer_namespace, \
+            consumer_database \
+            FROM svv_datashares
+            WHERE share_type='INBOUND'
+            AND consumer_database= '{database}'\
+        """
+
 
 class RedshiftProvisionedQuery(RedshiftCommonQuery):
     @staticmethod
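Both helpers simply render SQL over `svv_datashares`; only the OUTBOUND one is exercised at runtime (the INBOUND variant needs superuser access, per the note in `redshift_schema.py` below). A trivial sketch of the rendered text, assuming rc11 and a made-up database name:

```python
from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

outbound_sql = RedshiftCommonQuery.list_outbound_datashares()
inbound_sql = RedshiftCommonQuery.get_inbound_datashare("consumer_db")
assert "share_type='OUTBOUND'" in outbound_sql
assert "consumer_database= 'consumer_db'" in inbound_sql  # spacing as in source
```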
datahub/ingestion/source/redshift/redshift.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
     TestableSource,
     TestConnectionReport,
 )
-from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder
+from datahub.ingestion.api.source_helpers import (
+    auto_workunit,
+    create_dataset_props_patch_builder,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
@@ -45,6 +48,7 @@ from datahub.ingestion.source.common.subtypes import (
     DatasetSubTypes,
 )
 from datahub.ingestion.source.redshift.config import RedshiftConfig
+from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
 from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
 from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
 from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
@@ -52,6 +56,7 @@ from datahub.ingestion.source.redshift.profile import RedshiftProfiler
 from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
 from datahub.ingestion.source.redshift.redshift_schema import (
     RedshiftColumn,
+    RedshiftDatabase,
     RedshiftDataDictionary,
     RedshiftSchema,
     RedshiftTable,
@@ -150,76 +155,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     - Table, row, and column statistics via optional SQL profiling
     - Table lineage
     - Usage statistics
-
-    ### Prerequisites
-
-    This source needs to access system tables that require extra permissions.
-    To grant these permissions, please alter your datahub Redshift user the following way:
-    ```sql
-    ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED;
-    GRANT SELECT ON pg_catalog.svv_table_info to datahub_user;
-    GRANT SELECT ON pg_catalog.svl_user_info to datahub_user;
-    ```
-
-    :::note
-
-    Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements.
-
-    :::
-
-    ### Lineage
-
-    There are multiple lineage collector implementations as Redshift does not support table lineage out of the box.
-
-    #### stl_scan_based
-    The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to
-    discover lineage between tables.
-    Pros:
-    - Fast
-    - Reliable
-
-    Cons:
-    - Does not work with Spectrum/external tables because those scans do not show up in stl_scan table.
-    - If a table is depending on a view then the view won't be listed as dependency. Instead the table will be connected with the view's dependencies.
-
-    #### sql_based
-    The sql_based based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries
-    and uses sql parsing to discover the dependencies.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on it
-
-    Cons:
-    - Slow.
-    - Less reliable as the query parser can fail on certain queries
-
-    #### mixed
-    Using both collector above and first applying the sql based and then the stl_scan based one.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on it
-    - A bit more reliable than the sql_based one only
-
-    Cons:
-    - Slow
-    - May be incorrect at times as the query parser can fail on certain queries
-
-    :::note
-
-    The redshift stl redshift tables which are used for getting data lineage retain at most seven days of log history, and sometimes closer to 2-5 days. This means you cannot extract lineage from queries issued outside that window.
-
-    :::
-
-    ### Profiling
-    Profiling runs sql queries on the redshift cluster to get statistics about the tables. To be able to do that, the user needs to have read access to the tables that should be profiled.
-
-    If you don't want to grant read access to the tables you can enable table level profiling which will get table statistics without reading the data.
-    ```yaml
-    profiling:
-      profile_table_level_only: true
-    ```
     """
 
     # TODO: Replace with standardized types in sql_types.py
@@ -330,6 +265,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         self.config: RedshiftConfig = config
         self.report: RedshiftReport = RedshiftReport()
         self.classification_handler = ClassificationHandler(self.config, self.report)
+        self.datashares_helper = RedshiftDatasharesHelper(
+            self.config, self.report, self.ctx.graph
+        )
         self.platform = "redshift"
         self.domain_registry = None
         if self.config.domain:
@@ -361,6 +299,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             is_serverless=self.config.is_serverless
         )
 
+        self.db: Optional[RedshiftDatabase] = None
         self.db_tables: Dict[str, Dict[str, List[RedshiftTable]]] = {}
         self.db_views: Dict[str, Dict[str, List[RedshiftView]]] = {}
         self.db_schemas: Dict[str, Dict[str, RedshiftSchema]] = {}
@@ -424,6 +363,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         database = self.config.database
         logger.info(f"Processing db {database}")
+
+        self.db = self.data_dictionary.get_database_details(connection, database)
+        self.report.is_shared_database = (
+            self.db is not None and self.db.is_shared_database
+        )
         with self.report.new_stage(METADATA_EXTRACTION):
             self.db_tables[database] = defaultdict()
             self.db_views[database] = defaultdict()
@@ -563,7 +507,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         schema_columns: Dict[str, Dict[str, List[RedshiftColumn]]] = {}
         schema_columns[schema.name] = self.data_dictionary.get_columns_for_schema(
-            conn=connection, schema=schema
+            conn=connection,
+            schema=schema,
+            is_shared_database=self.report.is_shared_database,
         )
 
         if self.config.include_tables:
@@ -887,6 +833,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         tables, views = self.data_dictionary.get_tables_and_views(
             conn=connection,
             skip_external_tables=self.config.skip_external_tables,
+            is_shared_database=self.report.is_shared_database,
         )
         for schema in tables:
             if not is_schema_allowed(
@@ -1029,6 +976,28 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         database: str,
         lineage_extractor: RedshiftSqlLineageV2,
     ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_share_lineage:
+            outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
+            yield from auto_workunit(
+                self.datashares_helper.to_platform_resource(list(outbound_shares))
+            )
+
+        if self.db and self.db.is_shared_database:
+            inbound_share = self.db.get_inbound_share()
+            if inbound_share is None:
+                self.report.warning(
+                    title="Upstream lineage of inbound datashare will be missing",
+                    message="Database options do not contain sufficient information",
+                    context=f"Database: {database}, Options {self.db.options}",
+                )
+            else:
+                for known_lineage in self.datashares_helper.generate_lineage(
+                    inbound_share, self.get_all_tables()[database]
+                ):
+                    lineage_extractor.aggregator.add(known_lineage)
+
+        # TODO: distinguish between definition level lineage and audit log based lineage
+        # definition level lineage should never be skipped
         if not self._should_ingest_lineage():
             return
 
datahub/ingestion/source/redshift/redshift_schema.py

@@ -1,7 +1,8 @@
 import logging
+import re
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 import redshift_connector
 
@@ -41,6 +42,10 @@ class RedshiftTable(BaseTable):
     serde_parameters: Optional[str] = None
     last_altered: Optional[datetime] = None
 
+    @property
+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+
 
 @dataclass
 class RedshiftView(BaseTable):
@@ -51,6 +56,10 @@ class RedshiftView(BaseTable):
     size_in_bytes: Optional[int] = None
     rows_count: Optional[int] = None
 
+    @property
+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+
 
 @dataclass
 class RedshiftSchema:
@@ -59,8 +68,102 @@ class RedshiftSchema:
     type: str
     owner: Optional[str] = None
     option: Optional[str] = None
+    external_platform: Optional[str] = None
     external_database: Optional[str] = None
 
+    @property
+    def is_external_schema(self) -> bool:
+        return self.type == "external"
+
+
+@dataclass
+class PartialInboundDatashare:
+    share_name: str
+    producer_namespace_prefix: str
+    consumer_database: str
+
+    def get_description(self) -> str:
+        return (
+            f"Namespace Prefix {self.producer_namespace_prefix} Share {self.share_name}"
+        )
+
+
+@dataclass
+class OutboundDatashare:
+    share_name: str
+    producer_namespace: str
+    source_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+
+@dataclass
+class InboundDatashare:
+    share_name: str
+    producer_namespace: str
+    consumer_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+    def get_description(self) -> str:
+        return f"Namespace {self.producer_namespace} Share {self.share_name}"
+
+
+@dataclass
+class RedshiftDatabase:
+    name: str
+    type: str
+    options: Optional[str] = None
+
+    @property
+    def is_shared_database(self) -> bool:
+        return self.type == "shared"
+
+    # NOTE: ideally options are in form
+    # {"datashare_name":"xxx","datashare_producer_account":"1234","datashare_producer_namespace":"yyy"}
+    # however due to varchar(128) type of database table that captures options
+    # we may receive only partial information about inbound share
+    def get_inbound_share(
+        self,
+    ) -> Optional[Union[InboundDatashare, PartialInboundDatashare]]:
+        if not self.is_shared_database or not self.options:
+            return None
+
+        # Convert into single regex ??
+        share_name_match = re.search(r'"datashare_name"\s*:\s*"([^"]*)"', self.options)
+        namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)"', self.options
+        )
+        partial_namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)$', self.options
+        )
+
+        if not share_name_match:
+            # We will always at least get share name
+            return None
+
+        share_name = share_name_match.group(1)
+        if namespace_match:
+            return InboundDatashare(
+                share_name=share_name,
+                producer_namespace=namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        elif partial_namespace_match:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix=partial_namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        else:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix="",
+                consumer_database=self.name,
+            )
 
 
 @dataclass
 class RedshiftExtraTableMeta:
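Because `svv_redshift_databases.database_options` is a varchar(128), the JSON blob describing an inbound share can be cut off mid-value; `get_inbound_share` therefore degrades to a `PartialInboundDatashare` holding only a namespace prefix. A quick illustration with made-up share and database names, assuming the rc11 wheel is installed (note that an intact share's key `ns-1.sales` lines up with the producer side's `OutboundDatashare.get_key()`):

```python
from datahub.ingestion.source.redshift.redshift_schema import RedshiftDatabase

# Intact options -> full InboundDatashare.
full = RedshiftDatabase(
    name="consumer_db",
    type="shared",
    options='{"datashare_name":"sales","datashare_producer_account":"1234",'
    '"datashare_producer_namespace":"ns-1"}',
)
print(full.get_inbound_share())  # InboundDatashare, key "ns-1.sales"

# Options truncated mid-namespace -> PartialInboundDatashare with only the
# recoverable prefix "ns-".
truncated = RedshiftDatabase(
    name="consumer_db",
    type="shared",
    options='{"datashare_name":"sales","datashare_producer_namespace":"ns-',
)
print(truncated.get_inbound_share())
```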
@@ -141,13 +244,31 @@ class RedshiftDataDictionary:
 
         return [db[0] for db in dbs]
 
+    @staticmethod
+    def get_database_details(
+        conn: redshift_connector.Connection, database: str
+    ) -> Optional[RedshiftDatabase]:
+        cursor = RedshiftDataDictionary.get_query_result(
+            conn,
+            RedshiftCommonQuery.get_database_details(database),
+        )
+
+        row = cursor.fetchone()
+        if row is None:
+            return None
+        return RedshiftDatabase(
+            name=database,
+            type=row[1],
+            options=row[2],
+        )
+
     @staticmethod
     def get_schemas(
         conn: redshift_connector.Connection, database: str
     ) -> List[RedshiftSchema]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_schemas.format(database_name=database),
+            RedshiftCommonQuery.list_schemas(database),
         )
 
         schemas = cursor.fetchall()
@@ -158,8 +279,8 @@ class RedshiftDataDictionary:
                 database=database,
                 name=schema[field_names.index("schema_name")],
                 type=schema[field_names.index("schema_type")],
-                owner=schema[field_names.index("schema_owner_name")],
                 option=schema[field_names.index("schema_option")],
+                external_platform=schema[field_names.index("external_platform")],
                 external_database=schema[field_names.index("external_database")],
             )
             for schema in schemas
@@ -203,6 +324,7 @@ class RedshiftDataDictionary:
         self,
         conn: redshift_connector.Connection,
         skip_external_tables: bool = False,
+        is_shared_database: bool = False,
     ) -> Tuple[Dict[str, List[RedshiftTable]], Dict[str, List[RedshiftView]]]:
         tables: Dict[str, List[RedshiftTable]] = {}
         views: Dict[str, List[RedshiftView]] = {}
@@ -213,7 +335,10 @@ class RedshiftDataDictionary:
 
         cur = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_tables(skip_external_tables=skip_external_tables),
+            RedshiftCommonQuery.list_tables(
+                skip_external_tables=skip_external_tables,
+                is_shared_database=is_shared_database,
+            ),
         )
         field_names = [i[0] for i in cur.description]
         db_tables = cur.fetchall()
@@ -358,11 +483,15 @@ class RedshiftDataDictionary:
 
     @staticmethod
     def get_columns_for_schema(
-        conn: redshift_connector.Connection, schema: RedshiftSchema
+        conn: redshift_connector.Connection,
+        schema: RedshiftSchema,
+        is_shared_database: bool = False,
     ) -> Dict[str, List[RedshiftColumn]]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_columns.format(schema_name=schema.name),
+            RedshiftCommonQuery.list_columns(
+                is_shared_database=is_shared_database
+            ).format(schema_name=schema.name),
         )
 
         table_columns: Dict[str, List[RedshiftColumn]] = {}
@@ -508,3 +637,34 @@ class RedshiftDataDictionary:
             start_time=row[field_names.index("start_time")],
         )
         rows = cursor.fetchmany()
+
+    @staticmethod
+    def get_outbound_datashares(
+        conn: redshift_connector.Connection,
+    ) -> Iterable[OutboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
+        for item in cursor.fetchall():
+            yield OutboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                source_database=item[3],
+            )
+
+    # NOTE: this is not used right now as it requires superuser privilege
+    # We can use this in future if the permissions are lowered.
+    @staticmethod
+    def get_inbound_datashare(
+        conn: redshift_connector.Connection,
+        database: str,
+    ) -> Optional[InboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database))
+        item = cursor.fetchone()
+        if item:
+            return InboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                consumer_database=item[3],
+            )
+        return None
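For completeness, a hypothetical wiring of the new outbound-share discovery against a live cluster; the host and credentials below are placeholders, and only `get_outbound_datashares` is exercised since `get_inbound_datashare` needs superuser privileges per the note above:

```python
import redshift_connector

from datahub.ingestion.source.redshift.redshift_schema import RedshiftDataDictionary

# Placeholder connection details; substitute real cluster values.
conn = redshift_connector.connect(
    host="example-cluster.abc123.us-east-1.redshift.amazonaws.com",
    database="dev",
    user="datahub_user",
    password="...",
)
for share in RedshiftDataDictionary.get_outbound_datashares(conn):
    # get_key() -> "<producer_namespace>.<share_name>", the handle used to
    # match a consumer's inbound share back to its producer.
    print(share.get_key(), "->", share.source_database)
```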
datahub/ingestion/source/redshift/report.py

@@ -60,5 +60,8 @@ class RedshiftReport(
     sql_aggregator: Optional[SqlAggregatorReport] = None
     lineage_phases_timer: Dict[str, PerfTimer] = field(default_factory=dict)
 
+    is_shared_database: bool = False
+    outbound_shares_count: Optional[int] = None
+
     def report_dropped(self, key: str) -> None:
         self.filtered.append(key)