acryl-datahub 1.0.0rc10__py3-none-any.whl → 1.0.0rc11__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
- {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/METADATA +2416 -2416
- {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/RECORD +28 -27
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/ingestion/source/iceberg/iceberg_common.py +40 -1
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +7 -4
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +125 -33
- datahub/ingestion/source/redshift/redshift.py +41 -72
- datahub/ingestion/source/redshift/redshift_schema.py +166 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/sql/oracle.py +93 -63
- datahub/metadata/_schema_classes.py +5 -5
- datahub/metadata/schema.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc10.dist-info → acryl_datahub-1.0.0rc11.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/query.py

@@ -31,40 +31,62 @@ class RedshiftCommonQuery:
             AND (datname <> ('template1')::name)
    """

- [old lines 34-58 (the previous list_schemas definition) were removed; their content is not captured in this diff view]
+    # NOTE: although schema owner id is available in tables, we do not use it
+    # as getting username from id requires access to pg_catalog.pg_user_info
+    # which is available only to superusers.
+    # NOTE: Need union here instead of using svv_all_schemas, in order to get
+    # external platform related lineage
+    # NOTE: Using database_name filter for svv_redshift_schemas, as otherwise
+    # schemas from other shared databases also show up.
+    @staticmethod
+    def list_schemas(database: str) -> str:
+        return f"""
+            SELECT
+                schema_name,
+                schema_type,
+                schema_option,
+                cast(null as varchar(256)) as external_platform,
+                cast(null as varchar(256)) as external_database
+            FROM svv_redshift_schemas
+            WHERE database_name = '{database}'
+            AND schema_name != 'pg_catalog' and schema_name != 'information_schema'
+            UNION ALL
+            SELECT
+                schemaname as schema_name,
+                'external' as schema_type,
+                esoptions as schema_option,
+                CASE s.eskind
+                    WHEN '1' THEN 'GLUE'
+                    WHEN '2' THEN 'HIVE'
+                    WHEN '3' THEN 'POSTGRES'
+                    WHEN '4' THEN 'REDSHIFT'
+                    ELSE 'OTHER'
+                END as external_platform,
+                databasename as external_database
            FROM SVV_EXTERNAL_SCHEMAS as s
-           -- inner join pg_catalog.pg_user_info as i on i.usesysid = s.esowner
            ORDER BY SCHEMA_NAME;
        """

+    @staticmethod
+    def get_database_details(database):
+        return f"""\
+            select
+                database_name,
+                database_type,
+                database_options
+            from svv_redshift_databases
+            where database_name='{database}';"""
+
+    # NOTE: although table owner id is available in tables, we do not use it
+    # as getting username from id requires access to pg_catalog.pg_user_info
+    # which is available only to superusers.
+    # NOTE: Tables from shared database are not available in pg_catalog.pg_class
    @staticmethod
    def list_tables(
-        skip_external_tables: bool = False,
+        skip_external_tables: bool = False, is_shared_database: bool = False
    ) -> str:
+        # NOTE: it looks like description is available only in pg_description
+        # So this remains preferrred way
        tables_query = """
SELECT CASE c.relkind
            WHEN 'r' THEN 'TABLE'
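The hunk above replaces a static list_schemas string with query builders parameterized by database. A minimal sketch (not part of the package; host and credentials are placeholders) of running the new queries directly with redshift_connector:

```python
import redshift_connector

from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

# Placeholder connection details -- substitute a real cluster endpoint.
conn = redshift_connector.connect(
    host="my-cluster.example.redshift.amazonaws.com",
    database="dev",
    user="datahub_user",
    password="...",
)
cursor = conn.cursor()

# get_database_details returns (database_name, database_type, database_options);
# database_type == 'shared' marks a datashare consumer database.
cursor.execute(RedshiftCommonQuery.get_database_details("dev"))
print(cursor.fetchone())

# list_schemas unions local/shared schemas with external (Spectrum) schemas.
cursor.execute(RedshiftCommonQuery.list_schemas("dev"))
for schema_name, schema_type, *_ in cursor.fetchall():
    print(schema_name, schema_type)
```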
@@ -83,8 +105,6 @@ SELECT schemaname as schema_name,
        WHEN 8 THEN 'ALL'
       END AS "diststyle",
       c.relowner AS "owner_id",
-       -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need
-       -- u.usename AS "owner_name",
       null as "owner_name",
       TRIM(TRAILING ';' FROM pg_catalog.pg_get_viewdef (c.oid,TRUE)) AS "view_definition",
       pg_catalog.array_to_string(c.relacl,'\n') AS "privileges",

@@ -98,11 +118,11 @@ SELECT schemaname as schema_name,
LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
LEFT JOIN pg_class_info as ci on c.oid = ci.reloid
LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid
--- JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner
WHERE c.relkind IN ('r','v','m','S','f')
AND n.nspname !~ '^pg_'
AND n.nspname != 'information_schema'
"""
+
        external_tables_query = """
SELECT 'EXTERNAL_TABLE' as tabletype,
    NULL AS "schema_oid",

@@ -125,13 +145,62 @@ SELECT schemaname as schema_name,
ORDER BY "schema",
         "relname"
"""
-
+        shared_database_tables_query = """
+            SELECT table_type as tabletype,
+                NULL AS "schema_oid",
+                schema_name AS "schema",
+                NULL AS "rel_oid",
+                table_name AS "relname",
+                NULL as "creation_time",
+                NULL AS "diststyle",
+                table_owner AS "owner_id",
+                NULL AS "owner_name",
+                NULL AS "view_definition",
+                table_acl AS "privileges",
+                NULL as "location",
+                NULL as parameters,
+                NULL as input_format,
+                NULL As output_format,
+                NULL as serde_parameters,
+                NULL as table_description
+            FROM svv_redshift_tables
+            ORDER BY "schema",
+                     "relname"
+        """
+        if is_shared_database:
+            return shared_database_tables_query
+        elif skip_external_tables:
            return tables_query
        else:
            return f"{tables_query} UNION {external_tables_query}"

-
-    list_columns: str = """
+    @staticmethod
+    def list_columns(is_shared_database: bool = False) -> str:
+        if is_shared_database:
+            return """
+                SELECT
+                    schema_name as "schema",
+                    table_name as "table_name",
+                    column_name as "name",
+                    encoding as "encode",
+                    -- Spectrum represents data types differently.
+                    -- Standardize, so we can infer types.
+                    data_type AS "type",
+                    distkey as "distkey",
+                    sortkey as "sortkey",
+                    (case when is_nullable = 'no' then TRUE else FALSE end) as "notnull",
+                    null as "comment",
+                    null as "adsrc",
+                    ordinal_position as "attnum",
+                    data_type AS "format_type",
+                    column_default as "default",
+                    null as "schema_oid",
+                    null as "table_oid"
+                FROM SVV_REDSHIFT_COLUMNS
+                WHERE 1 and schema = '{schema_name}'
+                ORDER BY "schema", "table_name", "attnum"
+            """
+        return """
SELECT
    n.nspname as "schema",
    c.relname as "table_name",
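Note that list_columns() returns a SQL template rather than a finished statement: the shared-database branch keeps a {schema_name} placeholder that the caller fills via str.format, as get_columns_for_schema does in redshift_schema.py further down (the local branch presumably carries the same placeholder, given the shared call site). A small sketch of that contract:

```python
from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

# The shared-database query keeps a {schema_name} placeholder...
template = RedshiftCommonQuery.list_columns(is_shared_database=True)

# ...which the caller substitutes before executing the statement.
sql = template.format(schema_name="public")
assert "schema = 'public'" in sql
```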
@@ -362,6 +431,29 @@ ORDER BY target_schema, target_table, filename
    ) -> str:
        raise NotImplementedError

+    @staticmethod
+    def list_outbound_datashares() -> str:
+        return """SELECT \
+            share_type, \
+            share_name, \
+            trim(producer_namespace) as producer_namespace, \
+            source_database \
+        FROM svv_datashares
+        WHERE share_type='OUTBOUND'\
+        """
+
+    @staticmethod
+    def get_inbound_datashare(database: str) -> str:
+        return f"""SELECT \
+            share_type, \
+            share_name, \
+            trim(producer_namespace) as producer_namespace, \
+            consumer_database \
+        FROM svv_datashares
+        WHERE share_type='INBOUND'
+        AND consumer_database= '{database}'\
+        """
+

class RedshiftProvisionedQuery(RedshiftCommonQuery):
    @staticmethod
datahub/ingestion/source/redshift/redshift.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
    TestableSource,
    TestConnectionReport,
)
-from datahub.ingestion.api.source_helpers import auto_workunit
+from datahub.ingestion.api.source_helpers import (
+    auto_workunit,
+    create_dataset_props_patch_builder,
+)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.glossary.classification_mixin import (
    ClassificationHandler,

@@ -45,6 +48,7 @@ from datahub.ingestion.source.common.subtypes import (
    DatasetSubTypes,
)
from datahub.ingestion.source.redshift.config import RedshiftConfig
+from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2

@@ -52,6 +56,7 @@ from datahub.ingestion.source.redshift.profile import RedshiftProfiler
from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
from datahub.ingestion.source.redshift.redshift_schema import (
    RedshiftColumn,
+    RedshiftDatabase,
    RedshiftDataDictionary,
    RedshiftSchema,
    RedshiftTable,
@@ -150,76 +155,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
    - Table, row, and column statistics via optional SQL profiling
    - Table lineage
    - Usage statistics
-
-    ### Prerequisites
-
-    This source needs to access system tables that require extra permissions.
-    To grant these permissions, please alter your datahub Redshift user the following way:
-    ```sql
-    ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED;
-    GRANT SELECT ON pg_catalog.svv_table_info to datahub_user;
-    GRANT SELECT ON pg_catalog.svl_user_info to datahub_user;
-    ```
-
-    :::note
-
-    Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements.
-
-    :::
-
-    ### Lineage
-
-    There are multiple lineage collector implementations as Redshift does not support table lineage out of the box.
-
-    #### stl_scan_based
-    The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to
-    discover lineage between tables.
-    Pros:
-    - Fast
-    - Reliable
-
-    Cons:
-    - Does not work with Spectrum/external tables because those scans do not show up in stl_scan table.
-    - If a table is depending on a view then the view won't be listed as dependency. Instead the table will be connected with the view's dependencies.
-
-    #### sql_based
-    The sql_based based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries
-    and uses sql parsing to discover the dependencies.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on it
-
-    Cons:
-    - Slow.
-    - Less reliable as the query parser can fail on certain queries
-
-    #### mixed
-    Using both collector above and first applying the sql based and then the stl_scan based one.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on it
-    - A bit more reliable than the sql_based one only
-
-    Cons:
-    - Slow
-    - May be incorrect at times as the query parser can fail on certain queries
-
-    :::note
-
-    The redshift stl redshift tables which are used for getting data lineage retain at most seven days of log history, and sometimes closer to 2-5 days. This means you cannot extract lineage from queries issued outside that window.
-
-    :::
-
-    ### Profiling
-    Profiling runs sql queries on the redshift cluster to get statistics about the tables. To be able to do that, the user needs to have read access to the tables that should be profiled.
-
-    If you don't want to grant read access to the tables you can enable table level profiling which will get table statistics without reading the data.
-    ```yaml
-    profiling:
-        profile_table_level_only: true
-    ```
    """

    # TODO: Replace with standardized types in sql_types.py

@@ -330,6 +265,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
        self.config: RedshiftConfig = config
        self.report: RedshiftReport = RedshiftReport()
        self.classification_handler = ClassificationHandler(self.config, self.report)
+        self.datashares_helper = RedshiftDatasharesHelper(
+            self.config, self.report, self.ctx.graph
+        )
        self.platform = "redshift"
        self.domain_registry = None
        if self.config.domain:

@@ -361,6 +299,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
            is_serverless=self.config.is_serverless
        )

+        self.db: Optional[RedshiftDatabase] = None
        self.db_tables: Dict[str, Dict[str, List[RedshiftTable]]] = {}
        self.db_views: Dict[str, Dict[str, List[RedshiftView]]] = {}
        self.db_schemas: Dict[str, Dict[str, RedshiftSchema]] = {}

@@ -424,6 +363,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

        database = self.config.database
        logger.info(f"Processing db {database}")
+
+        self.db = self.data_dictionary.get_database_details(connection, database)
+        self.report.is_shared_database = (
+            self.db is not None and self.db.is_shared_database
+        )
        with self.report.new_stage(METADATA_EXTRACTION):
            self.db_tables[database] = defaultdict()
            self.db_views[database] = defaultdict()

@@ -563,7 +507,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

        schema_columns: Dict[str, Dict[str, List[RedshiftColumn]]] = {}
        schema_columns[schema.name] = self.data_dictionary.get_columns_for_schema(
-            conn=connection,
+            conn=connection,
+            schema=schema,
+            is_shared_database=self.report.is_shared_database,
        )

        if self.config.include_tables:

@@ -887,6 +833,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
        tables, views = self.data_dictionary.get_tables_and_views(
            conn=connection,
            skip_external_tables=self.config.skip_external_tables,
+            is_shared_database=self.report.is_shared_database,
        )
        for schema in tables:
            if not is_schema_allowed(

@@ -1029,6 +976,28 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
        database: str,
        lineage_extractor: RedshiftSqlLineageV2,
    ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_share_lineage:
+            outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
+            yield from auto_workunit(
+                self.datashares_helper.to_platform_resource(list(outbound_shares))
+            )
+
+            if self.db and self.db.is_shared_database:
+                inbound_share = self.db.get_inbound_share()
+                if inbound_share is None:
+                    self.report.warning(
+                        title="Upstream lineage of inbound datashare will be missing",
+                        message="Database options do not contain sufficient information",
+                        context=f"Database: {database}, Options {self.db.options}",
+                    )
+                else:
+                    for known_lineage in self.datashares_helper.generate_lineage(
+                        inbound_share, self.get_all_tables()[database]
+                    ):
+                        lineage_extractor.aggregator.add(known_lineage)
+
+        # TODO: distinguish between definition level lineage and audit log based lineage
+        # definition level lineage should never be skipped
        if not self._should_ingest_lineage():
            return
datahub/ingestion/source/redshift/redshift_schema.py

@@ -1,7 +1,8 @@
import logging
+import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union

import redshift_connector

@@ -41,6 +42,10 @@ class RedshiftTable(BaseTable):
    serde_parameters: Optional[str] = None
    last_altered: Optional[datetime] = None

+    @property
+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+

@dataclass
class RedshiftView(BaseTable):

@@ -51,6 +56,10 @@ class RedshiftView(BaseTable):
    size_in_bytes: Optional[int] = None
    rows_count: Optional[int] = None

+    @property
+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+

@dataclass
class RedshiftSchema:

@@ -59,8 +68,102 @@ class RedshiftSchema:
    type: str
    owner: Optional[str] = None
    option: Optional[str] = None
+    external_platform: Optional[str] = None
    external_database: Optional[str] = None

+    @property
+    def is_external_schema(self) -> bool:
+        return self.type == "external"
+
+
+@dataclass
+class PartialInboundDatashare:
+    share_name: str
+    producer_namespace_prefix: str
+    consumer_database: str
+
+    def get_description(self) -> str:
+        return (
+            f"Namespace Prefix {self.producer_namespace_prefix} Share {self.share_name}"
+        )
+
+
+@dataclass
+class OutboundDatashare:
+    share_name: str
+    producer_namespace: str
+    source_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+
+@dataclass
+class InboundDatashare:
+    share_name: str
+    producer_namespace: str
+    consumer_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+    def get_description(self) -> str:
+        return f"Namespace {self.producer_namespace} Share {self.share_name}"
+
+
+@dataclass
+class RedshiftDatabase:
+    name: str
+    type: str
+    options: Optional[str] = None
+
+    @property
+    def is_shared_database(self) -> bool:
+        return self.type == "shared"
+
+    # NOTE: ideally options are in form
+    # {"datashare_name":"xxx","datashare_producer_account":"1234","datashare_producer_namespace":"yyy"}
+    # however due to varchar(128) type of database table that captures options
+    # we may receive only partial information about inbound share
+    def get_inbound_share(
+        self,
+    ) -> Optional[Union[InboundDatashare, PartialInboundDatashare]]:
+        if not self.is_shared_database or not self.options:
+            return None
+
+        # Convert into single regex ??
+        share_name_match = re.search(r'"datashare_name"\s*:\s*"([^"]*)"', self.options)
+        namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)"', self.options
+        )
+        partial_namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)$', self.options
+        )
+
+        if not share_name_match:
+            # We will always at least get share name
+            return None
+
+        share_name = share_name_match.group(1)
+        if namespace_match:
+            return InboundDatashare(
+                share_name=share_name,
+                producer_namespace=namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        elif partial_namespace_match:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix=partial_namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        else:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix="",
+                consumer_database=self.name,
+            )


@dataclass
class RedshiftExtraTableMeta:
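The options parsing above is easiest to see on concrete values. A hedged illustration — the options strings are made-up examples of what svv_redshift_databases can return, the second one cut off the way the varchar(128) options column can truncate:

```python
from datahub.ingestion.source.redshift.redshift_schema import RedshiftDatabase

# Complete options: both the share name and producer namespace are parseable.
full = RedshiftDatabase(
    name="sales",
    type="shared",
    options='{"datashare_name":"core_share","datashare_producer_account":"123456789012","datashare_producer_namespace":"a1b2c3d4"}',
)
print(full.get_inbound_share())
# InboundDatashare(share_name='core_share', producer_namespace='a1b2c3d4', consumer_database='sales')

# Truncated options: the namespace is cut off mid-value, so only a prefix survives.
truncated = RedshiftDatabase(
    name="sales",
    type="shared",
    options='{"datashare_name":"core_share","datashare_producer_account":"123456789012","datashare_producer_namespace":"a1b2',
)
print(truncated.get_inbound_share())
# PartialInboundDatashare(share_name='core_share', producer_namespace_prefix='a1b2', consumer_database='sales')
```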
@@ -141,13 +244,31 @@ class RedshiftDataDictionary:

        return [db[0] for db in dbs]

+    @staticmethod
+    def get_database_details(
+        conn: redshift_connector.Connection, database: str
+    ) -> Optional[RedshiftDatabase]:
+        cursor = RedshiftDataDictionary.get_query_result(
+            conn,
+            RedshiftCommonQuery.get_database_details(database),
+        )
+
+        row = cursor.fetchone()
+        if row is None:
+            return None
+        return RedshiftDatabase(
+            name=database,
+            type=row[1],
+            options=row[2],
+        )
+
    @staticmethod
    def get_schemas(
        conn: redshift_connector.Connection, database: str
    ) -> List[RedshiftSchema]:
        cursor = RedshiftDataDictionary.get_query_result(
            conn,
-            RedshiftCommonQuery.list_schemas
+            RedshiftCommonQuery.list_schemas(database),
        )

        schemas = cursor.fetchall()

@@ -158,8 +279,8 @@ class RedshiftDataDictionary:
                database=database,
                name=schema[field_names.index("schema_name")],
                type=schema[field_names.index("schema_type")],
-                owner=schema[field_names.index("schema_owner_name")],
                option=schema[field_names.index("schema_option")],
+                external_platform=schema[field_names.index("external_platform")],
                external_database=schema[field_names.index("external_database")],
            )
            for schema in schemas

@@ -203,6 +324,7 @@ class RedshiftDataDictionary:
        self,
        conn: redshift_connector.Connection,
        skip_external_tables: bool = False,
+        is_shared_database: bool = False,
    ) -> Tuple[Dict[str, List[RedshiftTable]], Dict[str, List[RedshiftView]]]:
        tables: Dict[str, List[RedshiftTable]] = {}
        views: Dict[str, List[RedshiftView]] = {}

@@ -213,7 +335,10 @@ class RedshiftDataDictionary:

        cur = RedshiftDataDictionary.get_query_result(
            conn,
-            RedshiftCommonQuery.list_tables(skip_external_tables=skip_external_tables),
+            RedshiftCommonQuery.list_tables(
+                skip_external_tables=skip_external_tables,
+                is_shared_database=is_shared_database,
+            ),
        )
        field_names = [i[0] for i in cur.description]
        db_tables = cur.fetchall()

@@ -358,11 +483,15 @@ class RedshiftDataDictionary:

    @staticmethod
    def get_columns_for_schema(
-        conn: redshift_connector.Connection,
+        conn: redshift_connector.Connection,
+        schema: RedshiftSchema,
+        is_shared_database: bool = False,
    ) -> Dict[str, List[RedshiftColumn]]:
        cursor = RedshiftDataDictionary.get_query_result(
            conn,
-            RedshiftCommonQuery.list_columns
+            RedshiftCommonQuery.list_columns(
+                is_shared_database=is_shared_database
+            ).format(schema_name=schema.name),
        )

        table_columns: Dict[str, List[RedshiftColumn]] = {}

@@ -508,3 +637,34 @@ class RedshiftDataDictionary:
                start_time=row[field_names.index("start_time")],
            )
            rows = cursor.fetchmany()
+
+    @staticmethod
+    def get_outbound_datashares(
+        conn: redshift_connector.Connection,
+    ) -> Iterable[OutboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
+        for item in cursor.fetchall():
+            yield OutboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                source_database=item[3],
+            )
+
+    # NOTE: this is not used right now as it requires superuser privilege
+    # We can use this in future if the permissions are lowered.
+    @staticmethod
+    def get_inbound_datashare(
+        conn: redshift_connector.Connection,
+        database: str,
+    ) -> Optional[InboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database))
+        item = cursor.fetchone()
+        if item:
+            return InboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                consumer_database=item[3],
+            )
+        return None
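A minimal sketch (not from the package) of how the new accessors compose: an inbound share on the consumer side pairs with the producer's outbound share through the matching get_key() values. In the connector itself, outbound shares are persisted as platform resources (to_platform_resource) and resolved later by RedshiftDatasharesHelper.generate_lineage, since producer and consumer are typically ingested separately; the direct pairing below just shows the key scheme, with placeholder connections:

```python
import redshift_connector

from datahub.ingestion.source.redshift.redshift_schema import (
    InboundDatashare,
    RedshiftDataDictionary,
)

# Placeholder connections; producer and consumer are usually different clusters.
producer_conn = redshift_connector.connect(
    host="producer.example.redshift.amazonaws.com",
    database="dev", user="datahub_user", password="...",
)
consumer_conn = redshift_connector.connect(
    host="consumer.example.redshift.amazonaws.com",
    database="sales", user="datahub_user", password="...",
)

# Producer side: index outbound shares by "producer_namespace.share_name".
outbound_by_key = {
    share.get_key(): share
    for share in RedshiftDataDictionary.get_outbound_datashares(producer_conn)
}

# Consumer side: the shared database's options identify its inbound share.
db = RedshiftDataDictionary.get_database_details(consumer_conn, "sales")
inbound = db.get_inbound_share() if db else None
if isinstance(inbound, InboundDatashare):  # PartialInboundDatashare has no full key
    producer = outbound_by_key.get(inbound.get_key())
    if producer:
        print(f"Upstream lineage source: {producer.source_database}")
```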
datahub/ingestion/source/redshift/report.py

@@ -60,5 +60,8 @@ class RedshiftReport(
    sql_aggregator: Optional[SqlAggregatorReport] = None
    lineage_phases_timer: Dict[str, PerfTimer] = field(default_factory=dict)

+    is_shared_database: bool = False
+    outbound_shares_count: Optional[int] = None
+
    def report_dropped(self, key: str) -> None:
        self.filtered.append(key)