acryl-datahub 1.2.0.4rc3__py3-none-any.whl → 1.2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic by the registry.
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/METADATA +2454 -2454
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/RECORD +43 -41
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/external/external_entities.py +500 -15
- datahub/ingestion/source/aws/glue.py +18 -14
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/tag_entities.py +82 -104
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/hex/api.py +2 -0
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/looker/looker_common.py +26 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
- datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
- datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
- datahub/ingestion/source/sql/mssql/source.py +2 -25
- datahub/ingestion/source/sql/mysql.py +54 -0
- datahub/ingestion/source/sql/postgres.py +5 -134
- datahub/ingestion/source/sql/sql_common.py +137 -0
- datahub/ingestion/source/superset.py +140 -56
- datahub/ingestion/source/unity/config.py +11 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +20 -6
- datahub/ingestion/source/unity/report.py +9 -1
- datahub/ingestion/source/unity/source.py +51 -16
- datahub/ingestion/source/unity/tag_entities.py +49 -147
- datahub/metadata/_internal_schema_classes.py +514 -514
- datahub/metadata/_urns/urn_defs.py +1684 -1684
- datahub/metadata/schema.avsc +16680 -16281
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -8,7 +8,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
 )
 from datahub.utilities.prefix_batch_builder import PrefixGroup
 
-SHOW_VIEWS_MAX_PAGE_SIZE = 10000
+SHOW_COMMAND_MAX_PAGE_SIZE = 10000
 SHOW_STREAM_MAX_PAGE_SIZE = 10000
 
 
@@ -38,12 +38,23 @@ class SnowflakeQuery:
         SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(),
         SnowflakeObjectDomain.ICEBERG_TABLE.capitalize(),
         SnowflakeObjectDomain.STREAM.capitalize(),
+        SnowflakeObjectDomain.DYNAMIC_TABLE.capitalize(),
     }
 
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
         ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS)
     )
 
+    # Domains that can be downstream tables in lineage
+    DOWNSTREAM_TABLE_DOMAINS = {
+        SnowflakeObjectDomain.TABLE.capitalize(),
+        SnowflakeObjectDomain.DYNAMIC_TABLE.capitalize(),
+    }
+
+    DOWNSTREAM_TABLE_DOMAINS_FILTER = "({})".format(
+        ",".join(f"'{domain}'" for domain in DOWNSTREAM_TABLE_DOMAINS)
+    )
+
     @staticmethod
     def current_account() -> str:
         return "select CURRENT_ACCOUNT()"

@@ -235,7 +246,7 @@ class SnowflakeQuery:
     @staticmethod
     def show_views_for_database(
         db_name: str,
-        limit: int = SHOW_VIEWS_MAX_PAGE_SIZE,
+        limit: int = SHOW_COMMAND_MAX_PAGE_SIZE,
         view_pagination_marker: Optional[str] = None,
     ) -> str:
         # While there is an information_schema.views view, that only shows the view definition if the role

@@ -244,7 +255,7 @@ class SnowflakeQuery:
 
         # SHOW VIEWS can return a maximum of 10000 rows.
         # https://docs.snowflake.com/en/sql-reference/sql/show-views#usage-notes
-        assert limit <= SHOW_VIEWS_MAX_PAGE_SIZE
+        assert limit <= SHOW_COMMAND_MAX_PAGE_SIZE
 
         # To work around this, we paginate through the results using the FROM clause.
         from_clause = (

@@ -686,7 +697,7 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
            AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
            AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
            AND upstream_table_domain in {allowed_upstream_table_domains}
-           AND downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}'
+           AND downstream_table_domain in {SnowflakeQuery.DOWNSTREAM_TABLE_DOMAINS_FILTER}
            {("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
        ),
        column_upstream_jobs AS (

@@ -843,7 +854,7 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
            AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
            AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
            AND upstream_table_domain in {allowed_upstream_table_domains}
-           AND downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}'
+           AND downstream_table_domain in {SnowflakeQuery.DOWNSTREAM_TABLE_DOMAINS_FILTER}
            {("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
        ),
        table_upstream_jobs_unique AS (

@@ -940,3 +951,37 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
             f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
         )
         return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
+
+    @staticmethod
+    def show_dynamic_tables_for_database(
+        db_name: str,
+        limit: int = SHOW_COMMAND_MAX_PAGE_SIZE,
+        dynamic_table_pagination_marker: Optional[str] = None,
+    ) -> str:
+        """Get dynamic table definitions using SHOW DYNAMIC TABLES."""
+        assert limit <= SHOW_COMMAND_MAX_PAGE_SIZE
+
+        from_clause = (
+            f"""FROM '{dynamic_table_pagination_marker}'"""
+            if dynamic_table_pagination_marker
+            else ""
+        )
+        return f"""\
+SHOW DYNAMIC TABLES IN DATABASE "{db_name}"
+LIMIT {limit} {from_clause};
+"""
+
+    @staticmethod
+    def get_dynamic_table_graph_history(db_name: str) -> str:
+        """Get dynamic table dependency information from information schema."""
+        return f"""
+        SELECT
+            name,
+            inputs,
+            target_lag_type,
+            target_lag_sec,
+            scheduling_state,
+            alter_trigger
+        FROM TABLE("{db_name}".INFORMATION_SCHEMA.DYNAMIC_TABLE_GRAPH_HISTORY())
+        ORDER BY name
+        """
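The added show_dynamic_tables_for_database reuses the keyset-pagination scheme the source already uses for views and streams: SHOW commands return at most 10,000 rows (hence the generalized SHOW_COMMAND_MAX_PAGE_SIZE), so callers re-issue the command with FROM '<last name>' until a short page comes back. A minimal sketch of that driver loop, with run_query as a hypothetical stand-in for SnowflakeConnection.query:

from typing import Callable, Iterable, Iterator, Optional

SHOW_COMMAND_MAX_PAGE_SIZE = 10000  # SHOW commands return at most 10,000 rows

def show_dynamic_tables(db_name: str, marker: Optional[str] = None) -> str:
    # Same shape as SnowflakeQuery.show_dynamic_tables_for_database above.
    from_clause = f"FROM '{marker}'" if marker else ""
    return (
        f'SHOW DYNAMIC TABLES IN DATABASE "{db_name}" '
        f"LIMIT {SHOW_COMMAND_MAX_PAGE_SIZE} {from_clause};"
    )

def fetch_all_dynamic_tables(
    run_query: Callable[[str], Iterable[dict]], db_name: str
) -> Iterator[dict]:
    # run_query is a hypothetical stand-in for SnowflakeConnection.query,
    # assumed to yield one dict per result row.
    marker: Optional[str] = None
    while True:
        rows = list(run_query(show_dynamic_tables(db_name, marker)))
        yield from rows
        if len(rows) < SHOW_COMMAND_MAX_PAGE_SIZE:
            break  # short page: no more results
        marker = rows[-1]["name"]  # resume listing after the last name seen

The FROM '<name>' clause tells Snowflake to resume listing lexicographically after the given object name, which is why the last row's name becomes the next pagination marker.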
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -3,16 +3,17 @@ import os
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Callable, Dict, Iterable, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional
 
 from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection
 from datahub.ingestion.source.snowflake.snowflake_query import (
-    SHOW_VIEWS_MAX_PAGE_SIZE,
+    SHOW_COMMAND_MAX_PAGE_SIZE,
     SnowflakeQuery,
 )
+from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
 from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
 from datahub.utilities.file_backed_collections import FileBackedDict

@@ -103,6 +104,17 @@ class SnowflakeTable(BaseTable):
         return DatasetSubTypes.TABLE
 
 
+@dataclass
+class SnowflakeDynamicTable(SnowflakeTable):
+    definition: Optional[str] = (
+        None  # SQL query that defines the dynamic table's content
+    )
+    target_lag: Optional[str] = None  # Refresh frequency (e.g., "1 HOUR", "30 MINUTES")
+
+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.DYNAMIC_TABLE
+
+
 @dataclass
 class SnowflakeView(BaseView):
     materialized: bool = False

@@ -226,8 +238,11 @@ class _SnowflakeTagCache:
 
 
 class SnowflakeDataDictionary(SupportsAsObj):
-    def __init__(self, connection: SnowflakeConnection) -> None:
+    def __init__(
+        self, connection: SnowflakeConnection, report: SnowflakeV2Report
+    ) -> None:
         self.connection = connection
+        self.report = report
 
     def as_obj(self) -> Dict[str, Dict[str, int]]:
         # TODO: Move this into a proper report type that gets computed.

@@ -355,8 +370,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
             if table["TABLE_SCHEMA"] not in tables:
                 tables[table["TABLE_SCHEMA"]] = []
 
+            is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
+            table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
+
             tables[table["TABLE_SCHEMA"]].append(
-                SnowflakeTable(
+                table_cls(
                     name=table["TABLE_NAME"],
                     type=table["TABLE_TYPE"],
                     created=table["CREATED"],

@@ -365,11 +383,15 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
-                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_dynamic=is_dynamic,
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                     is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
+
+        # Populate dynamic table definitions
+        self.populate_dynamic_table_definitions(tables, db_name)
+
         return tables
 
     def get_tables_for_schema(

@@ -382,8 +404,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
         )
 
         for table in cur:
+            is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
+            table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
+
             tables.append(
-                SnowflakeTable(
+                table_cls(
                     name=table["TABLE_NAME"],
                     type=table["TABLE_TYPE"],
                     created=table["CREATED"],

@@ -392,16 +417,21 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
-                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_dynamic=is_dynamic,
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                     is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
+
+        # Populate dynamic table definitions for just this schema
+        schema_tables = {schema_name: tables}
+        self.populate_dynamic_table_definitions(schema_tables, db_name)
+
         return tables
 
     @serialized_lru_cache(maxsize=1)
     def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]]:
-        page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
+        page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
 
         views: Dict[str, List[SnowflakeView]] = {}
 

@@ -660,7 +690,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
     def get_streams_for_database(
         self, db_name: str
     ) -> Dict[str, List[SnowflakeStream]]:
-        page_limit = SHOW_STREAM_MAX_PAGE_SIZE
+        page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
 
         streams: Dict[str, List[SnowflakeStream]] = {}
 

@@ -743,3 +773,137 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 )
             )
         return procedures
+
+    @serialized_lru_cache(maxsize=1)
+    def get_dynamic_table_graph_info(self, db_name: str) -> Dict[str, Dict[str, Any]]:
+        """Get dynamic table dependency information from information schema."""
+        dt_graph_info: Dict[str, Dict[str, Any]] = {}
+        try:
+            cur = self.connection.query(
+                SnowflakeQuery.get_dynamic_table_graph_history(db_name)
+            )
+            for row in cur:
+                dt_name = row["NAME"]
+                dt_graph_info[dt_name] = {
+                    "inputs": row.get("INPUTS"),
+                    "target_lag_type": row.get("TARGET_LAG_TYPE"),
+                    "target_lag_sec": row.get("TARGET_LAG_SEC"),
+                    "scheduling_state": row.get("SCHEDULING_STATE"),
+                    "alter_trigger": row.get("ALTER_TRIGGER"),
+                }
+            logger.debug(
+                f"Successfully retrieved graph info for {len(dt_graph_info)} dynamic tables in {db_name}"
+            )
+        except Exception as e:
+            self.report.warning(
+                "Failed to get dynamic table graph history",
+                db_name,
+                exc=e,
+            )
+
+        return dt_graph_info
+
+    @serialized_lru_cache(maxsize=1)
+    def get_dynamic_tables_with_definitions(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeDynamicTable]]:
+        """Get dynamic tables with their definitions using SHOW DYNAMIC TABLES."""
+        page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
+        dynamic_tables: Dict[str, List[SnowflakeDynamicTable]] = {}
+
+        # Get graph/dependency information (pass db_name)
+        dt_graph_info = self.get_dynamic_table_graph_info(db_name)
+
+        first_iteration = True
+        dt_pagination_marker: Optional[str] = None
+
+        while first_iteration or dt_pagination_marker is not None:
+            try:
+                cur = self.connection.query(
+                    SnowflakeQuery.show_dynamic_tables_for_database(
+                        db_name,
+                        limit=page_limit,
+                        dynamic_table_pagination_marker=dt_pagination_marker,
+                    )
+                )
+
+                first_iteration = False
+                dt_pagination_marker = None
+                result_set_size = 0
+
+                for dt in cur:
+                    result_set_size += 1
+
+                    dt_name = dt["name"]
+                    schema_name = dt["schema_name"]
+
+                    if schema_name not in dynamic_tables:
+                        dynamic_tables[schema_name] = []
+
+                    # Get definition from SHOW result
+                    definition = dt.get("text")
+
+                    # Get target lag from SHOW result or graph info
+                    target_lag = dt.get("target_lag")
+                    if not target_lag and dt_graph_info:
+                        qualified_name = f"{db_name}.{schema_name}.{dt_name}"
+                        graph_info = dt_graph_info.get(qualified_name, {})
+                        if graph_info.get("target_lag_type") and graph_info.get(
+                            "target_lag_sec"
+                        ):
+                            target_lag = f"{graph_info['target_lag_sec']} {graph_info['target_lag_type']}"
+
+                    dynamic_tables[schema_name].append(
+                        SnowflakeDynamicTable(
+                            name=dt_name,
+                            created=dt["created_on"],
+                            last_altered=dt.get("created_on"),
+                            size_in_bytes=dt.get("bytes", 0),
+                            rows_count=dt.get("rows", 0),
+                            comment=dt.get("comment"),
+                            definition=definition,
+                            target_lag=target_lag,
+                            is_dynamic=True,
+                            type="DYNAMIC TABLE",
+                        )
+                    )
+
+                if result_set_size >= page_limit:
+                    logger.info(
+                        f"Fetching next page of dynamic tables for {db_name} - after {dt_name}"
+                    )
+                    dt_pagination_marker = dt_name
+
+            except Exception as e:
+                logger.debug(
+                    f"Failed to get dynamic tables for database {db_name}: {e}"
+                )
+                break
+
+        return dynamic_tables
+
+    def populate_dynamic_table_definitions(
+        self, tables: Dict[str, List[SnowflakeTable]], db_name: str
+    ) -> None:
+        """Populate dynamic table definitions for tables that are marked as dynamic."""
+        try:
+            # Get dynamic tables with definitions from SHOW command
+            dt_with_definitions = self.get_dynamic_tables_with_definitions(db_name)
+
+            for schema_name, table_list in tables.items():
+                for table in table_list:
+                    if (
+                        isinstance(table, SnowflakeDynamicTable)
+                        and table.definition is None
+                    ):
+                        # Find matching dynamic table from SHOW results
+                        show_dt_list = dt_with_definitions.get(schema_name, [])
+                        for show_dt in show_dt_list:
+                            if show_dt.name == table.name:
+                                table.definition = show_dt.definition
+                                table.target_lag = show_dt.target_lag
+                                break
+        except Exception as e:
+            logger.debug(
+                f"Failed to populate dynamic table definitions for {db_name}: {e}"
+            )
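One detail worth calling out in get_dynamic_tables_with_definitions: when the SHOW result lacks a target_lag, the value is reconstructed from the DYNAMIC_TABLE_GRAPH_HISTORY() row as "{target_lag_sec} {target_lag_type}". A self-contained sketch of that fallback, with illustrative (not real) row values:

from typing import Any, Dict, Optional

def resolve_target_lag(
    show_row: Dict[str, Any],
    graph_info: Dict[str, Dict[str, Any]],
    qualified_name: str,
) -> Optional[str]:
    # Prefer the lag reported directly by SHOW DYNAMIC TABLES.
    target_lag = show_row.get("target_lag")
    if target_lag:
        return target_lag
    # Fall back to the graph-history row, keyed by "db.schema.name".
    info = graph_info.get(qualified_name, {})
    if info.get("target_lag_type") and info.get("target_lag_sec"):
        return f"{info['target_lag_sec']} {info['target_lag_type']}"
    return None

# Illustrative inputs:
show_row = {"name": "ORDERS_DT", "target_lag": None}
graph = {
    "SALES.PUBLIC.ORDERS_DT": {
        "target_lag_sec": 3600,
        "target_lag_type": "USER_DEFINED",
    }
}
assert resolve_target_lag(show_row, graph, "SALES.PUBLIC.ORDERS_DT") == "3600 USER_DEFINED"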
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -45,6 +45,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeColumn,
     SnowflakeDatabase,
     SnowflakeDataDictionary,
+    SnowflakeDynamicTable,
     SnowflakeFK,
     SnowflakePK,
     SnowflakeSchema,

@@ -182,7 +183,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.identifiers: SnowflakeIdentifierBuilder = identifiers
 
         self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
-            connection=self.connection
+            connection=self.connection, report=self.report
         )
         self.report.data_dictionary_cache = self.data_dictionary
 

@@ -495,6 +496,22 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         if self.config.include_technical_schema:
             data_reader = self.make_data_reader()
         for table in tables:
+            # Handle dynamic table definitions for lineage
+            if (
+                isinstance(table, SnowflakeDynamicTable)
+                and table.definition
+                and self.aggregator
+            ):
+                table_identifier = self.identifiers.get_dataset_identifier(
+                    table.name, schema_name, db_name
+                )
+                self.aggregator.add_view_definition(
+                    view_urn=self.identifiers.gen_dataset_urn(table_identifier),
+                    view_definition=table.definition,
+                    default_db=db_name,
+                    default_schema=schema_name,
+                )
+
             table_wu_generator = self._process_table(
                 table, snowflake_schema, db_name
             )

@@ -935,6 +952,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             }
         )
 
+        if isinstance(table, SnowflakeDynamicTable):
+            if table.target_lag:
+                custom_properties["TARGET_LAG"] = table.target_lag
+
         if isinstance(table, SnowflakeView) and table.is_secure:
             custom_properties["IS_SECURE"] = "true"
 

@@ -980,7 +1001,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             schema_name,
             db_name,
             (
-                SnowflakeObjectDomain.TABLE
+                SnowflakeObjectDomain.DYNAMIC_TABLE
+                if isinstance(table, SnowflakeTable) and table.is_dynamic
+                else SnowflakeObjectDomain.TABLE
                 if isinstance(table, SnowflakeTable)
                 else SnowflakeObjectDomain.VIEW
             ),
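The net effect of the first schema-generator hunk is that a dynamic table's defining SELECT is registered with the SQL-parsing aggregator exactly as a view definition would be, so lineage is later derived by parsing that query. A rough sketch of the shape of the call, with a stub standing in for the real aggregator:

from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class StubAggregator:
    # Stand-in that just records what the schema generator would feed
    # to the real SQL-parsing aggregator.
    view_definitions: List[Tuple[str, str]] = field(default_factory=list)

    def add_view_definition(
        self, view_urn: str, view_definition: str, default_db: str, default_schema: str
    ) -> None:
        self.view_definitions.append((view_urn, view_definition))

aggregator = StubAggregator()
# The dynamic table's URN is paired with its defining query; the parser can then
# resolve unqualified names like raw_orders against default_db/default_schema.
aggregator.add_view_definition(
    view_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.public.orders_dt,PROD)",
    view_definition="SELECT order_id, amount FROM raw_orders",
    default_db="SALES",
    default_schema="PUBLIC",
)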
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -93,9 +93,20 @@ class SnowsightUrlBuilder:
         table_name: str,
         schema_name: str,
         db_name: str,
-        domain: Literal[SnowflakeObjectDomain.TABLE, SnowflakeObjectDomain.VIEW],
+        domain: Literal[
+            SnowflakeObjectDomain.TABLE,
+            SnowflakeObjectDomain.VIEW,
+            SnowflakeObjectDomain.DYNAMIC_TABLE,
+        ],
     ) -> Optional[str]:
-        return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{domain}/{table_name}/"
+        # For dynamic tables, use the dynamic-table domain in the URL path
+        # Ensure only explicitly dynamic tables use dynamic-table URL path
+        url_domain = (
+            "dynamic-table"
+            if domain == SnowflakeObjectDomain.DYNAMIC_TABLE
+            else str(domain)
+        )
+        return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{url_domain}/{table_name}/"

@@ -129,6 +140,7 @@ class SnowflakeFilter:
             SnowflakeObjectDomain.MATERIALIZED_VIEW,
             SnowflakeObjectDomain.ICEBERG_TABLE,
             SnowflakeObjectDomain.STREAM,
+            SnowflakeObjectDomain.DYNAMIC_TABLE,
         ):
             return False
         if _is_sys_table(dataset_name):

@@ -160,7 +172,8 @@ class SnowflakeFilter:
             return False
 
         if dataset_type.lower() in {
-            SnowflakeObjectDomain.TABLE
+            SnowflakeObjectDomain.TABLE,
+            SnowflakeObjectDomain.DYNAMIC_TABLE,
         } and not self.filter_config.table_pattern.allowed(
             _cleanup_qualified_name(dataset_name, self.structured_reporter)
         ):
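The URL change is easiest to see with concrete values: dynamic tables now map to a "dynamic-table" path segment in Snowsight links, while tables and views keep their domain string verbatim. A small illustration with made-up names and base URL, assuming the enum's string value is "dynamic table" (the space-separated form the other domains use):

def snowsight_table_url(base_url: str, db: str, schema: str, name: str, domain: str) -> str:
    # Mirrors SnowsightUrlBuilder.get_external_url_for_table above; the enum's
    # string value is assumed to be "dynamic table" here.
    url_domain = "dynamic-table" if domain == "dynamic table" else domain
    return f"{base_url}#/data/databases/{db}/schemas/{schema}/{url_domain}/{name}/"

base = "https://app.snowflake.com/myorg/myaccount/"  # made-up base URL
print(snowsight_table_url(base, "SALES", "PUBLIC", "ORDERS_DT", "dynamic table"))
# -> https://app.snowflake.com/myorg/myaccount/#/data/databases/SALES/schemas/PUBLIC/dynamic-table/ORDERS_DT/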
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -171,7 +171,9 @@ class SnowflakeV2Source(
         )
 
         # For database, schema, tables, views, etc
-        self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
+        self.data_dictionary = SnowflakeDataDictionary(
+            connection=self.connection, report=self.report
+        )
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
 
         self.discovered_datasets: Optional[List[str]] = None

@@ -600,6 +602,7 @@ class SnowflakeV2Source(
             include_query_usage_statistics=self.config.include_query_usage_statistics,
             user_email_pattern=self.config.user_email_pattern,
             pushdown_deny_usernames=self.config.pushdown_deny_usernames,
+            pushdown_allow_usernames=self.config.pushdown_allow_usernames,
             query_dedup_strategy=self.config.query_dedup_strategy,
             push_down_database_pattern_access_history=self.config.push_down_database_pattern_access_history,
             additional_database_names_allowlist=self.config.additional_database_names_allowlist,
datahub/ingestion/source/sql/mssql/source.py

@@ -1,7 +1,7 @@
 import logging
 import re
 import urllib.parse
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import pydantic
 import sqlalchemy.dialects.mssql

@@ -41,7 +41,6 @@ from datahub.ingestion.source.sql.mssql.job_models import (
 )
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
-    SqlWorkUnit,
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import (

@@ -346,28 +345,6 @@ class SQLServerSource(SQLAlchemySource):
                 exc=e,
             )
 
-    def get_schema_level_workunits(
-        self,
-        inspector: Inspector,
-        schema: str,
-        database: str,
-    ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
-        yield from super().get_schema_level_workunits(
-            inspector=inspector,
-            schema=schema,
-            database=database,
-        )
-        if self.config.include_stored_procedures:
-            try:
-                yield from self.loop_stored_procedures(inspector, schema, self.config)
-            except Exception as e:
-                self.report.failure(
-                    message="Failed to list stored procedures",
-                    title="SQL Server Stored Procedures Extraction",
-                    context="Error occurred during schema-level stored procedure extraction",
-                    exc=e,
-                )
-
     def _detect_rds_environment(self, conn: Connection) -> bool:
         """
         Detect if we're running in an RDS/managed environment vs on-premises.

@@ -636,7 +613,7 @@ class SQLServerSource(SQLAlchemySource):
         self,
         inspector: Inspector,
         schema: str,
-        sql_config: SQLServerConfig,
+        sql_config: SQLServerConfig,  # type: ignore
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop schema data for get stored procedures as dataJob-s.
datahub/ingestion/source/sql/mysql.py

@@ -1,5 +1,7 @@
 # This import verifies that the dependencies are available.
 
+from typing import List
+
 import pymysql  # noqa: F401
 from pydantic.fields import Field
 from sqlalchemy import util

@@ -7,6 +9,7 @@ from sqlalchemy.dialects.mysql import BIT, base
 from sqlalchemy.dialects.mysql.enumerated import SET
 from sqlalchemy.engine.reflection import Inspector
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,

@@ -20,6 +23,9 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+)
 from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,

@@ -58,6 +64,17 @@ class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
     def get_identifier(self, *, schema: str, table: str) -> str:
         return f"{schema}.{table}"
 
+    include_stored_procedures: bool = Field(
+        default=True,
+        description="Include ingest of stored procedures.",
+    )
+
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion."
+        "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
 
 @platform_name("MySQL")
 @config_class(MySQLConfig)

@@ -95,3 +112,40 @@ class MySQLSource(TwoTierSQLAlchemySource):
         self.profile_metadata_info.dataset_name_to_storage_bytes[
             f"{row.TABLE_SCHEMA}.{row.TABLE_NAME}"
         ] = row.DATA_LENGTH
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        """
+        Get stored procedures for a specific schema.
+        """
+        base_procedures = []
+        with inspector.engine.connect() as conn:
+            procedures = conn.execute(
+                """
+                SELECT ROUTINE_NAME AS name,
+                       ROUTINE_DEFINITION AS definition,
+                       EXTERNAL_LANGUAGE AS language
+                FROM information_schema.ROUTINES
+                WHERE ROUTINE_TYPE = 'PROCEDURE'
+                AND ROUTINE_SCHEMA = %s
+                """,
+                (schema,),
+            )
+
+            procedure_rows = list(procedures)
+            for row in procedure_rows:
+                base_procedures.append(
+                    BaseProcedure(
+                        name=row.name,
+                        language=row.language,
+                        argument_signature=None,
+                        return_type=None,
+                        procedure_definition=row.definition,
+                        created=None,
+                        last_altered=None,
+                        extra_properties=None,
+                        comment=None,
+                    )
+                )
+        return base_procedures