acryl-datahub 1.2.0.4rc3__py3-none-any.whl → 1.2.0.5rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/METADATA +2604 -2604
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/RECORD +40 -38
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/external/external_entities.py +500 -15
- datahub/ingestion/source/aws/glue.py +18 -14
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/tag_entities.py +82 -104
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/hex/api.py +2 -0
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/looker/looker_common.py +26 -0
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
- datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +2 -25
- datahub/ingestion/source/sql/mysql.py +54 -0
- datahub/ingestion/source/sql/postgres.py +5 -134
- datahub/ingestion/source/sql/sql_common.py +137 -0
- datahub/ingestion/source/superset.py +140 -56
- datahub/ingestion/source/unity/config.py +11 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +20 -6
- datahub/ingestion/source/unity/report.py +9 -1
- datahub/ingestion/source/unity/source.py +51 -16
- datahub/ingestion/source/unity/tag_entities.py +49 -147
- datahub/metadata/_internal_schema_classes.py +514 -514
- datahub/metadata/_urns/urn_defs.py +1684 -1684
- datahub/metadata/schema.avsc +16680 -16281
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Operation.avsc +4 -2
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc3.dist-info → acryl_datahub-1.2.0.5rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -3,16 +3,17 @@ import os
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Callable, Dict, Iterable, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional
 
 from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection
 from datahub.ingestion.source.snowflake.snowflake_query import (
-    SHOW_VIEWS_MAX_PAGE_SIZE,
+    SHOW_COMMAND_MAX_PAGE_SIZE,
     SnowflakeQuery,
 )
+from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
 from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
 from datahub.utilities.file_backed_collections import FileBackedDict
@@ -103,6 +104,17 @@ class SnowflakeTable(BaseTable):
         return DatasetSubTypes.TABLE
 
 
+@dataclass
+class SnowflakeDynamicTable(SnowflakeTable):
+    definition: Optional[str] = (
+        None  # SQL query that defines the dynamic table's content
+    )
+    target_lag: Optional[str] = None  # Refresh frequency (e.g., "1 HOUR", "30 MINUTES")
+
+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.DYNAMIC_TABLE
+
+
 @dataclass
 class SnowflakeView(BaseView):
     materialized: bool = False
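For orientation, a minimal sketch of the new subclass in use. The keyword arguments mirror the fields this diff itself passes when constructing SnowflakeDynamicTable (see the SHOW DYNAMIC TABLES hunk further down); the concrete values are made up:

    # Hypothetical instance; definition and target_lag are the new fields.
    dt = SnowflakeDynamicTable(
        name="ORDERS_ROLLUP",
        type="DYNAMIC TABLE",
        created=None,
        last_altered=None,
        size_in_bytes=0,
        rows_count=0,
        comment=None,
        is_dynamic=True,
        definition="SELECT customer_id, SUM(amount) FROM orders GROUP BY 1",
        target_lag="1 HOUR",
    )
    assert dt.get_subtype() == DatasetSubTypes.DYNAMIC_TABLE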
@@ -226,8 +238,11 @@ class _SnowflakeTagCache:
 
 
 class SnowflakeDataDictionary(SupportsAsObj):
-    def __init__(self, connection: SnowflakeConnection) -> None:
+    def __init__(
+        self, connection: SnowflakeConnection, report: SnowflakeV2Report
+    ) -> None:
         self.connection = connection
+        self.report = report
 
     def as_obj(self) -> Dict[str, Dict[str, int]]:
         # TODO: Move this into a proper report type that gets computed.
@@ -355,8 +370,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
             if table["TABLE_SCHEMA"] not in tables:
                 tables[table["TABLE_SCHEMA"]] = []
 
+            is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
+            table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
+
             tables[table["TABLE_SCHEMA"]].append(
-                SnowflakeTable(
+                table_cls(
                     name=table["TABLE_NAME"],
                     type=table["TABLE_TYPE"],
                     created=table["CREATED"],
@@ -365,11 +383,15 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
-                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_dynamic=is_dynamic,
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                     is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
+
+        # Populate dynamic table definitions
+        self.populate_dynamic_table_definitions(tables, db_name)
+
         return tables
 
     def get_tables_for_schema(
@@ -382,8 +404,11 @@ class SnowflakeDataDictionary(SupportsAsObj):
         )
 
         for table in cur:
+            is_dynamic = table.get("IS_DYNAMIC", "NO").upper() == "YES"
+            table_cls = SnowflakeDynamicTable if is_dynamic else SnowflakeTable
+
             tables.append(
-                SnowflakeTable(
+                table_cls(
                     name=table["TABLE_NAME"],
                     type=table["TABLE_TYPE"],
                     created=table["CREATED"],
@@ -392,16 +417,21 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
-                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_dynamic=is_dynamic,
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                     is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
+
+        # Populate dynamic table definitions for just this schema
+        schema_tables = {schema_name: tables}
+        self.populate_dynamic_table_definitions(schema_tables, db_name)
+
         return tables
 
     @serialized_lru_cache(maxsize=1)
     def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]]:
-        page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
+        page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
 
         views: Dict[str, List[SnowflakeView]] = {}
 
@@ -660,7 +690,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
     def get_streams_for_database(
         self, db_name: str
     ) -> Dict[str, List[SnowflakeStream]]:
-        page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
+        page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
 
         streams: Dict[str, List[SnowflakeStream]] = {}
 
@@ -743,3 +773,137 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 )
             )
         return procedures
+
+    @serialized_lru_cache(maxsize=1)
+    def get_dynamic_table_graph_info(self, db_name: str) -> Dict[str, Dict[str, Any]]:
+        """Get dynamic table dependency information from information schema."""
+        dt_graph_info: Dict[str, Dict[str, Any]] = {}
+        try:
+            cur = self.connection.query(
+                SnowflakeQuery.get_dynamic_table_graph_history(db_name)
+            )
+            for row in cur:
+                dt_name = row["NAME"]
+                dt_graph_info[dt_name] = {
+                    "inputs": row.get("INPUTS"),
+                    "target_lag_type": row.get("TARGET_LAG_TYPE"),
+                    "target_lag_sec": row.get("TARGET_LAG_SEC"),
+                    "scheduling_state": row.get("SCHEDULING_STATE"),
+                    "alter_trigger": row.get("ALTER_TRIGGER"),
+                }
+            logger.debug(
+                f"Successfully retrieved graph info for {len(dt_graph_info)} dynamic tables in {db_name}"
+            )
+        except Exception as e:
+            self.report.warning(
+                "Failed to get dynamic table graph history",
+                db_name,
+                exc=e,
+            )
+
+        return dt_graph_info
+
+    @serialized_lru_cache(maxsize=1)
+    def get_dynamic_tables_with_definitions(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeDynamicTable]]:
+        """Get dynamic tables with their definitions using SHOW DYNAMIC TABLES."""
+        page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
+        dynamic_tables: Dict[str, List[SnowflakeDynamicTable]] = {}
+
+        # Get graph/dependency information (pass db_name)
+        dt_graph_info = self.get_dynamic_table_graph_info(db_name)
+
+        first_iteration = True
+        dt_pagination_marker: Optional[str] = None
+
+        while first_iteration or dt_pagination_marker is not None:
+            try:
+                cur = self.connection.query(
+                    SnowflakeQuery.show_dynamic_tables_for_database(
+                        db_name,
+                        limit=page_limit,
+                        dynamic_table_pagination_marker=dt_pagination_marker,
+                    )
+                )
+
+                first_iteration = False
+                dt_pagination_marker = None
+                result_set_size = 0
+
+                for dt in cur:
+                    result_set_size += 1
+
+                    dt_name = dt["name"]
+                    schema_name = dt["schema_name"]
+
+                    if schema_name not in dynamic_tables:
+                        dynamic_tables[schema_name] = []
+
+                    # Get definition from SHOW result
+                    definition = dt.get("text")
+
+                    # Get target lag from SHOW result or graph info
+                    target_lag = dt.get("target_lag")
+                    if not target_lag and dt_graph_info:
+                        qualified_name = f"{db_name}.{schema_name}.{dt_name}"
+                        graph_info = dt_graph_info.get(qualified_name, {})
+                        if graph_info.get("target_lag_type") and graph_info.get(
+                            "target_lag_sec"
+                        ):
+                            target_lag = f"{graph_info['target_lag_sec']} {graph_info['target_lag_type']}"
+
+                    dynamic_tables[schema_name].append(
+                        SnowflakeDynamicTable(
+                            name=dt_name,
+                            created=dt["created_on"],
+                            last_altered=dt.get("created_on"),
+                            size_in_bytes=dt.get("bytes", 0),
+                            rows_count=dt.get("rows", 0),
+                            comment=dt.get("comment"),
+                            definition=definition,
+                            target_lag=target_lag,
+                            is_dynamic=True,
+                            type="DYNAMIC TABLE",
+                        )
+                    )
+
+                if result_set_size >= page_limit:
+                    logger.info(
+                        f"Fetching next page of dynamic tables for {db_name} - after {dt_name}"
+                    )
+                    dt_pagination_marker = dt_name
+
+            except Exception as e:
+                logger.debug(
+                    f"Failed to get dynamic tables for database {db_name}: {e}"
+                )
+                break
+
+        return dynamic_tables
+
+    def populate_dynamic_table_definitions(
+        self, tables: Dict[str, List[SnowflakeTable]], db_name: str
+    ) -> None:
+        """Populate dynamic table definitions for tables that are marked as dynamic."""
+        try:
+            # Get dynamic tables with definitions from SHOW command
+            dt_with_definitions = self.get_dynamic_tables_with_definitions(db_name)
+
+            for schema_name, table_list in tables.items():
+                for table in table_list:
+                    if (
+                        isinstance(table, SnowflakeDynamicTable)
+                        and table.definition is None
+                    ):
+                        # Find matching dynamic table from SHOW results
+                        show_dt_list = dt_with_definitions.get(schema_name, [])
+                        for show_dt in show_dt_list:
+                            if show_dt.name == table.name:
+                                table.definition = show_dt.definition
+                                table.target_lag = show_dt.target_lag
+                                break
+        except Exception as e:
+            logger.debug(
+                f"Failed to populate dynamic table definitions for {db_name}: {e}"
+            )
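The fetch above pages with a keyset marker instead of OFFSET: whenever a page comes back full (result_set_size >= page_limit), the last table name seen becomes the marker for the next SHOW DYNAMIC TABLES call (presumably SHOW ... LIMIT <n> FROM '<name>' under the hood). A minimal usage sketch, with the connection and report objects assumed to exist:

    # Sketch only: wiring mirrors what SnowflakeSchemaGenerator does below.
    data_dictionary = SnowflakeDataDictionary(connection=connection, report=report)
    by_schema = data_dictionary.get_dynamic_tables_with_definitions("ANALYTICS_DB")
    for schema_name, dts in by_schema.items():
        for dt in dts:
            print(schema_name, dt.name, dt.target_lag, (dt.definition or "")[:40])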
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -45,6 +45,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeColumn,
     SnowflakeDatabase,
     SnowflakeDataDictionary,
+    SnowflakeDynamicTable,
     SnowflakeFK,
     SnowflakePK,
     SnowflakeSchema,
@@ -182,7 +183,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.identifiers: SnowflakeIdentifierBuilder = identifiers
 
         self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
-            connection=self.connection
+            connection=self.connection, report=self.report
         )
         self.report.data_dictionary_cache = self.data_dictionary
 
@@ -495,6 +496,22 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         if self.config.include_technical_schema:
             data_reader = self.make_data_reader()
         for table in tables:
+            # Handle dynamic table definitions for lineage
+            if (
+                isinstance(table, SnowflakeDynamicTable)
+                and table.definition
+                and self.aggregator
+            ):
+                table_identifier = self.identifiers.get_dataset_identifier(
+                    table.name, schema_name, db_name
+                )
+                self.aggregator.add_view_definition(
+                    view_urn=self.identifiers.gen_dataset_urn(table_identifier),
+                    view_definition=table.definition,
+                    default_db=db_name,
+                    default_schema=schema_name,
+                )
+
             table_wu_generator = self._process_table(
                 table, snowflake_schema, db_name
             )
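Worth noting: the dynamic table's defining SELECT is registered through the same add_view_definition pathway used for views, so its upstream lineage comes from SQL parsing of that definition. A condensed sketch of the registration step, using the helper names from the hunk above with illustrative values:

    # Sketch only: identifiers/aggregator are the objects used in the hunk above.
    table_identifier = identifiers.get_dataset_identifier(
        "orders_rollup", "PUBLIC", "ANALYTICS_DB"
    )
    aggregator.add_view_definition(
        view_urn=identifiers.gen_dataset_urn(table_identifier),
        view_definition="SELECT customer_id, SUM(amount) AS total FROM orders GROUP BY 1",
        default_db="ANALYTICS_DB",
        default_schema="PUBLIC",
    )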
@@ -935,6 +952,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             }
         )
 
+        if isinstance(table, SnowflakeDynamicTable):
+            if table.target_lag:
+                custom_properties["TARGET_LAG"] = table.target_lag
+
         if isinstance(table, SnowflakeView) and table.is_secure:
             custom_properties["IS_SECURE"] = "true"
 
@@ -980,7 +1001,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             schema_name,
             db_name,
             (
-                SnowflakeObjectDomain.TABLE
+                SnowflakeObjectDomain.DYNAMIC_TABLE
+                if isinstance(table, SnowflakeTable) and table.is_dynamic
+                else SnowflakeObjectDomain.TABLE
                 if isinstance(table, SnowflakeTable)
                 else SnowflakeObjectDomain.VIEW
             ),
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -93,9 +93,20 @@ class SnowsightUrlBuilder:
         table_name: str,
         schema_name: str,
         db_name: str,
-        domain: Literal[SnowflakeObjectDomain.TABLE, SnowflakeObjectDomain.VIEW],
+        domain: Literal[
+            SnowflakeObjectDomain.TABLE,
+            SnowflakeObjectDomain.VIEW,
+            SnowflakeObjectDomain.DYNAMIC_TABLE,
+        ],
     ) -> Optional[str]:
-        return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{domain}/{table_name}/"
+        # For dynamic tables, use the dynamic-table domain in the URL path
+        # Ensure only explicitly dynamic tables use dynamic-table URL path
+        url_domain = (
+            "dynamic-table"
+            if domain == SnowflakeObjectDomain.DYNAMIC_TABLE
+            else str(domain)
+        )
+        return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{url_domain}/{table_name}/"
 
     def get_external_url_for_schema(
         self, schema_name: str, db_name: str
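The effect is that Snowsight deep links for dynamic tables use the dynamic-table path segment. Assuming the enclosing method is the class's get_external_url_for_table (its signature is what this hunk edits), an illustrative call:

    url = url_builder.get_external_url_for_table(
        table_name="ORDERS_ROLLUP",
        schema_name="PUBLIC",
        db_name="ANALYTICS_DB",
        domain=SnowflakeObjectDomain.DYNAMIC_TABLE,
    )
    # -> "<snowsight_base_url>#/data/databases/ANALYTICS_DB/schemas/PUBLIC/dynamic-table/ORDERS_ROLLUP/"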
@@ -129,6 +140,7 @@ class SnowflakeFilter:
             SnowflakeObjectDomain.MATERIALIZED_VIEW,
             SnowflakeObjectDomain.ICEBERG_TABLE,
             SnowflakeObjectDomain.STREAM,
+            SnowflakeObjectDomain.DYNAMIC_TABLE,
         ):
             return False
         if _is_sys_table(dataset_name):
@@ -160,7 +172,8 @@ class SnowflakeFilter:
             return False
 
         if dataset_type.lower() in {
-            SnowflakeObjectDomain.TABLE
+            SnowflakeObjectDomain.TABLE,
+            SnowflakeObjectDomain.DYNAMIC_TABLE,
         } and not self.filter_config.table_pattern.allowed(
             _cleanup_qualified_name(dataset_name, self.structured_reporter)
         ):
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -171,7 +171,9 @@ class SnowflakeV2Source(
         )
 
         # For database, schema, tables, views, etc
-        self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
+        self.data_dictionary = SnowflakeDataDictionary(
+            connection=self.connection, report=self.report
+        )
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
 
         self.discovered_datasets: Optional[List[str]] = None
datahub/ingestion/source/sql/mssql/source.py

@@ -1,7 +1,7 @@
 import logging
 import re
 import urllib.parse
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import pydantic
 import sqlalchemy.dialects.mssql
@@ -41,7 +41,6 @@ from datahub.ingestion.source.sql.mssql.job_models import (
 )
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
-    SqlWorkUnit,
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import (
@@ -346,28 +345,6 @@ class SQLServerSource(SQLAlchemySource):
                 exc=e,
             )
 
-    def get_schema_level_workunits(
-        self,
-        inspector: Inspector,
-        schema: str,
-        database: str,
-    ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
-        yield from super().get_schema_level_workunits(
-            inspector=inspector,
-            schema=schema,
-            database=database,
-        )
-        if self.config.include_stored_procedures:
-            try:
-                yield from self.loop_stored_procedures(inspector, schema, self.config)
-            except Exception as e:
-                self.report.failure(
-                    message="Failed to list stored procedures",
-                    title="SQL Server Stored Procedures Extraction",
-                    context="Error occurred during schema-level stored procedure extraction",
-                    exc=e,
-                )
-
     def _detect_rds_environment(self, conn: Connection) -> bool:
         """
         Detect if we're running in an RDS/managed environment vs on-premises.
@@ -636,7 +613,7 @@ class SQLServerSource(SQLAlchemySource):
         self,
         inspector: Inspector,
         schema: str,
-        sql_config: SQLServerConfig,
+        sql_config: SQLServerConfig,  # type: ignore
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop schema data for get stored procedures as dataJob-s.
datahub/ingestion/source/sql/mysql.py

@@ -1,5 +1,7 @@
 # This import verifies that the dependencies are available.
 
+from typing import List
+
 import pymysql  # noqa: F401
 from pydantic.fields import Field
 from sqlalchemy import util
@@ -7,6 +9,7 @@ from sqlalchemy.dialects.mysql import BIT, base
 from sqlalchemy.dialects.mysql.enumerated import SET
 from sqlalchemy.engine.reflection import Inspector
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -20,6 +23,9 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+)
 from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
@@ -58,6 +64,17 @@ class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
     def get_identifier(self, *, schema: str, table: str) -> str:
         return f"{schema}.{table}"
 
+    include_stored_procedures: bool = Field(
+        default=True,
+        description="Include ingest of stored procedures.",
+    )
+
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion."
+        "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
 
 @platform_name("MySQL")
 @config_class(MySQLConfig)
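For reference, a sketch of how the two new MySQL options might be set in a recipe-style config. parse_obj and the host_port/database fields come from the existing connection config classes; the values are illustrative:

    config = MySQLConfig.parse_obj(
        {
            "host_port": "localhost:3306",
            "database": "shop",
            "include_stored_procedures": True,
            # Deny procedures whose qualified name matches this regex.
            "procedure_pattern": {"deny": ["shop\\.internal_.*"]},
        }
    )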
@@ -95,3 +112,40 @@ class MySQLSource(TwoTierSQLAlchemySource):
         self.profile_metadata_info.dataset_name_to_storage_bytes[
             f"{row.TABLE_SCHEMA}.{row.TABLE_NAME}"
         ] = row.DATA_LENGTH
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        """
+        Get stored procedures for a specific schema.
+        """
+        base_procedures = []
+        with inspector.engine.connect() as conn:
+            procedures = conn.execute(
+                """
+                SELECT ROUTINE_NAME AS name,
+                       ROUTINE_DEFINITION AS definition,
+                       EXTERNAL_LANGUAGE AS language
+                FROM information_schema.ROUTINES
+                WHERE ROUTINE_TYPE = 'PROCEDURE'
+                AND ROUTINE_SCHEMA = %s
+                """,
+                (schema,),
+            )
+
+            procedure_rows = list(procedures)
+            for row in procedure_rows:
+                base_procedures.append(
+                    BaseProcedure(
+                        name=row.name,
+                        language=row.language,
+                        argument_signature=None,
+                        return_type=None,
+                        procedure_definition=row.definition,
+                        created=None,
+                        last_altered=None,
+                        extra_properties=None,
+                        comment=None,
+                    )
+                )
+        return base_procedures
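Note the call pattern in the new method: a raw SQL string plus a positional parameter tuple is passed through SQLAlchemy to the DBAPI driver, so %s is bound rather than interpolated into the string. A standalone equivalent against pymysql directly (connection details illustrative):

    import pymysql

    conn = pymysql.connect(host="localhost", user="user", password="secret", database="shop")
    with conn.cursor() as cursor:
        cursor.execute(
            "SELECT ROUTINE_NAME FROM information_schema.ROUTINES "
            "WHERE ROUTINE_TYPE = 'PROCEDURE' AND ROUTINE_SCHEMA = %s",
            ("shop",),  # bound by the driver, not string-formatted
        )
        print(cursor.fetchall())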
datahub/ingestion/source/sql/postgres.py

@@ -36,14 +36,8 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
-from datahub.ingestion.source.sql.sql_utils import (
-    gen_database_key,
-    gen_schema_key,
-)
 from datahub.ingestion.source.sql.stored_procedures.base import (
     BaseProcedure,
-    generate_procedure_container_workunits,
-    generate_procedure_workunits,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayTypeClass,
@@ -132,10 +126,12 @@ class PostgresConfig(BasePostgresConfig):
             "Note: this is not used if `database` or `sqlalchemy_uri` are provided."
         ),
     )
+
     include_stored_procedures: bool = Field(
         default=True,
         description="Include ingest of stored procedures.",
     )
+
     procedure_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for stored procedures to filter in ingestion."
@@ -310,73 +306,6 @@ class PostgresSource(SQLAlchemySource):
         except Exception as e:
             logger.error(f"failed to fetch profile metadata: {e}")
 
-    def get_schema_level_workunits(
-        self,
-        inspector: Inspector,
-        schema: str,
-        database: str,
-    ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
-        yield from super().get_schema_level_workunits(
-            inspector=inspector,
-            schema=schema,
-            database=database,
-        )
-
-        if self.config.include_stored_procedures:
-            try:
-                yield from self.loop_stored_procedures(inspector, schema, self.config)
-            except Exception as e:
-                self.report.failure(
-                    title="Failed to list stored procedures for schema",
-                    message="An error occurred while listing procedures for the schema.",
-                    context=f"{database}.{schema}",
-                    exc=e,
-                )
-
-    def loop_stored_procedures(
-        self,
-        inspector: Inspector,
-        schema: str,
-        config: PostgresConfig,
-    ) -> Iterable[MetadataWorkUnit]:
-        """
-        Loop schema data for get stored procedures as dataJob-s.
-        """
-        db_name = self.get_db_name(inspector)
-
-        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
-        if procedures:
-            yield from self._process_procedures(procedures, db_name, schema)
-
-    def fetch_procedures_for_schema(
-        self, inspector: Inspector, schema: str, db_name: str
-    ) -> List[BaseProcedure]:
-        try:
-            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
-                inspector, schema, db_name
-            )
-            procedures: List[BaseProcedure] = []
-            for procedure in raw_procedures:
-                procedure_qualified_name = self.get_identifier(
-                    schema=schema,
-                    entity=procedure.name,
-                    inspector=inspector,
-                )
-
-                if not self.config.procedure_pattern.allowed(procedure_qualified_name):
-                    self.report.report_dropped(procedure_qualified_name)
-                else:
-                    procedures.append(procedure)
-            return procedures
-        except Exception as e:
-            self.report.warning(
-                title="Failed to get procedures for schema",
-                message="An error occurred while fetching procedures for the schema.",
-                context=f"{db_name}.{schema}",
-                exc=e,
-            )
-            return []
-
     def get_procedures_for_schema(
         self, inspector: Inspector, schema: str, db_name: str
     ) -> List[BaseProcedure]:
@@ -401,10 +330,9 @@ class PostgresSource(SQLAlchemySource):
                     pg_language l ON l.oid = p.prolang
                 WHERE
                     p.prokind = 'p'
-                    AND n.nspname =
-
-
-                """
+                    AND n.nspname = %s;
+                """,
+                (schema,),
             )
 
             procedure_rows = list(procedures)
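With this change the schema name is passed as a bound parameter: psycopg2 substitutes %s with proper quoting, so an unusual or hostile schema name cannot change the query's structure. A minimal standalone sketch of the same pattern (connection string illustrative):

    import psycopg2

    conn = psycopg2.connect("dbname=shop user=user")
    with conn.cursor() as cur:
        cur.execute(
            "SELECT p.proname FROM pg_catalog.pg_proc p "
            "JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace "
            "WHERE p.prokind = 'p' AND n.nspname = %s",
            ("public",),  # quoted by the driver
        )
        print(cur.fetchall())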
@@ -423,60 +351,3 @@ class PostgresSource(SQLAlchemySource):
                 )
             )
         return base_procedures
-
-    def _process_procedures(
-        self,
-        procedures: List[BaseProcedure],
-        db_name: str,
-        schema: str,
-    ) -> Iterable[MetadataWorkUnit]:
-        if procedures:
-            yield from generate_procedure_container_workunits(
-                database_key=gen_database_key(
-                    database=db_name,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_key=gen_schema_key(
-                    db_name=db_name,
-                    schema=schema,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-            )
-        for procedure in procedures:
-            yield from self._process_procedure(procedure, schema, db_name)
-
-    def _process_procedure(
-        self,
-        procedure: BaseProcedure,
-        schema: str,
-        db_name: str,
-    ) -> Iterable[MetadataWorkUnit]:
-        try:
-            yield from generate_procedure_workunits(
-                procedure=procedure,
-                database_key=gen_database_key(
-                    database=db_name,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_key=gen_schema_key(
-                    db_name=db_name,
-                    schema=schema,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_resolver=self.get_schema_resolver(),
-            )
-        except Exception as e:
-            self.report.warning(
-                title="Failed to emit stored procedure",
-                message="An error occurred while emitting stored procedure",
-                context=procedure.name,
-                exc=e,
-            )