acryl-datahub 0.15.0.5rc10__py3-none-any.whl → 0.15.0.6rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/METADATA +2394 -2394
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/RECORD +22 -22
- datahub/_version.py +1 -1
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/powerbi/config.py +1 -0
- datahub/ingestion/source/powerbi/powerbi.py +28 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
- datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
- datahub/ingestion/source/snowflake/snowflake_report.py +6 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +98 -4
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +294 -62
- datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
- datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -14,7 +14,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import (
 )
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
 from datahub.utilities.file_backed_collections import FileBackedDict
-from datahub.utilities.prefix_batch_builder import build_prefix_batches
+from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
 from datahub.utilities.serialized_lru_cache import serialized_lru_cache
 
 logger: logging.Logger = logging.getLogger(__name__)
@@ -118,6 +118,7 @@ class SnowflakeSchema:
     comment: Optional[str]
     tables: List[str] = field(default_factory=list)
     views: List[str] = field(default_factory=list)
+    streams: List[str] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
 
 
@@ -131,6 +132,29 @@ class SnowflakeDatabase:
     tags: Optional[List[SnowflakeTag]] = None
 
 
+@dataclass
+class SnowflakeStream:
+    name: str
+    created: datetime
+    owner: str
+    source_type: str
+    type: str
+    stale: str
+    mode: str
+    invalid_reason: str
+    owner_role_type: str
+    database_name: str
+    schema_name: str
+    table_name: str
+    comment: Optional[str]
+    columns: List[SnowflakeColumn] = field(default_factory=list)
+    stale_after: Optional[datetime] = None
+    base_tables: Optional[str] = None
+    tags: Optional[List[SnowflakeTag]] = None
+    column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    last_altered: Optional[datetime] = None
+
+
 class _SnowflakeTagCache:
     def __init__(self) -> None:
         # self._database_tags[<database_name>] = list of tags applied to database
@@ -208,6 +232,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
             self.get_tables_for_database,
             self.get_views_for_database,
             self.get_columns_for_schema,
+            self.get_streams_for_database,
             self.get_pk_constraints_for_schema,
             self.get_fk_constraints_for_schema,
         ]
@@ -431,9 +456,18 @@ class SnowflakeDataDictionary(SupportsAsObj):
         # For massive schemas, use a FileBackedDict to avoid memory issues.
         columns = FileBackedDict()
 
-        object_batches = build_prefix_batches(
-            all_objects, max_batch_size=10000, max_groups_in_batch=5
-        )
+        # Single prefix table case (for streams)
+        if len(all_objects) == 1:
+            object_batches = [
+                [PrefixGroup(prefix=all_objects[0], names=[], exact_match=True)]
+            ]
+        else:
+            # Build batches for full schema scan
+            object_batches = build_prefix_batches(
+                all_objects, max_batch_size=10000, max_groups_in_batch=5
+            )
+
+        # Process batches
         for batch_index, object_batch in enumerate(object_batches):
             if batch_index > 0:
                 logger.info(
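A note on the single-object branch above: when the caller asks for the columns of exactly one object (the stream source-table lookup added later in this release), building prefix batches over a one-element list is wasted work, so the code issues a single exact-match group instead. A minimal sketch of the two shapes, assuming PrefixGroup and build_prefix_batches behave as their call sites here suggest (the table names are illustrative):

```python
from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches

# One object: a single batch containing a single exact-match group, i.e. one
# information_schema query scoped to exactly this table name.
single = [[PrefixGroup(prefix="ORDERS", names=[], exact_match=True)]]

# Many objects: group names by shared prefix, bounding both the number of
# groups per query and the number of names per batch.
many = build_prefix_batches(
    ["ORDERS", "ORDERS_ARCHIVE", "USERS"], max_batch_size=10000, max_groups_in_batch=5
)
```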
@@ -611,3 +645,63 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 tags[column_name].append(snowflake_tag)
 
         return tags
+
+    @serialized_lru_cache(maxsize=1)
+    def get_streams_for_database(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeStream]]:
+        page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
+
+        streams: Dict[str, List[SnowflakeStream]] = {}
+
+        first_iteration = True
+        stream_pagination_marker: Optional[str] = None
+        while first_iteration or stream_pagination_marker is not None:
+            cur = self.connection.query(
+                SnowflakeQuery.streams_for_database(
+                    db_name,
+                    limit=page_limit,
+                    stream_pagination_marker=stream_pagination_marker,
+                )
+            )
+
+            first_iteration = False
+            stream_pagination_marker = None
+
+            result_set_size = 0
+            for stream in cur:
+                result_set_size += 1
+
+                stream_name = stream["name"]
+                schema_name = stream["schema_name"]
+                if schema_name not in streams:
+                    streams[schema_name] = []
+                streams[stream["schema_name"]].append(
+                    SnowflakeStream(
+                        name=stream["name"],
+                        created=stream["created_on"],
+                        owner=stream["owner"],
+                        comment=stream["comment"],
+                        source_type=stream["source_type"],
+                        type=stream["type"],
+                        stale=stream["stale"],
+                        mode=stream["mode"],
+                        database_name=stream["database_name"],
+                        schema_name=stream["schema_name"],
+                        invalid_reason=stream["invalid_reason"],
+                        owner_role_type=stream["owner_role_type"],
+                        stale_after=stream["stale_after"],
+                        table_name=stream["table_name"],
+                        base_tables=stream["base_tables"],
+                        last_altered=stream["created_on"],
+                    )
+                )
+
+            if result_set_size >= page_limit:
+                # If we hit the limit, we need to send another request to get the next page.
+                logger.info(
+                    f"Fetching next page of streams for {db_name} - after {stream_name}"
+                )
+                stream_pagination_marker = stream_name
+
+        return streams
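The method above follows Snowflake's SHOW-command paging model: request up to page_limit rows, and if a full page comes back, issue the query again with the last-seen stream name as the pagination marker. A stripped-down sketch of the same loop, with a hypothetical fetch_page callable standing in for the connection and SnowflakeQuery.streams_for_database pieces:

```python
from typing import Callable, Iterator, List, Optional

def paginate_show_results(
    fetch_page: Callable[[int, Optional[str]], List[dict]],
    page_limit: int,
) -> Iterator[dict]:
    """fetch_page(limit, marker) is a hypothetical helper wrapping a paginated
    SHOW command; it returns at most `limit` rows starting after `marker`."""
    marker: Optional[str] = None
    first_iteration = True
    while first_iteration or marker is not None:
        first_iteration = False
        rows = fetch_page(page_limit, marker)
        marker = None
        yield from rows
        # A full page may mean more rows exist; page again after the last name.
        if len(rows) >= page_limit:
            marker = rows[-1]["name"]
```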
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -48,6 +48,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeFK,
     SnowflakePK,
     SnowflakeSchema,
+    SnowflakeStream,
     SnowflakeTable,
     SnowflakeTag,
     SnowflakeView,
@@ -58,6 +59,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
     SnowsightUrlBuilder,
+    split_qualified_name,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
@@ -70,6 +72,7 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source_report.ingestion_stage import (
     EXTERNAL_TABLE_DDL_LINEAGE,
+    LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -81,6 +84,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
     TimeStamp,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
+    DatasetLineageTypeClass,
     DatasetProperties,
     ViewProperties,
 )
@@ -420,73 +424,120 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         schema_name = snowflake_schema.name
 
         if self.config.extract_tags != TagOption.skip:
-            snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
-                schema_name=schema_name, db_name=db_name, domain="schema"
-            )
+            self._process_tags(snowflake_schema, schema_name, db_name, domain="schema")
 
         if self.config.include_technical_schema:
             yield from self.gen_schema_containers(snowflake_schema, db_name)
 
-
+        tables, views, streams = [], [], []
+
         if self.config.include_tables:
             tables = self.fetch_tables_for_schema(
                 snowflake_schema, db_name, schema_name
             )
+            db_tables[schema_name] = tables
+            yield from self._process_tables(
+                tables, snowflake_schema, db_name, schema_name
+            )
+
         if self.config.include_views:
             views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
+            yield from self._process_views(
+                views, snowflake_schema, db_name, schema_name
+            )
 
-        if self.config.
-
+        if self.config.include_streams:
+            self.report.num_get_streams_for_schema_queries += 1
+            streams = self.fetch_streams_for_schema(
+                snowflake_schema, db_name, schema_name
+            )
+            yield from self._process_streams(streams, snowflake_schema, db_name)
 
-
-
-            for table in tables:
-                table_wu_generator = self._process_table(
-                    table, snowflake_schema, db_name
-                )
+        if self.config.include_technical_schema and snowflake_schema.tags:
+            yield from self._process_tags_in_schema(snowflake_schema)
 
-
-
-
-
-
-
+        if (
+            not snowflake_schema.views
+            and not snowflake_schema.tables
+            and not snowflake_schema.streams
+        ):
+            self.structured_reporter.info(
+                title="No tables/views/streams found in schema",
+                message="If objects exist, please grant REFERENCES or SELECT permissions on them.",
+                context=f"{db_name}.{schema_name}",
+            )
 
-
-
-
-
+    def _process_tags(self, snowflake_schema, schema_name, db_name, domain):
+        snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
+            schema_name=schema_name, db_name=db_name, domain=domain
+        )
+
+    def _process_tables(
+        self,
+        tables: List[SnowflakeTable],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+        schema_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_technical_schema:
+            data_reader = self.make_data_reader()
+            for table in tables:
+                table_wu_generator = self._process_table(
+                    table, snowflake_schema, db_name
+                )
+                yield from classification_workunit_processor(
+                    table_wu_generator,
+                    self.classification_handler,
+                    data_reader,
+                    [db_name, schema_name, table.name],
+                )
+
+    def _process_views(
+        self,
+        views: List[SnowflakeView],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+        schema_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.aggregator:
+            for view in views:
+                view_identifier = self.identifiers.get_dataset_identifier(
+                    view.name, schema_name, db_name
+                )
+                if view.is_secure and not view.view_definition:
+                    view.view_definition = self.fetch_secure_view_definition(
                         view.name, schema_name, db_name
                     )
-
-
-
-
-
-
-
-
-
-                        default_schema=schema_name,
-                    )
-                elif view.is_secure:
-                    self.report.num_secure_views_missing_definition += 1
+                if view.view_definition:
+                    self.aggregator.add_view_definition(
+                        view_urn=self.identifiers.gen_dataset_urn(view_identifier),
+                        view_definition=view.view_definition,
+                        default_db=db_name,
+                        default_schema=schema_name,
+                    )
+                elif view.is_secure:
+                    self.report.num_secure_views_missing_definition += 1
 
-
-
+        if self.config.include_technical_schema:
+            for view in views:
+                yield from self._process_view(view, snowflake_schema, db_name)
 
-
+    def _process_streams(
+        self,
+        streams: List[SnowflakeStream],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        for stream in streams:
+            yield from self._process_stream(stream, snowflake_schema, db_name)
+
+    def _process_tags_in_schema(
+        self, snowflake_schema: SnowflakeSchema
+    ) -> Iterable[MetadataWorkUnit]:
+        if snowflake_schema.tags:
             for tag in snowflake_schema.tags:
                 yield from self._process_tag(tag)
 
-        if not snowflake_schema.views and not snowflake_schema.tables:
-            self.structured_reporter.info(
-                title="No tables/views found in schema",
-                message="If tables exist, please grant REFERENCES or SELECT permissions on them.",
-                context=f"{db_name}.{schema_name}",
-            )
-
     def fetch_secure_view_definition(
         self, table_name: str, schema_name: str, db_name: str
     ) -> Optional[str]:
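The net effect of the refactor above: the monolithic _process_schema body is split into per-object-type helper generators (_process_tags, _process_tables, _process_views, _process_streams, _process_tags_in_schema) that the caller drains with yield from. A toy illustration of the delegation pattern, using generic names rather than the source's:

```python
from typing import Iterable, List

def process_tables(tables: List[str]) -> Iterable[str]:
    for t in tables:
        yield f"workunit:table:{t}"

def process_streams(streams: List[str]) -> Iterable[str]:
    for s in streams:
        yield f"workunit:stream:{s}"

def process_schema(tables: List[str], streams: List[str]) -> Iterable[str]:
    # Each helper is itself a generator; yield from splices its work units
    # into the caller's output, exactly as the refactored method does.
    yield from process_tables(tables)
    yield from process_streams(streams)

assert list(process_schema(["t1"], ["s1"])) == [
    "workunit:table:t1",
    "workunit:stream:s1",
]
```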
@@ -729,7 +780,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def gen_dataset_workunits(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> Iterable[MetadataWorkUnit]:
@@ -788,7 +839,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
         subTypes = SubTypes(
             typeNames=(
-                [DatasetSubTypes.VIEW]
+                [DatasetSubTypes.SNOWFLAKE_STREAM]
+                if isinstance(table, SnowflakeStream)
+                else [DatasetSubTypes.VIEW]
                 if isinstance(table, SnowflakeView)
                 else [DatasetSubTypes.TABLE]
             )
@@ -843,28 +896,50 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def get_dataset_properties(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> DatasetProperties:
         custom_properties = {}
 
         if isinstance(table, SnowflakeTable):
-
-
-
-
-
-
-
-
-
-
-
+            custom_properties.update(
+                {
+                    k: v
+                    for k, v in {
+                        "CLUSTERING_KEY": table.clustering_key,
+                        "IS_HYBRID": "true" if table.is_hybrid else None,
+                        "IS_DYNAMIC": "true" if table.is_dynamic else None,
+                        "IS_ICEBERG": "true" if table.is_iceberg else None,
+                    }.items()
+                    if v
+                }
+            )
 
         if isinstance(table, SnowflakeView) and table.is_secure:
             custom_properties["IS_SECURE"] = "true"
 
+        elif isinstance(table, SnowflakeStream):
+            custom_properties.update(
+                {
+                    k: v
+                    for k, v in {
+                        "SOURCE_TYPE": table.source_type,
+                        "TYPE": table.type,
+                        "STALE": table.stale,
+                        "MODE": table.mode,
+                        "INVALID_REASON": table.invalid_reason,
+                        "OWNER_ROLE_TYPE": table.owner_role_type,
+                        "TABLE_NAME": table.table_name,
+                        "BASE_TABLES": table.base_tables,
+                        "STALE_AFTER": table.stale_after.isoformat()
+                        if table.stale_after
+                        else None,
+                    }.items()
+                    if v
+                }
+            )
+
         return DatasetProperties(
             name=table.name,
             created=(
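Both update() calls above use the same idiom: assemble a dict of candidate properties, then keep only the truthy entries so absent attributes never show up as empty custom properties. A standalone sketch of the pattern:

```python
def non_empty_properties(candidates: dict) -> dict:
    """Keep only entries whose value is truthy (drops None and "")."""
    return {k: v for k, v in candidates.items() if v}

# Example: only CLUSTERING_KEY survives the filter.
props = non_empty_properties(
    {"CLUSTERING_KEY": "LINEAR(col1)", "IS_HYBRID": None, "IS_DYNAMIC": None}
)
assert props == {"CLUSTERING_KEY": "LINEAR(col1)"}
```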
@@ -909,7 +984,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         ).as_workunit()
 
     def gen_column_tags_as_structured_properties(
-        self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
+        self,
+        dataset_urn: str,
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
     ) -> Iterable[MetadataWorkUnit]:
         for column_name in table.column_tags:
             schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
@@ -922,7 +999,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def gen_schema_metadata(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> SchemaMetadata:
@@ -1214,3 +1291,158 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 "External table ddl lineage extraction failed",
                 exc=e,
             )
+
+    def fetch_streams_for_schema(
+        self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
+    ) -> List[SnowflakeStream]:
+        try:
+            streams: List[SnowflakeStream] = []
+            for stream in self.get_streams_for_schema(schema_name, db_name):
+                stream_identifier = self.identifiers.get_dataset_identifier(
+                    stream.name, schema_name, db_name
+                )
+
+                self.report.report_entity_scanned(stream_identifier, "stream")
+
+                if not self.filters.is_dataset_pattern_allowed(
+                    stream_identifier, SnowflakeObjectDomain.STREAM
+                ):
+                    self.report.report_dropped(stream_identifier)
+                else:
+                    streams.append(stream)
+            snowflake_schema.streams = [stream.name for stream in streams]
+            return streams
+        except Exception as e:
+            if isinstance(e, SnowflakePermissionError):
+                error_msg = f"Failed to get streams for schema {db_name}.{schema_name}. Please check permissions."
+                raise SnowflakePermissionError(error_msg) from e.__cause__
+            else:
+                self.structured_reporter.warning(
+                    "Failed to get streams for schema",
+                    f"{db_name}.{schema_name}",
+                    exc=e,
+                )
+                return []
+
+    def get_streams_for_schema(
+        self, schema_name: str, db_name: str
+    ) -> List[SnowflakeStream]:
+        streams = self.data_dictionary.get_streams_for_database(db_name)
+
+        return streams.get(schema_name, [])
+
+    def _process_stream(
+        self,
+        stream: SnowflakeStream,
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        schema_name = snowflake_schema.name
+
+        try:
+            # Retrieve and register the schema without metadata to prevent columns from mapping upstream
+            stream.columns = self.get_columns_for_stream(stream.table_name)
+            yield from self.gen_dataset_workunits(stream, schema_name, db_name)
+
+            if self.config.include_column_lineage:
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.populate_stream_upstreams(stream, db_name, schema_name)
+
+        except Exception as e:
+            self.structured_reporter.warning(
+                "Failed to get columns for stream:", stream.name, exc=e
+            )
+
+    def get_columns_for_stream(
+        self,
+        source_object: str,  # Qualified name of source table/view
+    ) -> List[SnowflakeColumn]:
+        """
+        Get column information for a stream by getting source object columns and adding metadata columns.
+        Stream includes all columns from source object plus metadata columns like:
+        - METADATA$ACTION
+        - METADATA$ISUPDATE
+        - METADATA$ROW_ID
+        """
+        columns: List[SnowflakeColumn] = []
+
+        source_parts = split_qualified_name(source_object)
+
+        source_db, source_schema, source_name = source_parts
+
+        # Get columns from source object
+        source_columns = self.data_dictionary.get_columns_for_schema(
+            source_schema, source_db, itertools.chain([source_name])
+        ).get(source_name, [])
+
+        # Add all source columns
+        columns.extend(source_columns)
+
+        # Add standard stream metadata columns
+        metadata_columns = [
+            SnowflakeColumn(
+                name="METADATA$ACTION",
+                ordinal_position=len(columns) + 1,
+                is_nullable=False,
+                data_type="VARCHAR",
+                comment="Type of DML operation (INSERT/DELETE)",
+                character_maximum_length=10,
+                numeric_precision=None,
+                numeric_scale=None,
+            ),
+            SnowflakeColumn(
+                name="METADATA$ISUPDATE",
+                ordinal_position=len(columns) + 2,
+                is_nullable=False,
+                data_type="BOOLEAN",
+                comment="Whether row is from UPDATE operation",
+                character_maximum_length=None,
+                numeric_precision=None,
+                numeric_scale=None,
+            ),
+            SnowflakeColumn(
+                name="METADATA$ROW_ID",
+                ordinal_position=len(columns) + 3,
+                is_nullable=False,
+                data_type="NUMBER",
+                comment="Unique row identifier",
+                character_maximum_length=None,
+                numeric_precision=38,
+                numeric_scale=0,
+            ),
+        ]
+
+        columns.extend(metadata_columns)
+
+        return columns
+
+    def populate_stream_upstreams(
+        self, stream: SnowflakeStream, db_name: str, schema_name: str
+    ) -> None:
+        """
+        Populate Streams upstream tables
+        """
+        self.report.num_streams_with_known_upstreams += 1
+        if self.aggregator:
+            source_parts = split_qualified_name(stream.table_name)
+            source_db, source_schema, source_name = source_parts
+
+            dataset_identifier = self.identifiers.get_dataset_identifier(
+                stream.name, schema_name, db_name
+            )
+            dataset_urn = self.identifiers.gen_dataset_urn(dataset_identifier)
+
+            upstream_identifier = self.identifiers.get_dataset_identifier(
+                source_name, source_schema, source_db
+            )
+            upstream_urn = self.identifiers.gen_dataset_urn(upstream_identifier)
+
+            logger.debug(
+                f"""upstream_urn: {upstream_urn}, downstream_urn: {dataset_urn}"""
+            )
+
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=upstream_urn,
+                downstream_urn=dataset_urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )
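populate_stream_upstreams records the stream's source table as a known upstream with lineage type COPY, since a stream is a change-tracking window over exactly one source object. A rough sketch of the URN wiring, following DataHub's dataset URN format; the helper below is illustrative, not the source's API:

```python
def snowflake_dataset_urn(qualified_name: str) -> str:
    # DataHub dataset URNs take the form
    # urn:li:dataset:(urn:li:dataPlatform:<platform>,<name>,<env>)
    return f"urn:li:dataset:(urn:li:dataPlatform:snowflake,{qualified_name.lower()},PROD)"

upstream_urn = snowflake_dataset_urn("analytics.public.orders")           # source table
downstream_urn = snowflake_dataset_urn("analytics.public.orders_stream")  # the stream
```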
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -124,19 +124,20 @@ class SnowflakeFilter:
             SnowflakeObjectDomain.VIEW,
             SnowflakeObjectDomain.MATERIALIZED_VIEW,
             SnowflakeObjectDomain.ICEBERG_TABLE,
+            SnowflakeObjectDomain.STREAM,
         ):
             return False
         if _is_sys_table(dataset_name):
             return False
 
-        dataset_params = _split_qualified_name(dataset_name)
+        dataset_params = split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
                 message=f"Found a {dataset_type} with an unexpected number of parts. Database and schema filtering will not work as expected, but table filtering will still work.",
                 context=dataset_name,
             )
-            # We fall-through here so table/view filtering still works.
+            # We fall-through here so table/view/stream filtering still works.
 
         if (
             len(dataset_params) >= 1
@@ -169,6 +170,14 @@ class SnowflakeFilter:
         ):
             return False
 
+        if (
+            dataset_type.lower() == SnowflakeObjectDomain.STREAM
+            and not self.filter_config.stream_pattern.allowed(
+                _cleanup_qualified_name(dataset_name, self.structured_reporter)
+            )
+        ):
+            return False
+
         return True
 
 
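The new clause gates streams behind a dedicated stream_pattern, mirroring the existing table and view patterns (the field itself comes from the snowflake_config.py change listed at the top of this diff). A minimal sketch of how such an allow/deny pattern behaves, using DataHub's AllowDenyPattern with illustrative values:

```python
from datahub.configuration.common import AllowDenyPattern

# Allow only streams in the ANALYTICS database, except anything named tmp_*.
stream_pattern = AllowDenyPattern(
    allow=[r"analytics\..*"],
    deny=[r".*\.tmp_.*"],
)

assert stream_pattern.allowed("analytics.public.orders_stream")
assert not stream_pattern.allowed("raw.public.orders_stream")
```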
@@ -183,17 +192,17 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
-def _split_qualified_name(qualified_name: str) -> List[str]:
+def split_qualified_name(qualified_name: str) -> List[str]:
     """
     Split a qualified name into its constituent parts.
 
-    >>> _split_qualified_name("db.my_schema.my_table")
+    >>> split_qualified_name("db.my_schema.my_table")
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    >>> split_qualified_name('"db"."my_schema"."my_table"')
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    >>> split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
     ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
-    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    >>> split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
     ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
     """
 
@@ -231,7 +240,7 @@ def _split_qualified_name(qualified_name: str) -> List[str]:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = _split_qualified_name(qualified_name)
+    name_parts = split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(
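As the doctests on split_qualified_name show, dots inside double-quoted identifiers must be treated as literal characters. A self-contained sketch of one way to satisfy that contract; this is an illustration, not the library's actual implementation (it ignores escaped quotes, which a real parser would handle):

```python
from typing import List

def split_quoted(qualified_name: str) -> List[str]:
    """Split on '.' except inside double-quoted identifiers; strip the quotes."""
    parts: List[str] = []
    current: List[str] = []
    in_quotes = False
    for ch in qualified_name:
        if ch == '"':
            in_quotes = not in_quotes  # toggle quoted mode; drop the quote itself
        elif ch == "." and not in_quotes:
            parts.append("".join(current))
            current = []
        else:
            current.append(ch)
    parts.append("".join(current))
    return parts

assert split_quoted('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE') == [
    "TEST_DB",
    "SCHEMA.WITH.DOTS",
    "MY_TABLE",
]
```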
|