acryl-datahub 0.15.0.5rc10__py3-none-any.whl → 0.15.0.6rc2__py3-none-any.whl
This diff compares publicly available package versions as released to their public registries. It is provided for informational purposes only.
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/METADATA +2482 -2482
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/RECORD +35 -24
- datahub/_version.py +1 -1
- datahub/errors.py +35 -0
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/mongodb.py +17 -16
- datahub/ingestion/source/powerbi/config.py +1 -0
- datahub/ingestion/source/powerbi/powerbi.py +28 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
- datahub/ingestion/source/s3/source.py +14 -5
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
- datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
- datahub/ingestion/source/snowflake/snowflake_report.py +6 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +108 -4
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +298 -69
- datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
- datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
- datahub/sdk/__init__.py +33 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_attribution.py +48 -0
- datahub/sdk/_entity.py +89 -0
- datahub/sdk/_shared.py +338 -0
- datahub/sdk/container.py +193 -0
- datahub/sdk/dataset.py +584 -0
- datahub/sdk/entity_client.py +115 -0
- datahub/sdk/main_client.py +56 -0
- datahub/sdk/resolver_client.py +101 -0
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/snowflake/snowflake_report.py
+++ b/datahub/ingestion/source/snowflake/snowflake_report.py
@@ -104,6 +104,7 @@ class SnowflakeV2Report(
     schemas_scanned: int = 0
     databases_scanned: int = 0
     tags_scanned: int = 0
+    streams_scanned: int = 0
 
     include_usage_stats: bool = False
     include_operational_stats: bool = False
@@ -113,6 +114,7 @@ class SnowflakeV2Report(
     table_lineage_query_secs: float = -1
    external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
+    num_streams_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
     num_secure_views_missing_definition: int = 0
     num_structured_property_templates_created: int = 0
@@ -131,6 +133,8 @@ class SnowflakeV2Report(
     num_get_tags_for_object_queries: int = 0
     num_get_tags_on_columns_for_table_queries: int = 0
 
+    num_get_streams_for_schema_queries: int = 0
+
     rows_zero_objects_modified: int = 0
 
     _processed_tags: MutableSet[str] = field(default_factory=set)
@@ -157,6 +161,8 @@ class SnowflakeV2Report(
                 return
             self._scanned_tags.add(name)
             self.tags_scanned += 1
+        elif ent_type == "stream":
+            self.streams_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
 
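The report changes are mechanical: a new streams_scanned counter plus an elif branch in report_entity_scanned. A minimal, self-contained sketch of the same dispatch pattern (the Report class below is a hypothetical stand-in for SnowflakeV2Report, keeping only the fields relevant to this change):

from dataclasses import dataclass

@dataclass
class Report:
    tables_scanned: int = 0
    streams_scanned: int = 0

    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
        if ent_type == "table":
            self.tables_scanned += 1
        elif ent_type == "stream":
            self.streams_scanned += 1
        else:
            raise KeyError(f"Unknown entity {ent_type}.")

report = Report()
report.report_entity_scanned("SALES.PUBLIC.ORDERS_STREAM", "stream")
assert report.streams_scanned == 1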
--- a/datahub/ingestion/source/snowflake/snowflake_schema.py
+++ b/datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -6,6 +6,7 @@ from datetime import datetime
 from typing import Callable, Dict, Iterable, List, MutableMapping, Optional
 
 from datahub.ingestion.api.report import SupportsAsObj
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection
 from datahub.ingestion.source.snowflake.snowflake_query import (
@@ -14,7 +15,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import (
 )
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
 from datahub.utilities.file_backed_collections import FileBackedDict
-from datahub.utilities.prefix_batch_builder import build_prefix_batches
+from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
 from datahub.utilities.serialized_lru_cache import serialized_lru_cache
 
 logger: logging.Logger = logging.getLogger(__name__)
@@ -100,6 +101,9 @@ class SnowflakeTable(BaseTable):
     def is_hybrid(self) -> bool:
         return self.type is not None and self.type == "HYBRID TABLE"
 
+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.TABLE
+
 
 @dataclass
 class SnowflakeView(BaseView):
@@ -109,6 +113,9 @@ class SnowflakeView(BaseView):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_secure: bool = False
 
+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.VIEW
+
 
 @dataclass
 class SnowflakeSchema:
@@ -118,6 +125,7 @@ class SnowflakeSchema:
     comment: Optional[str]
     tables: List[str] = field(default_factory=list)
     views: List[str] = field(default_factory=list)
+    streams: List[str] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
 
 
@@ -131,6 +139,32 @@ class SnowflakeDatabase:
     tags: Optional[List[SnowflakeTag]] = None
 
 
+@dataclass
+class SnowflakeStream:
+    name: str
+    created: datetime
+    owner: str
+    source_type: str
+    type: str
+    stale: str
+    mode: str
+    invalid_reason: str
+    owner_role_type: str
+    database_name: str
+    schema_name: str
+    table_name: str
+    comment: Optional[str]
+    columns: List[SnowflakeColumn] = field(default_factory=list)
+    stale_after: Optional[datetime] = None
+    base_tables: Optional[str] = None
+    tags: Optional[List[SnowflakeTag]] = None
+    column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    last_altered: Optional[datetime] = None
+
+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.SNOWFLAKE_STREAM
+
+
 class _SnowflakeTagCache:
     def __init__(self) -> None:
         # self._database_tags[<database_name>] = list of tags applied to database
@@ -208,6 +242,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
             self.get_tables_for_database,
             self.get_views_for_database,
             self.get_columns_for_schema,
+            self.get_streams_for_database,
             self.get_pk_constraints_for_schema,
             self.get_fk_constraints_for_schema,
         ]
@@ -431,9 +466,18 @@ class SnowflakeDataDictionary(SupportsAsObj):
         # For massive schemas, use a FileBackedDict to avoid memory issues.
         columns = FileBackedDict()
 
-        object_batches = build_prefix_batches(
-            all_objects, max_batch_size=10000, max_groups_in_batch=5
-        )
+        # Single prefix table case (for streams)
+        if len(all_objects) == 1:
+            object_batches = [
+                [PrefixGroup(prefix=all_objects[0], names=[], exact_match=True)]
+            ]
+        else:
+            # Build batches for full schema scan
+            object_batches = build_prefix_batches(
+                all_objects, max_batch_size=10000, max_groups_in_batch=5
+            )
+
+        # Process batches
         for batch_index, object_batch in enumerate(object_batches):
             if batch_index > 0:
                 logger.info(
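The batching change special-cases the single-object lookup that streams need: when only one source table's columns are requested, an exact-match PrefixGroup avoids prefix-batching the entire schema. A standalone sketch of the branch, with a stub PrefixGroup in place of the real class from datahub.utilities.prefix_batch_builder:

from dataclasses import dataclass, field
from typing import List

@dataclass
class PrefixGroup:
    # Stub carrying the same fields the diff uses.
    prefix: str
    names: List[str] = field(default_factory=list)
    exact_match: bool = False

def plan_batches(all_objects: List[str]) -> List[List[PrefixGroup]]:
    if len(all_objects) == 1:
        # Exactly one object: filter on its full name, no prefix scan.
        return [[PrefixGroup(prefix=all_objects[0], names=[], exact_match=True)]]
    # Many objects: the real code delegates to build_prefix_batches with
    # max_batch_size=10000, max_groups_in_batch=5; simplified here.
    return [[PrefixGroup(prefix="", names=all_objects)]]

assert plan_batches(["ORDERS"])[0][0].exact_match is True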
@@ -611,3 +655,63 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     tags[column_name].append(snowflake_tag)
 
         return tags
+
+    @serialized_lru_cache(maxsize=1)
+    def get_streams_for_database(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeStream]]:
+        page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
+
+        streams: Dict[str, List[SnowflakeStream]] = {}
+
+        first_iteration = True
+        stream_pagination_marker: Optional[str] = None
+        while first_iteration or stream_pagination_marker is not None:
+            cur = self.connection.query(
+                SnowflakeQuery.streams_for_database(
+                    db_name,
+                    limit=page_limit,
+                    stream_pagination_marker=stream_pagination_marker,
+                )
+            )
+
+            first_iteration = False
+            stream_pagination_marker = None
+
+            result_set_size = 0
+            for stream in cur:
+                result_set_size += 1
+
+                stream_name = stream["name"]
+                schema_name = stream["schema_name"]
+                if schema_name not in streams:
+                    streams[schema_name] = []
+                streams[stream["schema_name"]].append(
+                    SnowflakeStream(
+                        name=stream["name"],
+                        created=stream["created_on"],
+                        owner=stream["owner"],
+                        comment=stream["comment"],
+                        source_type=stream["source_type"],
+                        type=stream["type"],
+                        stale=stream["stale"],
+                        mode=stream["mode"],
+                        database_name=stream["database_name"],
+                        schema_name=stream["schema_name"],
+                        invalid_reason=stream["invalid_reason"],
+                        owner_role_type=stream["owner_role_type"],
+                        stale_after=stream["stale_after"],
+                        table_name=stream["table_name"],
+                        base_tables=stream["base_tables"],
+                        last_altered=stream["created_on"],
+                    )
+                )
+
+            if result_set_size >= page_limit:
+                # If we hit the limit, we need to send another request to get the next page.
+                logger.info(
+                    f"Fetching next page of streams for {db_name} - after {stream_name}"
+                )
+                stream_pagination_marker = stream_name
+
+        return streams
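get_streams_for_database pages through SHOW STREAMS: it requests up to SHOW_VIEWS_MAX_PAGE_SIZE rows and, whenever a full page comes back, re-issues the query with the last stream name as the pagination marker. The same loop shape in isolation, against a fake result source:

from typing import List, Optional

PAGE_LIMIT = 3
ALL_ROWS = [f"STREAM_{i}" for i in range(7)]  # pretend database contents

def fake_show_streams(limit: int, after: Optional[str]) -> List[str]:
    # Stand-in for connection.query(SnowflakeQuery.streams_for_database(...))
    start = 0 if after is None else ALL_ROWS.index(after) + 1
    return ALL_ROWS[start : start + limit]

collected: List[str] = []
first_iteration = True
marker: Optional[str] = None
while first_iteration or marker is not None:
    page = fake_show_streams(PAGE_LIMIT, marker)
    first_iteration = False
    marker = None
    for name in page:
        collected.append(name)
    if len(page) >= PAGE_LIMIT:
        # A full page means there may be more rows after the last one seen.
        marker = page[-1]

assert collected == ALL_ROWS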
--- a/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
+++ b/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -21,7 +21,6 @@ from datahub.ingestion.glossary.classification_mixin import (
 from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
-    DatasetSubTypes,
 )
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
@@ -48,6 +47,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeFK,
     SnowflakePK,
     SnowflakeSchema,
+    SnowflakeStream,
     SnowflakeTable,
     SnowflakeTag,
     SnowflakeView,
@@ -58,6 +58,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
     SnowsightUrlBuilder,
+    split_qualified_name,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
@@ -70,6 +71,7 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source_report.ingestion_stage import (
     EXTERNAL_TABLE_DDL_LINEAGE,
+    LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -81,6 +83,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
     TimeStamp,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
+    DatasetLineageTypeClass,
     DatasetProperties,
     ViewProperties,
 )
@@ -420,73 +423,126 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         schema_name = snowflake_schema.name
 
         if self.config.extract_tags != TagOption.skip:
-            snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
-                schema_name=schema_name, db_name=db_name, domain="schema"
-            )
+            self._process_tags(snowflake_schema, schema_name, db_name, domain="schema")
 
         if self.config.include_technical_schema:
             yield from self.gen_schema_containers(snowflake_schema, db_name)
 
+        tables, views, streams = [], [], []
+
         if self.config.include_tables:
             tables = self.fetch_tables_for_schema(
                 snowflake_schema, db_name, schema_name
             )
+            db_tables[schema_name] = tables
+            yield from self._process_tables(
+                tables, snowflake_schema, db_name, schema_name
+            )
+
         if self.config.include_views:
             views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
+            yield from self._process_views(
+                views, snowflake_schema, db_name, schema_name
+            )
 
-        if self.config.include_technical_schema:
-            data_reader = self.make_data_reader()
-            for table in tables:
-                table_wu_generator = self._process_table(
-                    table, snowflake_schema, db_name
-                )
-                yield from classification_workunit_processor(
-                    table_wu_generator,
-                    self.classification_handler,
-                    data_reader,
-                    [db_name, schema_name, table.name],
-                )
+        if self.config.include_streams:
+            self.report.num_get_streams_for_schema_queries += 1
+            streams = self.fetch_streams_for_schema(
+                snowflake_schema, db_name, schema_name
+            )
+            yield from self._process_streams(streams, snowflake_schema, db_name)
 
-            if self.aggregator:
-                for view in views:
-                    view_identifier = self.identifiers.get_dataset_identifier(
-                        view.name, schema_name, db_name
-                    )
-                    if view.is_secure and not view.view_definition:
-                        view.view_definition = self.fetch_secure_view_definition(
-                            view.name, schema_name, db_name
-                        )
-                    if view.view_definition:
-                        self.aggregator.add_view_definition(
-                            view_urn=self.identifiers.gen_dataset_urn(view_identifier),
-                            view_definition=view.view_definition,
-                            default_db=db_name,
-                            default_schema=schema_name,
-                        )
-                    elif view.is_secure:
-                        self.report.num_secure_views_missing_definition += 1
+        if self.config.include_technical_schema and snowflake_schema.tags:
+            yield from self._process_tags_in_schema(snowflake_schema)
 
-            if self.config.include_technical_schema:
-                for view in views:
-                    yield from self._process_view(view, snowflake_schema, db_name)
-
-        if self.config.include_technical_schema and snowflake_schema.tags:
-            for tag in snowflake_schema.tags:
-                yield from self._process_tag(tag)
-
-        if not snowflake_schema.views and not snowflake_schema.tables:
-            self.structured_reporter.info(
-                title="No tables/views found in schema",
-                message="If tables exist, please grant REFERENCES or SELECT permissions on them.",
-                context=f"{db_name}.{schema_name}",
-            )
+        if (
+            not snowflake_schema.views
+            and not snowflake_schema.tables
+            and not snowflake_schema.streams
+        ):
+            self.structured_reporter.info(
+                title="No tables/views/streams found in schema",
+                message="If objects exist, please grant REFERENCES or SELECT permissions on them.",
+                context=f"{db_name}.{schema_name}",
+            )
+
+    def _process_tags(
+        self,
+        snowflake_schema: SnowflakeSchema,
+        schema_name: str,
+        db_name: str,
+        domain: str,
+    ) -> None:
+        snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
+            schema_name=schema_name, db_name=db_name, domain=domain
+        )
+
+    def _process_tables(
+        self,
+        tables: List[SnowflakeTable],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+        schema_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_technical_schema:
+            data_reader = self.make_data_reader()
+            for table in tables:
+                table_wu_generator = self._process_table(
+                    table, snowflake_schema, db_name
+                )
+                yield from classification_workunit_processor(
+                    table_wu_generator,
+                    self.classification_handler,
+                    data_reader,
+                    [db_name, schema_name, table.name],
+                )
+
+    def _process_views(
+        self,
+        views: List[SnowflakeView],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+        schema_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.aggregator:
+            for view in views:
+                view_identifier = self.identifiers.get_dataset_identifier(
+                    view.name, schema_name, db_name
+                )
+                if view.is_secure and not view.view_definition:
+                    view.view_definition = self.fetch_secure_view_definition(
+                        view.name, schema_name, db_name
+                    )
+                if view.view_definition:
+                    self.aggregator.add_view_definition(
+                        view_urn=self.identifiers.gen_dataset_urn(view_identifier),
+                        view_definition=view.view_definition,
+                        default_db=db_name,
+                        default_schema=schema_name,
+                    )
+                elif view.is_secure:
+                    self.report.num_secure_views_missing_definition += 1
+
+        if self.config.include_technical_schema:
+            for view in views:
+                yield from self._process_view(view, snowflake_schema, db_name)
+
+    def _process_streams(
+        self,
+        streams: List[SnowflakeStream],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        for stream in streams:
+            yield from self._process_stream(stream, snowflake_schema, db_name)
+
+    def _process_tags_in_schema(
+        self, snowflake_schema: SnowflakeSchema
+    ) -> Iterable[MetadataWorkUnit]:
+        if snowflake_schema.tags:
+            for tag in snowflake_schema.tags:
+                yield from self._process_tag(tag)
 
     def fetch_secure_view_definition(
         self, table_name: str, schema_name: str, db_name: str
     ) -> Optional[str]:
@@ -729,7 +785,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def gen_dataset_workunits(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> Iterable[MetadataWorkUnit]:
@@ -786,13 +842,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         if dpi_aspect:
             yield dpi_aspect
 
-        subTypes = SubTypes(
-            typeNames=(
-                [DatasetSubTypes.VIEW]
-                if isinstance(table, SnowflakeView)
-                else [DatasetSubTypes.TABLE]
-            )
-        )
+        subTypes = SubTypes(typeNames=[table.get_subtype()])
 
         yield MetadataChangeProposalWrapper(
             entityUrn=dataset_urn, aspect=subTypes
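Replacing the isinstance ladder with table.get_subtype() moves subtype knowledge onto the entity classes (the get_subtype methods added in snowflake_schema.py above), so gen_dataset_workunits no longer branches per type. The pattern in miniature, with hypothetical stand-in classes and subtype strings:

class Table:
    def get_subtype(self) -> str:
        return "Table"

class View(Table):
    def get_subtype(self) -> str:
        return "View"

class Stream(Table):
    def get_subtype(self) -> str:
        return "Snowflake Stream"

# The caller stays a single expression, whatever the concrete type:
assert [e.get_subtype() for e in (Table(), View(), Stream())] == [
    "Table",
    "View",
    "Snowflake Stream",
]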
@@ -843,28 +893,50 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def get_dataset_properties(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> DatasetProperties:
         custom_properties = {}
 
         if isinstance(table, SnowflakeTable):
-            if table.clustering_key:
-                custom_properties["CLUSTERING_KEY"] = table.clustering_key
-
-            if table.is_hybrid:
-                custom_properties["IS_HYBRID"] = "true"
-
-            if table.is_dynamic:
-                custom_properties["IS_DYNAMIC"] = "true"
-
-            if table.is_iceberg:
-                custom_properties["IS_ICEBERG"] = "true"
+            custom_properties.update(
+                {
+                    k: v
+                    for k, v in {
+                        "CLUSTERING_KEY": table.clustering_key,
+                        "IS_HYBRID": "true" if table.is_hybrid else None,
+                        "IS_DYNAMIC": "true" if table.is_dynamic else None,
+                        "IS_ICEBERG": "true" if table.is_iceberg else None,
+                    }.items()
+                    if v
+                }
+            )
 
         if isinstance(table, SnowflakeView) and table.is_secure:
             custom_properties["IS_SECURE"] = "true"
 
+        elif isinstance(table, SnowflakeStream):
+            custom_properties.update(
+                {
+                    k: v
+                    for k, v in {
+                        "SOURCE_TYPE": table.source_type,
+                        "TYPE": table.type,
+                        "STALE": table.stale,
+                        "MODE": table.mode,
+                        "INVALID_REASON": table.invalid_reason,
+                        "OWNER_ROLE_TYPE": table.owner_role_type,
+                        "TABLE_NAME": table.table_name,
+                        "BASE_TABLES": table.base_tables,
+                        "STALE_AFTER": (
+                            table.stale_after.isoformat() if table.stale_after else None
+                        ),
+                    }.items()
+                    if v
+                }
+            )
+
         return DatasetProperties(
             name=table.name,
             created=(
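Both property blocks rely on the same comprehension, {k: v for k, v in {...}.items() if v}, to drop empty values, so blank SHOW STREAMS fields such as invalid_reason never become dataset properties. The filter in isolation (note that the string "false" is truthy and survives):

raw = {
    "SOURCE_TYPE": "Table",
    "TYPE": "DELTA",
    "STALE": "false",      # non-empty string: kept
    "INVALID_REASON": "",  # empty string: dropped
    "STALE_AFTER": None,   # None: dropped
}
custom_properties = {k: v for k, v in raw.items() if v}
assert custom_properties == {"SOURCE_TYPE": "Table", "TYPE": "DELTA", "STALE": "false"}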
@@ -909,7 +981,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         ).as_workunit()
 
     def gen_column_tags_as_structured_properties(
-        self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
+        self,
+        dataset_urn: str,
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
     ) -> Iterable[MetadataWorkUnit]:
         for column_name in table.column_tags:
             schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
@@ -922,7 +996,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def gen_schema_metadata(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> SchemaMetadata:
@@ -1214,3 +1288,158 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 "External table ddl lineage extraction failed",
                 exc=e,
             )
+
+    def fetch_streams_for_schema(
+        self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
+    ) -> List[SnowflakeStream]:
+        try:
+            streams: List[SnowflakeStream] = []
+            for stream in self.get_streams_for_schema(schema_name, db_name):
+                stream_identifier = self.identifiers.get_dataset_identifier(
+                    stream.name, schema_name, db_name
+                )
+
+                self.report.report_entity_scanned(stream_identifier, "stream")
+
+                if not self.filters.is_dataset_pattern_allowed(
+                    stream_identifier, SnowflakeObjectDomain.STREAM
+                ):
+                    self.report.report_dropped(stream_identifier)
+                else:
+                    streams.append(stream)
+            snowflake_schema.streams = [stream.name for stream in streams]
+            return streams
+        except Exception as e:
+            if isinstance(e, SnowflakePermissionError):
+                error_msg = f"Failed to get streams for schema {db_name}.{schema_name}. Please check permissions."
+                raise SnowflakePermissionError(error_msg) from e.__cause__
+            else:
+                self.structured_reporter.warning(
+                    "Failed to get streams for schema",
+                    f"{db_name}.{schema_name}",
+                    exc=e,
+                )
+                return []
+
+    def get_streams_for_schema(
+        self, schema_name: str, db_name: str
+    ) -> List[SnowflakeStream]:
+        streams = self.data_dictionary.get_streams_for_database(db_name)
+
+        return streams.get(schema_name, [])
+
+    def _process_stream(
+        self,
+        stream: SnowflakeStream,
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        schema_name = snowflake_schema.name
+
+        try:
+            # Retrieve and register the schema without metadata to prevent columns from mapping upstream
+            stream.columns = self.get_columns_for_stream(stream.table_name)
+            yield from self.gen_dataset_workunits(stream, schema_name, db_name)
+
+            if self.config.include_column_lineage:
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.populate_stream_upstreams(stream, db_name, schema_name)
+
+        except Exception as e:
+            self.structured_reporter.warning(
+                "Failed to get columns for stream:", stream.name, exc=e
+            )
+
+    def get_columns_for_stream(
+        self,
+        source_object: str,  # Qualified name of source table/view
+    ) -> List[SnowflakeColumn]:
+        """
+        Get column information for a stream by getting source object columns and adding metadata columns.
+        Stream includes all columns from source object plus metadata columns like:
+        - METADATA$ACTION
+        - METADATA$ISUPDATE
+        - METADATA$ROW_ID
+        """
+        columns: List[SnowflakeColumn] = []
+
+        source_parts = split_qualified_name(source_object)
+
+        source_db, source_schema, source_name = source_parts
+
+        # Get columns from source object
+        source_columns = self.data_dictionary.get_columns_for_schema(
+            source_schema, source_db, itertools.chain([source_name])
+        ).get(source_name, [])
+
+        # Add all source columns
+        columns.extend(source_columns)
+
+        # Add standard stream metadata columns
+        metadata_columns = [
+            SnowflakeColumn(
+                name="METADATA$ACTION",
+                ordinal_position=len(columns) + 1,
+                is_nullable=False,
+                data_type="VARCHAR",
+                comment="Type of DML operation (INSERT/DELETE)",
+                character_maximum_length=10,
+                numeric_precision=None,
+                numeric_scale=None,
+            ),
+            SnowflakeColumn(
+                name="METADATA$ISUPDATE",
+                ordinal_position=len(columns) + 2,
+                is_nullable=False,
+                data_type="BOOLEAN",
+                comment="Whether row is from UPDATE operation",
+                character_maximum_length=None,
+                numeric_precision=None,
+                numeric_scale=None,
+            ),
+            SnowflakeColumn(
+                name="METADATA$ROW_ID",
+                ordinal_position=len(columns) + 3,
+                is_nullable=False,
+                data_type="NUMBER",
+                comment="Unique row identifier",
+                character_maximum_length=None,
+                numeric_precision=38,
+                numeric_scale=0,
+            ),
+        ]
+
+        columns.extend(metadata_columns)
+
+        return columns
+
+    def populate_stream_upstreams(
+        self, stream: SnowflakeStream, db_name: str, schema_name: str
+    ) -> None:
+        """
+        Populate Streams upstream tables
+        """
+        self.report.num_streams_with_known_upstreams += 1
+        if self.aggregator:
+            source_parts = split_qualified_name(stream.table_name)
+            source_db, source_schema, source_name = source_parts
+
+            dataset_identifier = self.identifiers.get_dataset_identifier(
+                stream.name, schema_name, db_name
+            )
+            dataset_urn = self.identifiers.gen_dataset_urn(dataset_identifier)
+
+            upstream_identifier = self.identifiers.get_dataset_identifier(
+                source_name, source_schema, source_db
+            )
+            upstream_urn = self.identifiers.gen_dataset_urn(upstream_identifier)
+
+            logger.debug(
+                f"""upstream_urn: {upstream_urn}, downstream_urn: {dataset_urn}"""
+            )
+
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=upstream_urn,
+                downstream_urn=dataset_urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )