acryl-datahub 0.15.0.5rc10__py3-none-any.whl → 0.15.0.6rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (35)
  1. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/METADATA +2482 -2482
  2. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/RECORD +35 -24
  3. datahub/_version.py +1 -1
  4. datahub/errors.py +35 -0
  5. datahub/ingestion/source/common/subtypes.py +1 -0
  6. datahub/ingestion/source/mongodb.py +17 -16
  7. datahub/ingestion/source/powerbi/config.py +1 -0
  8. datahub/ingestion/source/powerbi/powerbi.py +28 -3
  9. datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
  10. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
  11. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
  12. datahub/ingestion/source/s3/source.py +14 -5
  13. datahub/ingestion/source/snowflake/constants.py +1 -0
  14. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  15. datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
  16. datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
  17. datahub/ingestion/source/snowflake/snowflake_report.py +6 -0
  18. datahub/ingestion/source/snowflake/snowflake_schema.py +108 -4
  19. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +298 -69
  20. datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
  21. datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
  22. datahub/sdk/__init__.py +33 -0
  23. datahub/sdk/_all_entities.py +15 -0
  24. datahub/sdk/_attribution.py +48 -0
  25. datahub/sdk/_entity.py +89 -0
  26. datahub/sdk/_shared.py +338 -0
  27. datahub/sdk/container.py +193 -0
  28. datahub/sdk/dataset.py +584 -0
  29. datahub/sdk/entity_client.py +115 -0
  30. datahub/sdk/main_client.py +56 -0
  31. datahub/sdk/resolver_client.py +101 -0
  32. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/LICENSE +0 -0
  33. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/WHEEL +0 -0
  34. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/entry_points.txt +0 -0
  35. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_report.py

@@ -104,6 +104,7 @@ class SnowflakeV2Report(
     schemas_scanned: int = 0
     databases_scanned: int = 0
     tags_scanned: int = 0
+    streams_scanned: int = 0

     include_usage_stats: bool = False
     include_operational_stats: bool = False
@@ -113,6 +114,7 @@ class SnowflakeV2Report(
     table_lineage_query_secs: float = -1
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
+    num_streams_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
     num_secure_views_missing_definition: int = 0
     num_structured_property_templates_created: int = 0
@@ -131,6 +133,8 @@ class SnowflakeV2Report(
     num_get_tags_for_object_queries: int = 0
     num_get_tags_on_columns_for_table_queries: int = 0

+    num_get_streams_for_schema_queries: int = 0
+
    rows_zero_objects_modified: int = 0

     _processed_tags: MutableSet[str] = field(default_factory=set)
@@ -157,6 +161,8 @@ class SnowflakeV2Report(
                 return
             self._scanned_tags.add(name)
             self.tags_scanned += 1
+        elif ent_type == "stream":
+            self.streams_scanned += 1
         else:
             raise KeyError(f"Unknown entity {ent_type}.")

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -6,6 +6,7 @@ from datetime import datetime
 from typing import Callable, Dict, Iterable, List, MutableMapping, Optional

 from datahub.ingestion.api.report import SupportsAsObj
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection
 from datahub.ingestion.source.snowflake.snowflake_query import (
@@ -14,7 +15,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import (
 )
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
 from datahub.utilities.file_backed_collections import FileBackedDict
-from datahub.utilities.prefix_batch_builder import build_prefix_batches
+from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
 from datahub.utilities.serialized_lru_cache import serialized_lru_cache

 logger: logging.Logger = logging.getLogger(__name__)
@@ -100,6 +101,9 @@ class SnowflakeTable(BaseTable):
     def is_hybrid(self) -> bool:
         return self.type is not None and self.type == "HYBRID TABLE"

+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.TABLE
+

 @dataclass
 class SnowflakeView(BaseView):
@@ -109,6 +113,9 @@ class SnowflakeView(BaseView):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_secure: bool = False

+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.VIEW
+

 @dataclass
 class SnowflakeSchema:
@@ -118,6 +125,7 @@ class SnowflakeSchema:
     comment: Optional[str]
     tables: List[str] = field(default_factory=list)
     views: List[str] = field(default_factory=list)
+    streams: List[str] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None


@@ -131,6 +139,32 @@ class SnowflakeDatabase:
     tags: Optional[List[SnowflakeTag]] = None


+@dataclass
+class SnowflakeStream:
+    name: str
+    created: datetime
+    owner: str
+    source_type: str
+    type: str
+    stale: str
+    mode: str
+    invalid_reason: str
+    owner_role_type: str
+    database_name: str
+    schema_name: str
+    table_name: str
+    comment: Optional[str]
+    columns: List[SnowflakeColumn] = field(default_factory=list)
+    stale_after: Optional[datetime] = None
+    base_tables: Optional[str] = None
+    tags: Optional[List[SnowflakeTag]] = None
+    column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    last_altered: Optional[datetime] = None
+
+    def get_subtype(self) -> DatasetSubTypes:
+        return DatasetSubTypes.SNOWFLAKE_STREAM
+
+
 class _SnowflakeTagCache:
     def __init__(self) -> None:
         # self._database_tags[<database_name>] = list of tags applied to database
@@ -208,6 +242,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
             self.get_tables_for_database,
             self.get_views_for_database,
             self.get_columns_for_schema,
+            self.get_streams_for_database,
             self.get_pk_constraints_for_schema,
             self.get_fk_constraints_for_schema,
         ]
@@ -431,9 +466,18 @@ class SnowflakeDataDictionary(SupportsAsObj):
         # For massive schemas, use a FileBackedDict to avoid memory issues.
         columns = FileBackedDict()

-        object_batches = build_prefix_batches(
-            all_objects, max_batch_size=10000, max_groups_in_batch=5
-        )
+        # Single prefix table case (for streams)
+        if len(all_objects) == 1:
+            object_batches = [
+                [PrefixGroup(prefix=all_objects[0], names=[], exact_match=True)]
+            ]
+        else:
+            # Build batches for full schema scan
+            object_batches = build_prefix_batches(
+                all_objects, max_batch_size=10000, max_groups_in_batch=5
+            )
+
+        # Process batches
         for batch_index, object_batch in enumerate(object_batches):
             if batch_index > 0:
                 logger.info(
@@ -611,3 +655,63 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 tags[column_name].append(snowflake_tag)

         return tags
+
+    @serialized_lru_cache(maxsize=1)
+    def get_streams_for_database(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeStream]]:
+        page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
+
+        streams: Dict[str, List[SnowflakeStream]] = {}
+
+        first_iteration = True
+        stream_pagination_marker: Optional[str] = None
+        while first_iteration or stream_pagination_marker is not None:
+            cur = self.connection.query(
+                SnowflakeQuery.streams_for_database(
+                    db_name,
+                    limit=page_limit,
+                    stream_pagination_marker=stream_pagination_marker,
+                )
+            )
+
+            first_iteration = False
+            stream_pagination_marker = None
+
+            result_set_size = 0
+            for stream in cur:
+                result_set_size += 1
+
+                stream_name = stream["name"]
+                schema_name = stream["schema_name"]
+                if schema_name not in streams:
+                    streams[schema_name] = []
+                streams[stream["schema_name"]].append(
+                    SnowflakeStream(
+                        name=stream["name"],
+                        created=stream["created_on"],
+                        owner=stream["owner"],
+                        comment=stream["comment"],
+                        source_type=stream["source_type"],
+                        type=stream["type"],
+                        stale=stream["stale"],
+                        mode=stream["mode"],
+                        database_name=stream["database_name"],
+                        schema_name=stream["schema_name"],
+                        invalid_reason=stream["invalid_reason"],
+                        owner_role_type=stream["owner_role_type"],
+                        stale_after=stream["stale_after"],
+                        table_name=stream["table_name"],
+                        base_tables=stream["base_tables"],
+                        last_altered=stream["created_on"],
+                    )
+                )
+
+            if result_set_size >= page_limit:
+                # If we hit the limit, we need to send another request to get the next page.
+                logger.info(
+                    f"Fetching next page of streams for {db_name} - after {stream_name}"
+                )
+                stream_pagination_marker = stream_name
+
+        return streams
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -21,7 +21,6 @@ from datahub.ingestion.glossary.classification_mixin import (
 from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
-    DatasetSubTypes,
 )
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
@@ -48,6 +47,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeFK,
     SnowflakePK,
     SnowflakeSchema,
+    SnowflakeStream,
     SnowflakeTable,
     SnowflakeTag,
     SnowflakeView,
@@ -58,6 +58,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
     SnowsightUrlBuilder,
+    split_qualified_name,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
@@ -70,6 +71,7 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source_report.ingestion_stage import (
     EXTERNAL_TABLE_DDL_LINEAGE,
+    LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -81,6 +83,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
     TimeStamp,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
+    DatasetLineageTypeClass,
     DatasetProperties,
     ViewProperties,
 )
@@ -420,73 +423,126 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         schema_name = snowflake_schema.name

         if self.config.extract_tags != TagOption.skip:
-            snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
-                schema_name=schema_name, db_name=db_name, domain="schema"
-            )
+            self._process_tags(snowflake_schema, schema_name, db_name, domain="schema")

         if self.config.include_technical_schema:
             yield from self.gen_schema_containers(snowflake_schema, db_name)

-        # We need to do this first so that we can use it when fetching columns.
+        tables, views, streams = [], [], []
+
         if self.config.include_tables:
             tables = self.fetch_tables_for_schema(
                 snowflake_schema, db_name, schema_name
             )
+            db_tables[schema_name] = tables
+            yield from self._process_tables(
+                tables, snowflake_schema, db_name, schema_name
+            )
+
         if self.config.include_views:
             views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
+            yield from self._process_views(
+                views, snowflake_schema, db_name, schema_name
+            )

-        if self.config.include_tables:
-            db_tables[schema_name] = tables
+        if self.config.include_streams:
+            self.report.num_get_streams_for_schema_queries += 1
+            streams = self.fetch_streams_for_schema(
+                snowflake_schema, db_name, schema_name
+            )
+            yield from self._process_streams(streams, snowflake_schema, db_name)

-        if self.config.include_technical_schema:
-            data_reader = self.make_data_reader()
-            for table in tables:
-                table_wu_generator = self._process_table(
-                    table, snowflake_schema, db_name
-                )
+        if self.config.include_technical_schema and snowflake_schema.tags:
+            yield from self._process_tags_in_schema(snowflake_schema)

-                yield from classification_workunit_processor(
-                    table_wu_generator,
-                    self.classification_handler,
-                    data_reader,
-                    [db_name, schema_name, table.name],
-                )
+        if (
+            not snowflake_schema.views
+            and not snowflake_schema.tables
+            and not snowflake_schema.streams
+        ):
+            self.structured_reporter.info(
+                title="No tables/views/streams found in schema",
+                message="If objects exist, please grant REFERENCES or SELECT permissions on them.",
+                context=f"{db_name}.{schema_name}",
+            )

-        if self.config.include_views:
-            if self.aggregator:
-                for view in views:
-                    view_identifier = self.identifiers.get_dataset_identifier(
+    def _process_tags(
+        self,
+        snowflake_schema: SnowflakeSchema,
+        schema_name: str,
+        db_name: str,
+        domain: str,
+    ) -> None:
+        snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
+            schema_name=schema_name, db_name=db_name, domain=domain
+        )
+
+    def _process_tables(
+        self,
+        tables: List[SnowflakeTable],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+        schema_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_technical_schema:
+            data_reader = self.make_data_reader()
+            for table in tables:
+                table_wu_generator = self._process_table(
+                    table, snowflake_schema, db_name
+                )
+                yield from classification_workunit_processor(
+                    table_wu_generator,
+                    self.classification_handler,
+                    data_reader,
+                    [db_name, schema_name, table.name],
+                )
+
+    def _process_views(
+        self,
+        views: List[SnowflakeView],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+        schema_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.aggregator:
+            for view in views:
+                view_identifier = self.identifiers.get_dataset_identifier(
+                    view.name, schema_name, db_name
+                )
+                if view.is_secure and not view.view_definition:
+                    view.view_definition = self.fetch_secure_view_definition(
                         view.name, schema_name, db_name
                     )
-                    if view.is_secure and not view.view_definition:
-                        view.view_definition = self.fetch_secure_view_definition(
-                            view.name, schema_name, db_name
-                        )
-                    if view.view_definition:
-                        self.aggregator.add_view_definition(
-                            view_urn=self.identifiers.gen_dataset_urn(view_identifier),
-                            view_definition=view.view_definition,
-                            default_db=db_name,
-                            default_schema=schema_name,
-                        )
-                    elif view.is_secure:
-                        self.report.num_secure_views_missing_definition += 1
+                if view.view_definition:
+                    self.aggregator.add_view_definition(
+                        view_urn=self.identifiers.gen_dataset_urn(view_identifier),
+                        view_definition=view.view_definition,
+                        default_db=db_name,
+                        default_schema=schema_name,
+                    )
+                elif view.is_secure:
+                    self.report.num_secure_views_missing_definition += 1

-            if self.config.include_technical_schema:
-                for view in views:
-                    yield from self._process_view(view, snowflake_schema, db_name)
+        if self.config.include_technical_schema:
+            for view in views:
+                yield from self._process_view(view, snowflake_schema, db_name)

-        if self.config.include_technical_schema and snowflake_schema.tags:
+    def _process_streams(
+        self,
+        streams: List[SnowflakeStream],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        for stream in streams:
+            yield from self._process_stream(stream, snowflake_schema, db_name)
+
+    def _process_tags_in_schema(
+        self, snowflake_schema: SnowflakeSchema
+    ) -> Iterable[MetadataWorkUnit]:
+        if snowflake_schema.tags:
             for tag in snowflake_schema.tags:
                 yield from self._process_tag(tag)

-        if not snowflake_schema.views and not snowflake_schema.tables:
-            self.structured_reporter.info(
-                title="No tables/views found in schema",
-                message="If tables exist, please grant REFERENCES or SELECT permissions on them.",
-                context=f"{db_name}.{schema_name}",
-            )
-
     def fetch_secure_view_definition(
         self, table_name: str, schema_name: str, db_name: str
     ) -> Optional[str]:
@@ -729,7 +785,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def gen_dataset_workunits(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> Iterable[MetadataWorkUnit]:
@@ -786,13 +842,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         if dpi_aspect:
             yield dpi_aspect

-        subTypes = SubTypes(
-            typeNames=(
-                [DatasetSubTypes.VIEW]
-                if isinstance(table, SnowflakeView)
-                else [DatasetSubTypes.TABLE]
-            )
-        )
+        subTypes = SubTypes(typeNames=[table.get_subtype()])

         yield MetadataChangeProposalWrapper(
             entityUrn=dataset_urn, aspect=subTypes
@@ -843,28 +893,50 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def get_dataset_properties(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> DatasetProperties:
         custom_properties = {}

         if isinstance(table, SnowflakeTable):
-            if table.clustering_key:
-                custom_properties["CLUSTERING_KEY"] = table.clustering_key
-
-            if table.is_hybrid:
-                custom_properties["IS_HYBRID"] = "true"
-
-            if table.is_dynamic:
-                custom_properties["IS_DYNAMIC"] = "true"
-
-            if table.is_iceberg:
-                custom_properties["IS_ICEBERG"] = "true"
+            custom_properties.update(
+                {
+                    k: v
+                    for k, v in {
+                        "CLUSTERING_KEY": table.clustering_key,
+                        "IS_HYBRID": "true" if table.is_hybrid else None,
+                        "IS_DYNAMIC": "true" if table.is_dynamic else None,
+                        "IS_ICEBERG": "true" if table.is_iceberg else None,
+                    }.items()
+                    if v
+                }
+            )

         if isinstance(table, SnowflakeView) and table.is_secure:
             custom_properties["IS_SECURE"] = "true"

+        elif isinstance(table, SnowflakeStream):
+            custom_properties.update(
+                {
+                    k: v
+                    for k, v in {
+                        "SOURCE_TYPE": table.source_type,
+                        "TYPE": table.type,
+                        "STALE": table.stale,
+                        "MODE": table.mode,
+                        "INVALID_REASON": table.invalid_reason,
+                        "OWNER_ROLE_TYPE": table.owner_role_type,
+                        "TABLE_NAME": table.table_name,
+                        "BASE_TABLES": table.base_tables,
+                        "STALE_AFTER": (
+                            table.stale_after.isoformat() if table.stale_after else None
+                        ),
+                    }.items()
+                    if v
+                }
+            )
+
         return DatasetProperties(
             name=table.name,
             created=(
@@ -909,7 +981,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         ).as_workunit()

     def gen_column_tags_as_structured_properties(
-        self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
+        self,
+        dataset_urn: str,
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
     ) -> Iterable[MetadataWorkUnit]:
         for column_name in table.column_tags:
             schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
@@ -922,7 +996,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def gen_schema_metadata(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> SchemaMetadata:
@@ -1214,3 +1288,158 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 "External table ddl lineage extraction failed",
                 exc=e,
             )
+
+    def fetch_streams_for_schema(
+        self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
+    ) -> List[SnowflakeStream]:
+        try:
+            streams: List[SnowflakeStream] = []
+            for stream in self.get_streams_for_schema(schema_name, db_name):
+                stream_identifier = self.identifiers.get_dataset_identifier(
+                    stream.name, schema_name, db_name
+                )
+
+                self.report.report_entity_scanned(stream_identifier, "stream")
+
+                if not self.filters.is_dataset_pattern_allowed(
+                    stream_identifier, SnowflakeObjectDomain.STREAM
+                ):
+                    self.report.report_dropped(stream_identifier)
+                else:
+                    streams.append(stream)
+            snowflake_schema.streams = [stream.name for stream in streams]
+            return streams
+        except Exception as e:
+            if isinstance(e, SnowflakePermissionError):
+                error_msg = f"Failed to get streams for schema {db_name}.{schema_name}. Please check permissions."
+                raise SnowflakePermissionError(error_msg) from e.__cause__
+            else:
+                self.structured_reporter.warning(
+                    "Failed to get streams for schema",
+                    f"{db_name}.{schema_name}",
+                    exc=e,
+                )
+                return []
+
+    def get_streams_for_schema(
+        self, schema_name: str, db_name: str
+    ) -> List[SnowflakeStream]:
+        streams = self.data_dictionary.get_streams_for_database(db_name)
+
+        return streams.get(schema_name, [])
+
+    def _process_stream(
+        self,
+        stream: SnowflakeStream,
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        schema_name = snowflake_schema.name
+
+        try:
+            # Retrieve and register the schema without metadata to prevent columns from mapping upstream
+            stream.columns = self.get_columns_for_stream(stream.table_name)
+            yield from self.gen_dataset_workunits(stream, schema_name, db_name)
+
+            if self.config.include_column_lineage:
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.populate_stream_upstreams(stream, db_name, schema_name)
+
+        except Exception as e:
+            self.structured_reporter.warning(
+                "Failed to get columns for stream:", stream.name, exc=e
+            )
+
+    def get_columns_for_stream(
+        self,
+        source_object: str,  # Qualified name of source table/view
+    ) -> List[SnowflakeColumn]:
+        """
+        Get column information for a stream by getting source object columns and adding metadata columns.
+        Stream includes all columns from source object plus metadata columns like:
+        - METADATA$ACTION
+        - METADATA$ISUPDATE
+        - METADATA$ROW_ID
+        """
+        columns: List[SnowflakeColumn] = []

+        source_parts = split_qualified_name(source_object)
+
+        source_db, source_schema, source_name = source_parts
+
+        # Get columns from source object
+        source_columns = self.data_dictionary.get_columns_for_schema(
+            source_schema, source_db, itertools.chain([source_name])
+        ).get(source_name, [])
+
+        # Add all source columns
+        columns.extend(source_columns)
+
+        # Add standard stream metadata columns
+        metadata_columns = [
+            SnowflakeColumn(
+                name="METADATA$ACTION",
+                ordinal_position=len(columns) + 1,
+                is_nullable=False,
+                data_type="VARCHAR",
+                comment="Type of DML operation (INSERT/DELETE)",
+                character_maximum_length=10,
+                numeric_precision=None,
+                numeric_scale=None,
+            ),
+            SnowflakeColumn(
+                name="METADATA$ISUPDATE",
+                ordinal_position=len(columns) + 2,
+                is_nullable=False,
+                data_type="BOOLEAN",
+                comment="Whether row is from UPDATE operation",
+                character_maximum_length=None,
+                numeric_precision=None,
+                numeric_scale=None,
+            ),
+            SnowflakeColumn(
+                name="METADATA$ROW_ID",
+                ordinal_position=len(columns) + 3,
+                is_nullable=False,
+                data_type="NUMBER",
+                comment="Unique row identifier",
+                character_maximum_length=None,
+                numeric_precision=38,
+                numeric_scale=0,
+            ),
+        ]
+
+        columns.extend(metadata_columns)
+
+        return columns
+
+    def populate_stream_upstreams(
+        self, stream: SnowflakeStream, db_name: str, schema_name: str
+    ) -> None:
+        """
+        Populate Streams upstream tables
+        """
+        self.report.num_streams_with_known_upstreams += 1
+        if self.aggregator:
+            source_parts = split_qualified_name(stream.table_name)
+            source_db, source_schema, source_name = source_parts
+
+            dataset_identifier = self.identifiers.get_dataset_identifier(
+                stream.name, schema_name, db_name
+            )
+            dataset_urn = self.identifiers.gen_dataset_urn(dataset_identifier)
+
+            upstream_identifier = self.identifiers.get_dataset_identifier(
+                source_name, source_schema, source_db
+            )
+            upstream_urn = self.identifiers.gen_dataset_urn(upstream_identifier)
+
+            logger.debug(
+                f"""upstream_urn: {upstream_urn}, downstream_urn: {dataset_urn}"""
+            )
+
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=upstream_urn,
+                downstream_urn=dataset_urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )
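
Note on the pagination pattern: the new SnowflakeDataDictionary.get_streams_for_database method above pages through SHOW STREAMS results, reusing the name of the last stream returned as the continuation marker for the next request. Below is a minimal standalone sketch of that loop under stated assumptions: the run_show_streams helper and the page size of 1000 are hypothetical stand-ins for the real connection.query(SnowflakeQuery.streams_for_database(...)) call and SHOW_VIEWS_MAX_PAGE_SIZE, and rows are treated as plain dicts rather than SnowflakeStream objects.

    from typing import Dict, Iterator, List, Optional

    PAGE_LIMIT = 1000  # assumption; the real code reuses SHOW_VIEWS_MAX_PAGE_SIZE


    def run_show_streams(db_name: str, limit: int, after: Optional[str]) -> Iterator[dict]:
        """Hypothetical stand-in for connection.query(SnowflakeQuery.streams_for_database(...))."""
        raise NotImplementedError


    def fetch_all_streams(db_name: str) -> Dict[str, List[dict]]:
        # Group raw SHOW STREAMS rows by schema, paging until a short page is returned.
        streams: Dict[str, List[dict]] = {}
        marker: Optional[str] = None
        first = True
        while first or marker is not None:
            rows = run_show_streams(db_name, limit=PAGE_LIMIT, after=marker)
            first, marker = False, None
            count, last_name = 0, None
            for row in rows:
                count += 1
                last_name = row["name"]
                streams.setdefault(row["schema_name"], []).append(row)
            if count >= PAGE_LIMIT and last_name is not None:
                # A full page means more streams may remain; resume after the last name seen.
                marker = last_name
        return streams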