acryl-datahub 0.15.0.5rc10__py3-none-any.whl → 0.15.0.6rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (22)
  1. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/METADATA +2394 -2394
  2. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/RECORD +22 -22
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/source/common/subtypes.py +1 -0
  5. datahub/ingestion/source/powerbi/config.py +1 -0
  6. datahub/ingestion/source/powerbi/powerbi.py +28 -3
  7. datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
  8. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
  9. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
  10. datahub/ingestion/source/snowflake/constants.py +1 -0
  11. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  12. datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
  13. datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
  14. datahub/ingestion/source/snowflake/snowflake_report.py +6 -0
  15. datahub/ingestion/source/snowflake/snowflake_schema.py +98 -4
  16. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +294 -62
  17. datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
  18. datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
  19. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/LICENSE +0 -0
  20. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/WHEEL +0 -0
  21. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/entry_points.txt +0 -0
  22. {acryl_datahub-0.15.0.5rc10.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/top_level.txt +0 -0
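
Most of the substance in this release is new Snowflake stream support, visible in the snowflake_* hunks below: a stream subtype and object domain, a SnowflakeStream model, paginated SHOW STREAMS metadata extraction, stream columns, and stream-to-source lineage. As a rough orientation, a minimal, hedged sketch of a recipe that opts into the new behavior follows. Only include_streams and stream_pattern are taken from this diff; the connection fields are placeholders and Pipeline.create() is ordinary DataHub usage, not something introduced here.

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe sketch: only the stream options are referenced by this diff.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "account_id": "my_account",  # placeholder
                "username": "datahub_reader",  # placeholder
                "password": "${SNOWFLAKE_PASSWORD}",  # placeholder
                "include_streams": True,  # stream extraction toggle seen in this diff
                "stream_pattern": {"allow": ["ANALYTICS\\..*"]},  # stream filter seen in this diff
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()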
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -14,7 +14,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import (
 )
 from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
 from datahub.utilities.file_backed_collections import FileBackedDict
-from datahub.utilities.prefix_batch_builder import build_prefix_batches
+from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
 from datahub.utilities.serialized_lru_cache import serialized_lru_cache

 logger: logging.Logger = logging.getLogger(__name__)
@@ -118,6 +118,7 @@ class SnowflakeSchema:
     comment: Optional[str]
     tables: List[str] = field(default_factory=list)
     views: List[str] = field(default_factory=list)
+    streams: List[str] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None


@@ -131,6 +132,29 @@ class SnowflakeDatabase:
     tags: Optional[List[SnowflakeTag]] = None


+@dataclass
+class SnowflakeStream:
+    name: str
+    created: datetime
+    owner: str
+    source_type: str
+    type: str
+    stale: str
+    mode: str
+    invalid_reason: str
+    owner_role_type: str
+    database_name: str
+    schema_name: str
+    table_name: str
+    comment: Optional[str]
+    columns: List[SnowflakeColumn] = field(default_factory=list)
+    stale_after: Optional[datetime] = None
+    base_tables: Optional[str] = None
+    tags: Optional[List[SnowflakeTag]] = None
+    column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    last_altered: Optional[datetime] = None
+
+
 class _SnowflakeTagCache:
     def __init__(self) -> None:
         # self._database_tags[<database_name>] = list of tags applied to database
@@ -208,6 +232,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
            self.get_tables_for_database,
            self.get_views_for_database,
            self.get_columns_for_schema,
+           self.get_streams_for_database,
            self.get_pk_constraints_for_schema,
            self.get_fk_constraints_for_schema,
        ]
@@ -431,9 +456,18 @@ class SnowflakeDataDictionary(SupportsAsObj):
         # For massive schemas, use a FileBackedDict to avoid memory issues.
         columns = FileBackedDict()

-        object_batches = build_prefix_batches(
-            all_objects, max_batch_size=10000, max_groups_in_batch=5
-        )
+        # Single prefix table case (for streams)
+        if len(all_objects) == 1:
+            object_batches = [
+                [PrefixGroup(prefix=all_objects[0], names=[], exact_match=True)]
+            ]
+        else:
+            # Build batches for full schema scan
+            object_batches = build_prefix_batches(
+                all_objects, max_batch_size=10000, max_groups_in_batch=5
+            )
+
+        # Process batches
         for batch_index, object_batch in enumerate(object_batches):
             if batch_index > 0:
                 logger.info(
@@ -611,3 +645,63 @@ class SnowflakeDataDictionary(SupportsAsObj):
            tags[column_name].append(snowflake_tag)

        return tags
+
+    @serialized_lru_cache(maxsize=1)
+    def get_streams_for_database(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeStream]]:
+        page_limit = SHOW_VIEWS_MAX_PAGE_SIZE
+
+        streams: Dict[str, List[SnowflakeStream]] = {}
+
+        first_iteration = True
+        stream_pagination_marker: Optional[str] = None
+        while first_iteration or stream_pagination_marker is not None:
+            cur = self.connection.query(
+                SnowflakeQuery.streams_for_database(
+                    db_name,
+                    limit=page_limit,
+                    stream_pagination_marker=stream_pagination_marker,
+                )
+            )
+
+            first_iteration = False
+            stream_pagination_marker = None
+
+            result_set_size = 0
+            for stream in cur:
+                result_set_size += 1
+
+                stream_name = stream["name"]
+                schema_name = stream["schema_name"]
+                if schema_name not in streams:
+                    streams[schema_name] = []
+                streams[stream["schema_name"]].append(
+                    SnowflakeStream(
+                        name=stream["name"],
+                        created=stream["created_on"],
+                        owner=stream["owner"],
+                        comment=stream["comment"],
+                        source_type=stream["source_type"],
+                        type=stream["type"],
+                        stale=stream["stale"],
+                        mode=stream["mode"],
+                        database_name=stream["database_name"],
+                        schema_name=stream["schema_name"],
+                        invalid_reason=stream["invalid_reason"],
+                        owner_role_type=stream["owner_role_type"],
+                        stale_after=stream["stale_after"],
+                        table_name=stream["table_name"],
+                        base_tables=stream["base_tables"],
+                        last_altered=stream["created_on"],
+                    )
+                )
+
+            if result_set_size >= page_limit:
+                # If we hit the limit, we need to send another request to get the next page.
+                logger.info(
+                    f"Fetching next page of streams for {db_name} - after {stream_name}"
+                )
+                stream_pagination_marker = stream_name
+
+        return streams
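
get_streams_for_database() above pages through streams with a name-based marker (stream_pagination_marker). The query text itself comes from SnowflakeQuery.streams_for_database in snowflake_query.py (+20 -1 in this release) and is not shown in this diff; below is a hedged sketch of the kind of SHOW STREAMS statement such a helper would emit, assuming Snowflake's SHOW ... LIMIT <n> FROM '<name>' keyset pagination.

from typing import Optional

def streams_for_database_sketch(
    db_name: str, limit: int, stream_pagination_marker: Optional[str] = None
) -> str:
    # Hypothetical stand-in for SnowflakeQuery.streams_for_database: SHOW ... LIMIT n
    # FROM 'name' resumes lexically after the given object name, which is why the
    # caller above feeds the last stream name of a full page back in as the marker.
    from_clause = (
        f" FROM '{stream_pagination_marker}'" if stream_pagination_marker else ""
    )
    return f'SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit}{from_clause}'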
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -48,6 +48,7 @@ from datahub.ingestion.source.snowflake.snowflake_schema import (
     SnowflakeFK,
     SnowflakePK,
     SnowflakeSchema,
+    SnowflakeStream,
     SnowflakeTable,
     SnowflakeTag,
     SnowflakeView,
@@ -58,6 +59,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
     SnowsightUrlBuilder,
+    split_qualified_name,
 )
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
@@ -70,6 +72,7 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source_report.ingestion_stage import (
     EXTERNAL_TABLE_DDL_LINEAGE,
+    LINEAGE_EXTRACTION,
     METADATA_EXTRACTION,
     PROFILING,
 )
@@ -81,6 +84,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
     TimeStamp,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
+    DatasetLineageTypeClass,
     DatasetProperties,
     ViewProperties,
 )
@@ -420,73 +424,120 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         schema_name = snowflake_schema.name

         if self.config.extract_tags != TagOption.skip:
-            snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
-                schema_name=schema_name, db_name=db_name, domain="schema"
-            )
+            self._process_tags(snowflake_schema, schema_name, db_name, domain="schema")

         if self.config.include_technical_schema:
             yield from self.gen_schema_containers(snowflake_schema, db_name)

-        # We need to do this first so that we can use it when fetching columns.
+        tables, views, streams = [], [], []
+
         if self.config.include_tables:
             tables = self.fetch_tables_for_schema(
                 snowflake_schema, db_name, schema_name
             )
+            db_tables[schema_name] = tables
+            yield from self._process_tables(
+                tables, snowflake_schema, db_name, schema_name
+            )
+
         if self.config.include_views:
             views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
+            yield from self._process_views(
+                views, snowflake_schema, db_name, schema_name
+            )

-        if self.config.include_tables:
-            db_tables[schema_name] = tables
+        if self.config.include_streams:
+            self.report.num_get_streams_for_schema_queries += 1
+            streams = self.fetch_streams_for_schema(
+                snowflake_schema, db_name, schema_name
+            )
+            yield from self._process_streams(streams, snowflake_schema, db_name)

-            if self.config.include_technical_schema:
-                data_reader = self.make_data_reader()
-                for table in tables:
-                    table_wu_generator = self._process_table(
-                        table, snowflake_schema, db_name
-                    )
+        if self.config.include_technical_schema and snowflake_schema.tags:
+            yield from self._process_tags_in_schema(snowflake_schema)

-                    yield from classification_workunit_processor(
-                        table_wu_generator,
-                        self.classification_handler,
-                        data_reader,
-                        [db_name, schema_name, table.name],
-                    )
+        if (
+            not snowflake_schema.views
+            and not snowflake_schema.tables
+            and not snowflake_schema.streams
+        ):
+            self.structured_reporter.info(
+                title="No tables/views/streams found in schema",
+                message="If objects exist, please grant REFERENCES or SELECT permissions on them.",
+                context=f"{db_name}.{schema_name}",
+            )

-        if self.config.include_views:
-            if self.aggregator:
-                for view in views:
-                    view_identifier = self.identifiers.get_dataset_identifier(
+    def _process_tags(self, snowflake_schema, schema_name, db_name, domain):
+        snowflake_schema.tags = self.tag_extractor.get_tags_on_object(
+            schema_name=schema_name, db_name=db_name, domain=domain
+        )
+
+    def _process_tables(
+        self,
+        tables: List[SnowflakeTable],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+        schema_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_technical_schema:
+            data_reader = self.make_data_reader()
+            for table in tables:
+                table_wu_generator = self._process_table(
+                    table, snowflake_schema, db_name
+                )
+                yield from classification_workunit_processor(
+                    table_wu_generator,
+                    self.classification_handler,
+                    data_reader,
+                    [db_name, schema_name, table.name],
+                )
+
+    def _process_views(
+        self,
+        views: List[SnowflakeView],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+        schema_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.aggregator:
+            for view in views:
+                view_identifier = self.identifiers.get_dataset_identifier(
+                    view.name, schema_name, db_name
+                )
+                if view.is_secure and not view.view_definition:
+                    view.view_definition = self.fetch_secure_view_definition(
                         view.name, schema_name, db_name
                     )
-                    if view.is_secure and not view.view_definition:
-                        view.view_definition = self.fetch_secure_view_definition(
-                            view.name, schema_name, db_name
-                        )
-                    if view.view_definition:
-                        self.aggregator.add_view_definition(
-                            view_urn=self.identifiers.gen_dataset_urn(view_identifier),
-                            view_definition=view.view_definition,
-                            default_db=db_name,
-                            default_schema=schema_name,
-                        )
-                    elif view.is_secure:
-                        self.report.num_secure_views_missing_definition += 1
+                if view.view_definition:
+                    self.aggregator.add_view_definition(
+                        view_urn=self.identifiers.gen_dataset_urn(view_identifier),
+                        view_definition=view.view_definition,
+                        default_db=db_name,
+                        default_schema=schema_name,
+                    )
+                elif view.is_secure:
+                    self.report.num_secure_views_missing_definition += 1

-            if self.config.include_technical_schema:
-                for view in views:
-                    yield from self._process_view(view, snowflake_schema, db_name)
+        if self.config.include_technical_schema:
+            for view in views:
+                yield from self._process_view(view, snowflake_schema, db_name)

-        if self.config.include_technical_schema and snowflake_schema.tags:
+    def _process_streams(
+        self,
+        streams: List[SnowflakeStream],
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        for stream in streams:
+            yield from self._process_stream(stream, snowflake_schema, db_name)
+
+    def _process_tags_in_schema(
+        self, snowflake_schema: SnowflakeSchema
+    ) -> Iterable[MetadataWorkUnit]:
+        if snowflake_schema.tags:
             for tag in snowflake_schema.tags:
                 yield from self._process_tag(tag)

-        if not snowflake_schema.views and not snowflake_schema.tables:
-            self.structured_reporter.info(
-                title="No tables/views found in schema",
-                message="If tables exist, please grant REFERENCES or SELECT permissions on them.",
-                context=f"{db_name}.{schema_name}",
-            )
-
     def fetch_secure_view_definition(
         self, table_name: str, schema_name: str, db_name: str
     ) -> Optional[str]:
@@ -729,7 +780,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def gen_dataset_workunits(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> Iterable[MetadataWorkUnit]:
@@ -788,7 +839,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

         subTypes = SubTypes(
             typeNames=(
-                [DatasetSubTypes.VIEW]
+                [DatasetSubTypes.SNOWFLAKE_STREAM]
+                if isinstance(table, SnowflakeStream)
+                else [DatasetSubTypes.VIEW]
                 if isinstance(table, SnowflakeView)
                 else [DatasetSubTypes.TABLE]
             )
@@ -843,28 +896,50 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def get_dataset_properties(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> DatasetProperties:
         custom_properties = {}

         if isinstance(table, SnowflakeTable):
-            if table.clustering_key:
-                custom_properties["CLUSTERING_KEY"] = table.clustering_key
-
-            if table.is_hybrid:
-                custom_properties["IS_HYBRID"] = "true"
-
-            if table.is_dynamic:
-                custom_properties["IS_DYNAMIC"] = "true"
-
-            if table.is_iceberg:
-                custom_properties["IS_ICEBERG"] = "true"
+            custom_properties.update(
+                {
+                    k: v
+                    for k, v in {
+                        "CLUSTERING_KEY": table.clustering_key,
+                        "IS_HYBRID": "true" if table.is_hybrid else None,
+                        "IS_DYNAMIC": "true" if table.is_dynamic else None,
+                        "IS_ICEBERG": "true" if table.is_iceberg else None,
+                    }.items()
+                    if v
+                }
+            )

         if isinstance(table, SnowflakeView) and table.is_secure:
             custom_properties["IS_SECURE"] = "true"

+        elif isinstance(table, SnowflakeStream):
+            custom_properties.update(
+                {
+                    k: v
+                    for k, v in {
+                        "SOURCE_TYPE": table.source_type,
+                        "TYPE": table.type,
+                        "STALE": table.stale,
+                        "MODE": table.mode,
+                        "INVALID_REASON": table.invalid_reason,
+                        "OWNER_ROLE_TYPE": table.owner_role_type,
+                        "TABLE_NAME": table.table_name,
+                        "BASE_TABLES": table.base_tables,
+                        "STALE_AFTER": table.stale_after.isoformat()
+                        if table.stale_after
+                        else None,
+                    }.items()
+                    if v
+                }
+            )
+
         return DatasetProperties(
             name=table.name,
             created=(
@@ -909,7 +984,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         ).as_workunit()

     def gen_column_tags_as_structured_properties(
-        self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
+        self,
+        dataset_urn: str,
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
     ) -> Iterable[MetadataWorkUnit]:
         for column_name in table.column_tags:
             schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
@@ -922,7 +999,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def gen_schema_metadata(
         self,
-        table: Union[SnowflakeTable, SnowflakeView],
+        table: Union[SnowflakeTable, SnowflakeView, SnowflakeStream],
         schema_name: str,
         db_name: str,
     ) -> SchemaMetadata:
@@ -1214,3 +1291,158 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                "External table ddl lineage extraction failed",
                exc=e,
            )
+
+    def fetch_streams_for_schema(
+        self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
+    ) -> List[SnowflakeStream]:
+        try:
+            streams: List[SnowflakeStream] = []
+            for stream in self.get_streams_for_schema(schema_name, db_name):
+                stream_identifier = self.identifiers.get_dataset_identifier(
+                    stream.name, schema_name, db_name
+                )
+
+                self.report.report_entity_scanned(stream_identifier, "stream")
+
+                if not self.filters.is_dataset_pattern_allowed(
+                    stream_identifier, SnowflakeObjectDomain.STREAM
+                ):
+                    self.report.report_dropped(stream_identifier)
+                else:
+                    streams.append(stream)
+            snowflake_schema.streams = [stream.name for stream in streams]
+            return streams
+        except Exception as e:
+            if isinstance(e, SnowflakePermissionError):
+                error_msg = f"Failed to get streams for schema {db_name}.{schema_name}. Please check permissions."
+                raise SnowflakePermissionError(error_msg) from e.__cause__
+            else:
+                self.structured_reporter.warning(
+                    "Failed to get streams for schema",
+                    f"{db_name}.{schema_name}",
+                    exc=e,
+                )
+                return []
+
+    def get_streams_for_schema(
+        self, schema_name: str, db_name: str
+    ) -> List[SnowflakeStream]:
+        streams = self.data_dictionary.get_streams_for_database(db_name)
+
+        return streams.get(schema_name, [])
+
+    def _process_stream(
+        self,
+        stream: SnowflakeStream,
+        snowflake_schema: SnowflakeSchema,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        schema_name = snowflake_schema.name
+
+        try:
+            # Retrieve and register the schema without metadata to prevent columns from mapping upstream
+            stream.columns = self.get_columns_for_stream(stream.table_name)
+            yield from self.gen_dataset_workunits(stream, schema_name, db_name)
+
+            if self.config.include_column_lineage:
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.populate_stream_upstreams(stream, db_name, schema_name)
+
+        except Exception as e:
+            self.structured_reporter.warning(
+                "Failed to get columns for stream:", stream.name, exc=e
+            )
+
+    def get_columns_for_stream(
+        self,
+        source_object: str,  # Qualified name of source table/view
+    ) -> List[SnowflakeColumn]:
+        """
+        Get column information for a stream by getting source object columns and adding metadata columns.
+        Stream includes all columns from source object plus metadata columns like:
+        - METADATA$ACTION
+        - METADATA$ISUPDATE
+        - METADATA$ROW_ID
+        """
+        columns: List[SnowflakeColumn] = []
+
+        source_parts = split_qualified_name(source_object)
+
+        source_db, source_schema, source_name = source_parts
+
+        # Get columns from source object
+        source_columns = self.data_dictionary.get_columns_for_schema(
+            source_schema, source_db, itertools.chain([source_name])
+        ).get(source_name, [])
+
+        # Add all source columns
+        columns.extend(source_columns)
+
+        # Add standard stream metadata columns
+        metadata_columns = [
+            SnowflakeColumn(
+                name="METADATA$ACTION",
+                ordinal_position=len(columns) + 1,
+                is_nullable=False,
+                data_type="VARCHAR",
+                comment="Type of DML operation (INSERT/DELETE)",
+                character_maximum_length=10,
+                numeric_precision=None,
+                numeric_scale=None,
+            ),
+            SnowflakeColumn(
+                name="METADATA$ISUPDATE",
+                ordinal_position=len(columns) + 2,
+                is_nullable=False,
+                data_type="BOOLEAN",
+                comment="Whether row is from UPDATE operation",
+                character_maximum_length=None,
+                numeric_precision=None,
+                numeric_scale=None,
+            ),
+            SnowflakeColumn(
+                name="METADATA$ROW_ID",
+                ordinal_position=len(columns) + 3,
+                is_nullable=False,
+                data_type="NUMBER",
+                comment="Unique row identifier",
+                character_maximum_length=None,
+                numeric_precision=38,
+                numeric_scale=0,
+            ),
+        ]
+
+        columns.extend(metadata_columns)
+
+        return columns
+
+    def populate_stream_upstreams(
+        self, stream: SnowflakeStream, db_name: str, schema_name: str
+    ) -> None:
+        """
+        Populate Streams upstream tables
+        """
+        self.report.num_streams_with_known_upstreams += 1
+        if self.aggregator:
+            source_parts = split_qualified_name(stream.table_name)
+            source_db, source_schema, source_name = source_parts
+
+            dataset_identifier = self.identifiers.get_dataset_identifier(
+                stream.name, schema_name, db_name
+            )
+            dataset_urn = self.identifiers.gen_dataset_urn(dataset_identifier)
+
+            upstream_identifier = self.identifiers.get_dataset_identifier(
+                source_name, source_schema, source_db
+            )
+            upstream_urn = self.identifiers.gen_dataset_urn(upstream_identifier)
+
+            logger.debug(
+                f"""upstream_urn: {upstream_urn}, downstream_urn: {dataset_urn}"""
+            )
+
+            self.aggregator.add_known_lineage_mapping(
+                upstream_urn=upstream_urn,
+                downstream_urn=dataset_urn,
+                lineage_type=DatasetLineageTypeClass.COPY,
+            )
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -124,19 +124,20 @@ class SnowflakeFilter:
            SnowflakeObjectDomain.VIEW,
            SnowflakeObjectDomain.MATERIALIZED_VIEW,
            SnowflakeObjectDomain.ICEBERG_TABLE,
+           SnowflakeObjectDomain.STREAM,
        ):
            return False
        if _is_sys_table(dataset_name):
            return False

-       dataset_params = _split_qualified_name(dataset_name)
+       dataset_params = split_qualified_name(dataset_name)
        if len(dataset_params) != 3:
            self.structured_reporter.info(
                title="Unexpected dataset pattern",
                message=f"Found a {dataset_type} with an unexpected number of parts. Database and schema filtering will not work as expected, but table filtering will still work.",
                context=dataset_name,
            )
-           # We fall-through here so table/view filtering still works.
+           # We fall-through here so table/view/stream filtering still works.

        if (
            len(dataset_params) >= 1
@@ -169,6 +170,14 @@ class SnowflakeFilter:
        ):
            return False

+       if (
+           dataset_type.lower() == SnowflakeObjectDomain.STREAM
+           and not self.filter_config.stream_pattern.allowed(
+               _cleanup_qualified_name(dataset_name, self.structured_reporter)
+           )
+       ):
+           return False
+
        return True


@@ -183,17 +192,17 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")


-def _split_qualified_name(qualified_name: str) -> List[str]:
+def split_qualified_name(qualified_name: str) -> List[str]:
     """
     Split a qualified name into its constituent parts.

-    >>> _split_qualified_name("db.my_schema.my_table")
+    >>> split_qualified_name("db.my_schema.my_table")
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    >>> split_qualified_name('"db"."my_schema"."my_table"')
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    >>> split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
     ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
-    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    >>> split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
     ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
     """

@@ -231,7 +240,7 @@ def _split_qualified_name(qualified_name: str) -> List[str]:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = _split_qualified_name(qualified_name)
+    name_parts = split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(