acryl-datahub 0.15.0.5rc9__py3-none-any.whl → 0.15.0.6rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (46)
  1. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/METADATA +2431 -2431
  2. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/RECORD +46 -45
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/graph/client.py +2 -1
  5. datahub/ingestion/graph/entity_versioning.py +201 -0
  6. datahub/ingestion/source/abs/report.py +2 -2
  7. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  8. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/delta_lake/report.py +2 -2
  11. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  12. datahub/ingestion/source/elastic_search.py +2 -1
  13. datahub/ingestion/source/ge_profiling_config.py +11 -7
  14. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  15. datahub/ingestion/source/identity/okta.py +2 -1
  16. datahub/ingestion/source/kafka/kafka.py +2 -1
  17. datahub/ingestion/source/kafka_connect/common.py +2 -1
  18. datahub/ingestion/source/ldap.py +2 -1
  19. datahub/ingestion/source/looker/lookml_config.py +9 -5
  20. datahub/ingestion/source/mongodb.py +2 -1
  21. datahub/ingestion/source/nifi.py +2 -1
  22. datahub/ingestion/source/powerbi/config.py +3 -2
  23. datahub/ingestion/source/powerbi/powerbi.py +28 -3
  24. datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
  25. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
  26. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
  27. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  28. datahub/ingestion/source/redash.py +5 -5
  29. datahub/ingestion/source/salesforce.py +4 -1
  30. datahub/ingestion/source/snowflake/constants.py +1 -0
  31. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
  33. datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
  34. datahub/ingestion/source/snowflake/snowflake_report.py +8 -1
  35. datahub/ingestion/source/snowflake/snowflake_schema.py +98 -4
  36. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +294 -62
  37. datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
  38. datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
  39. datahub/ingestion/source/tableau/tableau.py +2 -1
  40. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  41. datahub/ingestion/source/unity/report.py +1 -0
  42. datahub/ingestion/source_report/pulsar.py +5 -4
  43. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/LICENSE +0 -0
  44. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/ge_profiling_config.py
@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.",
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )
 
     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days. If set to `null`, no constraint of last modified time for tables to profile. Supported only in `snowflake` and `BigQuery`.",
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )
 
     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on calculated size from gathered stats.",
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )
 
     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count. If set to `null`, "
-        "no limit on the row count of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on gathered stats.",
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )
 
     profile_table_row_count_estimate_only: bool = Field(
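For reference, the four limits above act as independent gates on whether a table gets profiled. A minimal sketch of combining them, assuming the remaining GEProfilingConfig fields keep their defaults (the values below are illustrative, not taken from this release):

    from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

    # Illustrative values only; each limit is checked independently.
    profiling = GEProfilingConfig(
        max_number_of_fields_to_profile=50,   # at most 50 columns per table
        profile_if_updated_since_days=7,      # skip tables untouched for a week (Snowflake/BigQuery)
        profile_table_size_limit=5,           # GB (Snowflake/BigQuery/Databricks; Oracle via stats)
        profile_table_row_limit=5_000_000,    # rows (Snowflake/BigQuery; Oracle via stats)
    )
    print(profiling.max_number_of_fields_to_profile)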
datahub/ingestion/source/iceberg/iceberg_common.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 from humanfriendly import format_timespan
 from pydantic import Field, validator
@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
 
 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
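Most of the report changes in this release follow a single pattern: plain List[str]/Set[str] report fields are replaced with LossyList/LossySet from datahub.utilities.lossy_collections, which retain only a bounded sample of entries so that ingestion reports for very large sources stay small. A minimal sketch of the pattern as it is used in these reports (the exact sampling/truncation behavior is an assumption about the utility):

    from dataclasses import dataclass, field

    from datahub.utilities.lossy_collections import LossyList


    @dataclass
    class ExampleSourceReport:
        # Callers use LossyList[str] like a regular list, but only a sample of
        # entries is retained once it grows large (assumed utility behavior).
        filtered: LossyList[str] = field(default_factory=LossyList)

        def report_dropped(self, name: str) -> None:
            self.filtered.append(name)


    report = ExampleSourceReport()
    for i in range(100_000):
        report.report_dropped(f"db.schema.table_{i}")
    # Serializing the report now emits a truncated sample instead of 100k names.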
datahub/ingestion/source/identity/okta.py
@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 nest_asyncio.apply()
@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
 
 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/kafka/kafka.py
@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum
@@ -190,7 +191,7 @@ def get_kafka_admin_client(
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1
datahub/ingestion/source/kafka_connect/common.py
@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1
datahub/ingestion/source/ldap.py
@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns: List[str] = dataclasses.field(default_factory=list)
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)
 
     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)
datahub/ingestion/source/looker/lookml_config.py
@@ -1,7 +1,7 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
 from datetime import timedelta
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, Literal, Optional, Union
 
 import pydantic
 from pydantic import root_validator, validator
@@ -48,13 +48,17 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
     models_discovered: int = 0
-    models_dropped: List[str] = dataclass_field(default_factory=LossyList)
+    models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
-    views_dropped: List[str] = dataclass_field(default_factory=LossyList)
-    views_dropped_unreachable: List[str] = dataclass_field(default_factory=LossyList)
+    views_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
+    views_dropped_unreachable: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     query_parse_attempts: int = 0
     query_parse_failures: int = 0
-    query_parse_failure_views: List[str] = dataclass_field(default_factory=LossyList)
+    query_parse_failure_views: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     _looker_api: Optional[LookerAPI] = None
 
     def report_models_scanned(self) -> None:
datahub/ingestion/source/mongodb.py
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     UnionTypeClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -143,7 +144,7 @@ class MongoDBConfig(
 
 @dataclass
 class MongoDBSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/nifi.py
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
 )
 from datahub.specific.datajob import DataJobPatchBuilder
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 NIFI = "nifi"
@@ -452,7 +453,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 @dataclass
 class NifiSourceReport(SourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/powerbi/config.py
@@ -132,6 +132,7 @@ class Constant:
     ACTIVE = "Active"
     SQL_PARSING_FAILURE = "SQL Parsing Failure"
     M_QUERY_NULL = '"null"'
+    REPORT_WEB_URL = "reportWebUrl"
 
 
 @dataclass
@@ -195,8 +196,8 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 
     dashboards_scanned: int = 0
     charts_scanned: int = 0
-    filtered_dashboards: List[str] = dataclass_field(default_factory=list)
-    filtered_charts: List[str] = dataclass_field(default_factory=list)
+    filtered_dashboards: LossyList[str] = dataclass_field(default_factory=LossyList)
+    filtered_charts: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
     m_query_parse_attempts: int = 0
datahub/ingestion/source/powerbi/powerbi.py
@@ -582,8 +582,11 @@ class Mapper:
         if tile.dataset is not None and tile.dataset.webUrl is not None:
             custom_properties[Constant.DATASET_WEB_URL] = tile.dataset.webUrl
 
-        if tile.report is not None and tile.report.id is not None:
-            custom_properties[Constant.REPORT_ID] = tile.report.id
+        if tile.report_id is not None:
+            custom_properties[Constant.REPORT_ID] = tile.report_id
+
+        if tile.report is not None and tile.report.webUrl is not None:
+            custom_properties[Constant.REPORT_WEB_URL] = tile.report.webUrl
 
         return custom_properties
 
@@ -1053,6 +1056,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1074,6 +1078,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            dashboards=dashboard_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -1167,8 +1172,28 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
+        # find all dashboards with a Tile referencing this report
+        downstream_dashboards_edges = []
+        for d in workspace.dashboards.values():
+            if any(t.report_id == report.id for t in d.tiles):
+                dashboard_urn = builder.make_dashboard_urn(
+                    platform=self.__config.platform_name,
+                    platform_instance=self.__config.platform_instance,
+                    name=d.get_urn_part(),
+                )
+                edge = EdgeClass(
+                    destinationUrn=dashboard_urn,
+                    sourceUrn=None,
+                    created=None,
+                    lastModified=None,
+                    properties=None,
+                )
+                downstream_dashboards_edges.append(edge)
+
         # Let's convert report to datahub dashboard
-        report_mcps = self.report_to_dashboard(workspace, report, chart_mcps, user_mcps)
+        report_mcps = self.report_to_dashboard(
+            workspace, report, chart_mcps, user_mcps, downstream_dashboards_edges
+        )
 
         # Now add MCPs in sequence
         mcps.extend(ds_mcps)
datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py
@@ -286,11 +286,15 @@ class Tile:
     id: str
     title: str
     embedUrl: str
-    dataset: Optional["PowerBIDataset"]
     dataset_id: Optional[str]
-    report: Optional[Report]
+    report_id: Optional[str]
     createdFrom: CreatedFrom
 
+    # In a first pass, `dataset_id` and/or `report_id` are filled in.
+    # In a subsequent pass, the objects are populated.
+    dataset: Optional["PowerBIDataset"]
+    report: Optional[Report]
+
     def get_urn_part(self):
         return f"charts.{self.id}"
 
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
@@ -337,41 +337,6 @@ class DataResolverBase(ABC):
         -tiles), there is no information available on pagination
 
         """
-
-        def new_dataset_or_report(tile_instance: Any) -> dict:
-            """
-            Find out which is the data source for tile. It is either REPORT or DATASET
-            """
-            report_fields = {
-                Constant.REPORT: (
-                    self.get_report(
-                        workspace=workspace,
-                        report_id=tile_instance.get(Constant.REPORT_ID),
-                    )
-                    if tile_instance.get(Constant.REPORT_ID) is not None
-                    else None
-                ),
-                Constant.CREATED_FROM: Tile.CreatedFrom.UNKNOWN,
-            }
-
-            # reportId and datasetId are exclusive in tile_instance
-            # if datasetId is present that means tile is created from dataset
-            # if reportId is present that means tile is created from report
-            # if both i.e. reportId and datasetId are not present then tile is created from some visualization
-            if tile_instance.get(Constant.REPORT_ID) is not None:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.REPORT
-            elif tile_instance.get(Constant.DATASET_ID) is not None:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.DATASET
-            else:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.VISUALIZATION
-
-            title: Optional[str] = tile_instance.get(Constant.TITLE)
-            _id: Optional[str] = tile_instance.get(Constant.ID)
-            created_from: Any = report_fields[Constant.CREATED_FROM]
-            logger.info(f"Tile {title}({_id}) is created from {created_from}")
-
-            return report_fields
-
         tile_list_endpoint: str = self.get_tiles_endpoint(
             workspace, dashboard_id=dashboard.id
         )
@@ -393,8 +358,18 @@ class DataResolverBase(ABC):
                 title=instance.get(Constant.TITLE),
                 embedUrl=instance.get(Constant.EMBED_URL),
                 dataset_id=instance.get(Constant.DATASET_ID),
+                report_id=instance.get(Constant.REPORT_ID),
                 dataset=None,
-                **new_dataset_or_report(instance),
+                report=None,
+                createdFrom=(
+                    # In the past we considered that only one of the two report_id or dataset_id would be present
+                    # but we have seen cases where both are present. If both are present, we prioritize the report.
+                    Tile.CreatedFrom.REPORT
+                    if instance.get(Constant.REPORT_ID)
+                    else Tile.CreatedFrom.DATASET
+                    if instance.get(Constant.DATASET_ID)
+                    else Tile.CreatedFrom.VISUALIZATION
+                ),
             )
             for instance in tile_dict
             if instance is not None
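The removed new_dataset_or_report helper is collapsed into the inline conditional above, with an explicit priority when both ids are present. A standalone sketch of that precedence (the local CreatedFrom enum below is defined purely for illustration; it is not the real Tile.CreatedFrom):

    from enum import Enum
    from typing import Optional


    class CreatedFrom(Enum):  # illustration only
        REPORT = "Report"
        DATASET = "Dataset"
        VISUALIZATION = "Visualization"


    def created_from(report_id: Optional[str], dataset_id: Optional[str]) -> CreatedFrom:
        # Same precedence as the new resolver code: a report id wins when both
        # ids are present; neither id means the tile is a bare visualization.
        if report_id:
            return CreatedFrom.REPORT
        if dataset_id:
            return CreatedFrom.DATASET
        return CreatedFrom.VISUALIZATION


    assert created_from("report-1", "dataset-1") is CreatedFrom.REPORT
    assert created_from(None, "dataset-1") is CreatedFrom.DATASET
    assert created_from(None, None) is CreatedFrom.VISUALIZATION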
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py
@@ -625,13 +625,26 @@ class PowerBiAPI:
             dashboard.tiles = self._get_resolver().get_tiles(
                 workspace, dashboard=dashboard
             )
-            # set the dataset for tiles
+            # set the dataset and the report for tiles
             for tile in dashboard.tiles:
+                # In Power BI, dashboards, reports, and datasets are tightly scoped to the workspace they belong to.
+                # https://learn.microsoft.com/en-us/power-bi/collaborate-share/service-new-workspaces
+                if tile.report_id:
+                    tile.report = workspace.reports.get(tile.report_id)
+                    if tile.report is None:
+                        self.reporter.info(
+                            title="Missing Report Lineage For Tile",
+                            message="A Report reference that failed to be resolved. Please ensure that 'extract_reports' is set to True in the configuration.",
+                            context=f"workspace-name: {workspace.name}, tile-name: {tile.title}, report-id: {tile.report_id}",
+                        )
+                # However, semantic models (aka datasets) can be shared accross workspaces
+                # https://learn.microsoft.com/en-us/fabric/admin/portal-workspace#use-semantic-models-across-workspaces
+                # That's why the global 'dataset_registry' is required
                 if tile.dataset_id:
                     tile.dataset = self.dataset_registry.get(tile.dataset_id)
                     if tile.dataset is None:
                         self.reporter.info(
-                            title="Missing Lineage For Tile",
+                            title="Missing Dataset Lineage For Tile",
                             message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
                             context=f"workspace-name: {workspace.name}, tile-name: {tile.title}, dataset-id: {tile.dataset_id}",
                         )
@@ -653,10 +666,10 @@ class PowerBiAPI:
         for dashboard in workspace.dashboards.values():
             dashboard.tags = workspace.dashboard_endorsements.get(dashboard.id, [])
 
+        # fill reports first since some dashboard may reference a report
+        fill_reports()
         if self.__config.extract_dashboards:
             fill_dashboards()
-
-        fill_reports()
         fill_dashboard_tags()
         self._fill_independent_datasets(workspace=workspace)
 
datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     StatusClass,
 )
 from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.lossy_collections import LossyList
 
 LOGGER = logging.getLogger(__name__)
 
@@ -476,7 +477,7 @@ class Mapper:
 @dataclass
 class PowerBiReportServerDashboardSourceReport(SourceReport):
     scanned_report: int = 0
-    filtered_reports: List[str] = dataclass_field(default_factory=list)
+    filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_scanned(self, count: int = 1) -> None:
         self.scanned_report += count
datahub/ingestion/source/redash.py
@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 from packaging import version
@@ -39,7 +39,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -280,9 +280,9 @@ class RedashConfig(ConfigModel):
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
-    queries_problem_parsing: Set[str] = field(default_factory=set)
-    queries_no_dataset: Set[str] = field(default_factory=set)
-    charts_no_input: Set[str] = field(default_factory=set)
+    queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
+    queries_no_dataset: LossySet[str] = field(default_factory=LossySet)
+    charts_no_input: LossySet[str] = field(default_factory=LossySet)
     total_queries: Optional[int] = field(
         default=None,
     )
datahub/ingestion/source/salesforce.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional
@@ -60,6 +61,7 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -146,8 +148,9 @@ class SalesforceConfig(DatasetSourceConfigMixin):
         return config_clean.remove_trailing_slashes(v)
 
 
+@dataclass
 class SalesforceSourceReport(SourceReport):
-    filtered: List[str] = []
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/snowflake/constants.py
@@ -53,6 +53,7 @@ class SnowflakeObjectDomain(StrEnum):
     SCHEMA = "schema"
     COLUMN = "column"
     ICEBERG_TABLE = "iceberg table"
+    STREAM = "stream"
 
 
 GENERIC_PERMISSION_ERROR_KEY = "permission-error"
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -98,6 +98,11 @@ class SnowflakeFilterConfig(SQLFilterConfig):
     )
     # table_pattern and view_pattern are inherited from SQLFilterConfig
 
+    stream_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for streams to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
     match_fully_qualified_names: bool = Field(
         default=False,
         description="Whether `schema_pattern` is matched against fully qualified schema name `<catalog>.<schema>`.",
@@ -274,6 +279,11 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )
 
+    include_streams: bool = Field(
+        default=True,
+        description="If enabled, streams will be ingested as separate entities from tables/views.",
+    )
+
     structured_property_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description=(
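The new stream_pattern follows the same AllowDenyPattern semantics as the existing table/view patterns, matched against the fully qualified database.schema.stream name. A small illustration (the datahub.configuration.common import path and the allowed() helper are the usual DataHub API and are assumed here, since they are not shown in this diff):

    from datahub.configuration.common import AllowDenyPattern

    # Mirrors the example regex given in the field description above.
    stream_pattern = AllowDenyPattern(allow=[r"Customer\.public\.customer.*"])

    print(stream_pattern.allowed("Customer.public.customer_changes"))  # True
    print(stream_pattern.allowed("Sales.public.orders_stream"))        # False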
datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -49,6 +49,7 @@ from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownLineageMapping,
+    ObservedQuery,
     PreparsedQuery,
     SqlAggregatorReport,
     SqlParsingAggregator,
@@ -241,7 +242,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         use_cached_audit_log = audit_log_file.exists()
 
         queries: FileBackedList[
-            Union[KnownLineageMapping, PreparsedQuery, TableRename, TableSwap]
+            Union[
+                KnownLineageMapping,
+                PreparsedQuery,
+                TableRename,
+                TableSwap,
+                ObservedQuery,
+            ]
         ]
         if use_cached_audit_log:
             logger.info("Using cached audit log")
@@ -252,7 +259,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
             shared_connection = ConnectionWrapper(audit_log_file)
             queries = FileBackedList(shared_connection)
-            entry: Union[KnownLineageMapping, PreparsedQuery, TableRename, TableSwap]
+            entry: Union[
+                KnownLineageMapping,
+                PreparsedQuery,
+                TableRename,
+                TableSwap,
+                ObservedQuery,
+            ]
 
             with self.report.copy_history_fetch_timer:
                 for entry in self.fetch_copy_history():
@@ -329,7 +342,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
+    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
         query_log_query = _build_enriched_query_log_query(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
@@ -362,7 +375,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
+    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -398,6 +411,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 pass
             else:
                 return None
+
+        user = CorpUserUrn(
+            self.identifiers.get_user_identifier(
+                res["user_name"], users.get(res["user_name"])
+            )
+        )
+
+        # Use direct_objects_accessed instead objects_modified
+        # objects_modified returns $SYS_VIEW_X with no mapping
+        has_stream_objects = any(
+            obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
+        )
+
+        # If a stream is used, default to query parsing.
+        if has_stream_objects:
+            logger.debug("Found matching stream object")
+            return ObservedQuery(
+                query=res["query_text"],
+                session_id=res["session_id"],
+                timestamp=res["query_start_time"].astimezone(timezone.utc),
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
+                query_hash=get_query_fingerprint(
+                    res["query_text"], self.identifiers.platform, fast=True
+                ),
+            )
+
         upstreams = []
         column_usage = {}
 
@@ -460,12 +501,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 )
             )
 
-        user = CorpUserUrn(
-            self.identifiers.get_user_identifier(
-                res["user_name"], users.get(res["user_name"])
-            )
-        )
-
         timestamp: datetime = res["query_start_time"]
         timestamp = timestamp.astimezone(timezone.utc)
 
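In short, audit-log rows whose DIRECT_OBJECTS_ACCESSED mention a Stream object are no longer turned into a PreparsedQuery from the access-history columns; they are returned as an ObservedQuery so the SQL parser recovers lineage from the query text, since OBJECTS_MODIFIED only reports an internal $SYS_VIEW_X name for streams. A minimal illustration of the routing predicate (the sample row below is fabricated for illustration):

    # direct_objects_accessed stands in for the parsed DIRECT_OBJECTS_ACCESSED
    # JSON column; this sample row is fabricated for illustration only.
    direct_objects_accessed = [
        {"objectName": "MY_DB.PUBLIC.ORDERS_STREAM", "objectDomain": "Stream"},
        {"objectName": "MY_DB.PUBLIC.ORDERS", "objectDomain": "Table"},
    ]

    has_stream_objects = any(
        obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
    )

    # True -> the extractor returns an ObservedQuery and defers to query parsing.
    print(has_stream_objects)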
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -9,6 +9,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
 from datahub.utilities.prefix_batch_builder import PrefixGroup
 
 SHOW_VIEWS_MAX_PAGE_SIZE = 10000
+SHOW_STREAM_MAX_PAGE_SIZE = 10000
 
 
 def create_deny_regex_sql_filter(
@@ -36,6 +37,7 @@ class SnowflakeQuery:
         SnowflakeObjectDomain.VIEW.capitalize(),
         SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(),
         SnowflakeObjectDomain.ICEBERG_TABLE.capitalize(),
+        SnowflakeObjectDomain.STREAM.capitalize(),
     }
 
     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
@@ -44,7 +46,8 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
         "("
         f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}'"
+        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
+        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
         ")"
     )
 
@@ -963,3 +966,19 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
     @staticmethod
     def get_all_users() -> str:
         return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
+
+    @staticmethod
+    def streams_for_database(
+        db_name: str,
+        limit: int = SHOW_STREAM_MAX_PAGE_SIZE,
+        stream_pagination_marker: Optional[str] = None,
+    ) -> str:
+        # SHOW STREAMS can return a maximum of 10000 rows.
+        # https://docs.snowflake.com/en/sql-reference/sql/show-streams#usage-notes
+        assert limit <= SHOW_STREAM_MAX_PAGE_SIZE
+
+        # To work around this, we paginate through the results using the FROM clause.
+        from_clause = (
+            f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
+        )
+        return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
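streams_for_database only builds the SQL, so the pagination loop lives in its caller. A hedged sketch of how a caller might page through SHOW STREAMS using the FROM marker (the connection/cursor API and the "name" column access are assumptions; the actual consumer of this query is not part of this excerpt):

    from typing import Any, Dict, List, Optional

    from datahub.ingestion.source.snowflake.snowflake_query import (
        SHOW_STREAM_MAX_PAGE_SIZE,
        SnowflakeQuery,
    )


    def fetch_all_streams(connection: Any, db_name: str) -> List[Dict[str, Any]]:
        # Assumes `connection.query()` returns an iterable of dict rows that
        # include a "name" key; that helper is not part of this diff.
        streams: List[Dict[str, Any]] = []
        marker: Optional[str] = None
        while True:
            page = list(
                connection.query(
                    SnowflakeQuery.streams_for_database(
                        db_name, stream_pagination_marker=marker
                    )
                )
            )
            streams.extend(page)
            if len(page) < SHOW_STREAM_MAX_PAGE_SIZE:
                return streams  # last (possibly partial) page
            # SHOW STREAMS ... FROM '<name>' resumes after this stream name.
            marker = page[-1]["name"]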