acryl-datahub 0.15.0.5rc8__py3-none-any.whl → 0.15.0.5rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (46)
  1. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/METADATA +2525 -2523
  2. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/RECORD +46 -45
  3. datahub/_version.py +1 -1
  4. datahub/entrypoints.py +9 -0
  5. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  6. datahub/ingestion/glossary/classification_mixin.py +6 -0
  7. datahub/ingestion/glossary/classifier.py +3 -2
  8. datahub/ingestion/graph/client.py +2 -1
  9. datahub/ingestion/graph/entity_versioning.py +201 -0
  10. datahub/ingestion/source/abs/report.py +2 -2
  11. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  12. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  13. datahub/ingestion/source/delta_lake/report.py +2 -2
  14. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  15. datahub/ingestion/source/elastic_search.py +2 -1
  16. datahub/ingestion/source/ge_profiling_config.py +11 -7
  17. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  18. datahub/ingestion/source/identity/azure_ad.py +6 -14
  19. datahub/ingestion/source/identity/okta.py +2 -1
  20. datahub/ingestion/source/kafka/kafka.py +2 -1
  21. datahub/ingestion/source/kafka_connect/common.py +2 -1
  22. datahub/ingestion/source/ldap.py +2 -1
  23. datahub/ingestion/source/looker/lookml_config.py +9 -5
  24. datahub/ingestion/source/mode.py +2 -4
  25. datahub/ingestion/source/mongodb.py +2 -1
  26. datahub/ingestion/source/nifi.py +2 -1
  27. datahub/ingestion/source/powerbi/config.py +2 -2
  28. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  29. datahub/ingestion/source/redash.py +5 -5
  30. datahub/ingestion/source/salesforce.py +4 -1
  31. datahub/ingestion/source/snowflake/snowflake_config.py +7 -0
  32. datahub/ingestion/source/snowflake/snowflake_report.py +2 -1
  33. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  34. datahub/ingestion/source/tableau/tableau.py +2 -1
  35. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  36. datahub/ingestion/source/unity/report.py +1 -0
  37. datahub/ingestion/source_report/pulsar.py +5 -4
  38. datahub/metadata/schema.avsc +5 -5
  39. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  40. datahub/metadata/schemas/MetadataChangeEvent.avsc +5 -5
  41. datahub/specific/dashboard.py +43 -1
  42. datahub/upgrade/upgrade.py +13 -5
  43. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/LICENSE +0 -0
  44. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -141,7 +141,7 @@ class BigQueryV2Report(
     profiling_skipped_invalid_partition_type: Dict[str, str] = field(
         default_factory=TopKDict
     )
-    profiling_skipped_partition_profiling_disabled: List[str] = field(
+    profiling_skipped_partition_profiling_disabled: LossyList[str] = field(
         default_factory=LossyList
     )
     allow_pattern: Optional[str] = None
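
Note: this is the first instance of a pattern repeated across most source reports in this release: plain List/Set/Dict report fields are retyped to the LossyList/LossySet/LossyDict collections from datahub.utilities.lossy_collections so that reports stay bounded in size. A minimal sketch of the pattern, assuming the Lossy* collections behave like their builtin counterparts but retain only a bounded sample of elements (ExampleSourceReport is hypothetical, for illustration only):

    from dataclasses import dataclass, field

    from datahub.ingestion.api.source import SourceReport
    from datahub.utilities.lossy_collections import LossyList


    @dataclass
    class ExampleSourceReport(SourceReport):
        # Hypothetical report; real reports are defined per source, as in the hunks below.
        filtered: LossyList[str] = field(default_factory=LossyList)

        def report_dropped(self, name: str) -> None:
            # append() works as on a normal list; the collection is assumed to keep
            # only a capped sample so a report with millions of drops stays small.
            self.filtered.append(name)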

datahub/ingestion/source/delta_lake/report.py
@@ -1,14 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field
-from typing import List
 
 from datahub.ingestion.api.source import SourceReport
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclasses.dataclass
 class DeltaLakeSourceReport(SourceReport):
     files_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_file_scanned(self) -> None:
         self.files_scanned += 1

datahub/ingestion/source/dynamodb/dynamodb.py
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
 
 MAX_ITEMS_TO_RETRIEVE = 100
@@ -120,7 +121,7 @@ class DynamoDBConfig(
 
 @dataclass
 class DynamoDBSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/elastic_search.py
@@ -62,6 +62,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.config_clean import remove_protocol
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
 logger = logging.getLogger(__name__)
@@ -189,7 +190,7 @@ class ElasticToSchemaFieldConverter:
 @dataclass
 class ElasticsearchSourceReport(SourceReport):
     index_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_index_scanned(self, index: str) -> None:
         self.index_scanned += 1

datahub/ingestion/source/ge_profiling_config.py
@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.",
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )
 
     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days. If set to `null`, no constraint of last modified time for tables to profile. Supported only in `snowflake` and `BigQuery`.",
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )
 
     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on calculated size from gathered stats.",
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )
 
     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count. If set to `null`, "
-        "no limit on the row count of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on gathered stats.",
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )
 
     profile_table_row_count_estimate_only: bool = Field(
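
The reworded descriptions above cover the profiling size and row limits; for orientation, a hedged sketch of how these knobs are typically set (values are placeholders, and in a recipe they sit under the source's `profiling` section):

    # Placeholder values for illustration; field names come from GEProfilingConfig above.
    profiling_section = {
        "enabled": True,
        "max_number_of_fields_to_profile": 50,
        "profile_if_updated_since_days": 7,
        "profile_table_size_limit": 5,  # GB; None/null disables the size cap
        "profile_table_row_limit": 5_000_000,  # rows; None/null disables the row cap
    }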

datahub/ingestion/source/iceberg/iceberg_common.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 from humanfriendly import format_timespan
 from pydantic import Field, validator
@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
 
 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)

datahub/ingestion/source/identity/azure_ad.py
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter, Retry
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mce_builder import make_group_urn, make_user_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -51,6 +52,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         description="regex patterns for groups to include in ingestion.",
     )
 
-    # If enabled, report will contain names of filtered users and groups.
-    filtered_tracking: bool = Field(
-        default=True,
-        description="If enabled, report will contain names of filtered users and groups.",
-    )
+    _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")
 
     # Optional: Whether to mask sensitive information from workunit ID's. On by default.
     mask_group_id: bool = Field(
@@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclass
 class AzureADSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
-    filtered_tracking: bool = field(default=True, repr=False)
-    filtered_count: int = field(default=0)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_filtered(self, name: str) -> None:
-        self.filtered_count += 1
-        if self.filtered_tracking:
-            self.filtered.append(name)
+        self.filtered.append(name)
 
 
 # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API
@@ -266,9 +260,7 @@ class AzureADSource(StatefulIngestionSourceBase):
     def __init__(self, config: AzureADConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
        self.config = config
-        self.report = AzureADSourceReport(
-            filtered_tracking=self.config.filtered_tracking
-        )
+        self.report = AzureADSourceReport()
         session = requests.Session()
         retries = Retry(
            total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
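
The retired filtered_tracking option is handled with pydantic_removed_field, the same helper mode.py uses below for default_schema. A minimal sketch of the declaration pattern, assuming the helper simply drops (and warns about) the old key so existing recipes keep loading; ExampleConfig is illustrative only:

    from pydantic import Field

    from datahub.configuration.common import ConfigModel
    from datahub.configuration.validate_field_removal import pydantic_removed_field


    class ExampleConfig(ConfigModel):
        # Hypothetical config for illustration. The removed-field helper is assumed to
        # register a validator that discards `filtered_tracking` if a recipe still sets it.
        mask_group_id: bool = Field(default=True, description="Example surviving option.")

        _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")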

datahub/ingestion/source/identity/okta.py
@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 nest_asyncio.apply()
@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
 
 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/kafka/kafka.py
@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum
@@ -190,7 +191,7 @@ def get_kafka_admin_client(
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1

datahub/ingestion/source/kafka_connect/common.py
@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1

datahub/ingestion/source/ldap.py
@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns: List[str] = dataclasses.field(default_factory=list)
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)
 
     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)

datahub/ingestion/source/looker/lookml_config.py
@@ -1,7 +1,7 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
 from datetime import timedelta
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, Literal, Optional, Union
 
 import pydantic
 from pydantic import root_validator, validator
@@ -48,13 +48,17 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
     models_discovered: int = 0
-    models_dropped: List[str] = dataclass_field(default_factory=LossyList)
+    models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
-    views_dropped: List[str] = dataclass_field(default_factory=LossyList)
-    views_dropped_unreachable: List[str] = dataclass_field(default_factory=LossyList)
+    views_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
+    views_dropped_unreachable: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     query_parse_attempts: int = 0
     query_parse_failures: int = 0
-    query_parse_failure_views: List[str] = dataclass_field(default_factory=LossyList)
+    query_parse_failure_views: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     _looker_api: Optional[LookerAPI] = None
 
     def report_models_scanned(self) -> None:

datahub/ingestion/source/mode.py
@@ -24,6 +24,7 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponenti
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
@@ -155,10 +156,7 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
     workspace: str = Field(
         description="The Mode workspace name. Find it in Settings > Workspace > Details."
     )
-    default_schema: str = Field(
-        default="public",
-        description="Default schema to use when schema is not provided in an SQL query",
-    )
+    _default_schema = pydantic_removed_field("default_schema")
 
     space_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(

datahub/ingestion/source/mongodb.py
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     UnionTypeClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -143,7 +144,7 @@ class MongoDBConfig(
 
 @dataclass
 class MongoDBSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/nifi.py
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
 )
 from datahub.specific.datajob import DataJobPatchBuilder
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 NIFI = "nifi"
@@ -452,7 +453,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 @dataclass
 class NifiSourceReport(SourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)

datahub/ingestion/source/powerbi/config.py
@@ -195,8 +195,8 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 
     dashboards_scanned: int = 0
     charts_scanned: int = 0
-    filtered_dashboards: List[str] = dataclass_field(default_factory=list)
-    filtered_charts: List[str] = dataclass_field(default_factory=list)
+    filtered_dashboards: LossyList[str] = dataclass_field(default_factory=LossyList)
+    filtered_charts: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
     m_query_parse_attempts: int = 0

datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     StatusClass,
 )
 from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.lossy_collections import LossyList
 
 LOGGER = logging.getLogger(__name__)
 
@@ -476,7 +477,7 @@ class Mapper:
 @dataclass
 class PowerBiReportServerDashboardSourceReport(SourceReport):
     scanned_report: int = 0
-    filtered_reports: List[str] = dataclass_field(default_factory=list)
+    filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_scanned(self, count: int = 1) -> None:
         self.scanned_report += count

datahub/ingestion/source/redash.py
@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 from packaging import version
@@ -39,7 +39,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -280,9 +280,9 @@ class RedashConfig(ConfigModel):
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
-    queries_problem_parsing: Set[str] = field(default_factory=set)
-    queries_no_dataset: Set[str] = field(default_factory=set)
-    charts_no_input: Set[str] = field(default_factory=set)
+    queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
+    queries_no_dataset: LossySet[str] = field(default_factory=LossySet)
+    charts_no_input: LossySet[str] = field(default_factory=LossySet)
     total_queries: Optional[int] = field(
         default=None,
     )

datahub/ingestion/source/salesforce.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional
@@ -60,6 +61,7 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -146,8 +148,9 @@ class SalesforceConfig(DatasetSourceConfigMixin):
         return config_clean.remove_trailing_slashes(v)
 
 
+@dataclass
 class SalesforceSourceReport(SourceReport):
-    filtered: List[str] = []
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
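
Beyond the LossyList switch, this hunk fixes a latent bug: SalesforceSourceReport previously declared filtered: List[str] = [] as a class-level default, so every report instance shared one mutable list. Making the report a @dataclass with a default_factory gives each instance its own collection. A small self-contained illustration of the difference (the Report classes here are hypothetical):

    from dataclasses import dataclass, field
    from typing import List


    class SharedReport:
        # Anti-pattern: the list is created once at class definition time,
        # so all instances append into the same object.
        filtered: List[str] = []


    @dataclass
    class PerInstanceReport:
        # Fixed pattern: default_factory builds a fresh list per instance.
        filtered: List[str] = field(default_factory=list)


    a, b = SharedReport(), SharedReport()
    a.filtered.append("account")
    assert b.filtered == ["account"]  # state leaks across instances

    c, d = PerInstanceReport(), PerInstanceReport()
    c.filtered.append("account")
    assert d.filtered == []  # each report tracks its own drops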

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -308,6 +308,13 @@ class SnowflakeV2Config(
         " assertions CLI in snowflake",
     )
 
+    pushdown_deny_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
+        "Only applicable if `use_queries_v2` is enabled.",
+    )
+
     @validator("convert_urns_to_lowercase")
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
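
For context, a sketch of how the new pushdown_deny_usernames option might be used in a programmatic Snowflake ingestion pipeline; account details and service-account names are placeholders, and per the description above the option only takes effect when use_queries_v2 is enabled:

    from datahub.ingestion.run.pipeline import Pipeline

    # Placeholder recipe for illustration; credentials and usernames are not real.
    pipeline = Pipeline.create(
        {
            "source": {
                "type": "snowflake",
                "config": {
                    "account_id": "my_account",
                    "username": "ingestion_user",
                    "password": "...",
                    "use_queries_v2": True,
                    # Skip lineage/usage/query extraction for high-volume service accounts.
                    "pushdown_deny_usernames": ["LOOKER_SVC", "DBT_CLOUD_SVC"],
                },
            },
            "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
        }
    )
    pipeline.run()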

datahub/ingestion/source/snowflake/snowflake_report.py
@@ -12,6 +12,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.lossy_collections import LossyDict
 from datahub.utilities.perf_timer import PerfTimer
 
 if TYPE_CHECKING:
@@ -66,7 +67,7 @@ class SnowflakeReport(SQLSourceReport, BaseTimeWindowReport):
     num_external_table_edges_scanned: int = 0
     ignore_start_time_lineage: Optional[bool] = None
     upstream_lineage_in_report: Optional[bool] = None
-    upstream_lineage: Dict[str, List[str]] = field(default_factory=dict)
+    upstream_lineage: LossyDict[str, List[str]] = field(default_factory=LossyDict)
 
     lineage_start_time: Optional[datetime] = None
     lineage_end_time: Optional[datetime] = None

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -567,6 +567,7 @@ class SnowflakeV2Source(
                 include_queries=self.config.include_queries,
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
+                pushdown_deny_usernames=self.config.pushdown_deny_usernames,
             ),
             structured_report=self.report,
             filters=self.filters,

datahub/ingestion/source/tableau/tableau.py
@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)
 
     last_authenticated_at: Optional[datetime] = None
 

datahub/ingestion/source/unity/ge_profiler.py
@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional
 
+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection
 
@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA
 
 
 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only
 
         dataset_name = table.ref.qualified_table_name
-        try:
-            table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
-        except Exception as e:
-            logger.warning(f"Failed to get table size for {dataset_name}: {e}")
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )
 
         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None
 
+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
    row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
    if row is None:
        return None
@@ -168,3 +201,21 @@ def _get_dataset_size_in_bytes(
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
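
The profiler now gates the DESCRIBE DETAIL and count(*) queries on table.is_delta_table, since DESCRIBE DETAIL is only defined for Delta tables. A hedged sketch of checking the underlying format directly with the Databricks SDK (assumes databricks-sdk is installed and authenticated; the table name is a placeholder):

    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.catalog import DataSourceFormat

    w = WorkspaceClient()  # relies on standard Databricks auth (env vars or config profile)
    tbl = w.tables.get("main.default.my_table")  # placeholder three-part table name
    if tbl.data_source_format == DataSourceFormat.DELTA:
        print("Delta table: profiler will collect size and row count")
    else:
        print("Non-Delta table: profiler skips the size/row-count queries")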

datahub/ingestion/source/unity/report.py
@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0
 

datahub/ingestion/source_report/pulsar.py
@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional
 
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered: List[str] = field(default_factory=list)
-    namespaces_filtered: List[str] = field(default_factory=list)
-    topics_filtered: List[str] = field(default_factory=list)
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version

@@ -4730,16 +4730,16 @@
 {
   "Relationship": {
     "/*/destinationUrn": {
-      "createdActor": "datasetEdges/*/created/actor",
-      "createdOn": "datasetEdges/*/created/time",
+      "createdActor": "dashboards/*/created/actor",
+      "createdOn": "dashboards/*/created/time",
       "entityTypes": [
         "dashboard"
      ],
      "isLineage": true,
      "name": "DashboardContainsDashboard",
-      "properties": "datasetEdges/*/properties",
-      "updatedActor": "datasetEdges/*/lastModified/actor",
-      "updatedOn": "datasetEdges/*/lastModified/time"
+      "properties": "dashboards/*/properties",
+      "updatedActor": "dashboards/*/lastModified/actor",
+      "updatedOn": "dashboards/*/lastModified/time"
     }
   },
   "type": {