acryl-datahub 0.15.0.5rc8__py3-none-any.whl → 0.15.0.5rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/METADATA +2525 -2523
- {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/RECORD +46 -45
- datahub/_version.py +1 -1
- datahub/entrypoints.py +9 -0
- datahub/ingestion/api/incremental_lineage_helper.py +4 -0
- datahub/ingestion/glossary/classification_mixin.py +6 -0
- datahub/ingestion/glossary/classifier.py +3 -2
- datahub/ingestion/graph/client.py +2 -1
- datahub/ingestion/graph/entity_versioning.py +201 -0
- datahub/ingestion/source/abs/report.py +2 -2
- datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
- datahub/ingestion/source/delta_lake/report.py +2 -2
- datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
- datahub/ingestion/source/elastic_search.py +2 -1
- datahub/ingestion/source/ge_profiling_config.py +11 -7
- datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
- datahub/ingestion/source/identity/azure_ad.py +6 -14
- datahub/ingestion/source/identity/okta.py +2 -1
- datahub/ingestion/source/kafka/kafka.py +2 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -1
- datahub/ingestion/source/ldap.py +2 -1
- datahub/ingestion/source/looker/lookml_config.py +9 -5
- datahub/ingestion/source/mode.py +2 -4
- datahub/ingestion/source/mongodb.py +2 -1
- datahub/ingestion/source/nifi.py +2 -1
- datahub/ingestion/source/powerbi/config.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
- datahub/ingestion/source/redash.py +5 -5
- datahub/ingestion/source/salesforce.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +7 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
- datahub/ingestion/source/tableau/tableau.py +2 -1
- datahub/ingestion/source/unity/ge_profiler.py +55 -4
- datahub/ingestion/source/unity/report.py +1 -0
- datahub/ingestion/source_report/pulsar.py +5 -4
- datahub/metadata/schema.avsc +5 -5
- datahub/metadata/schemas/DashboardInfo.avsc +5 -5
- datahub/metadata/schemas/MetadataChangeEvent.avsc +5 -5
- datahub/specific/dashboard.py +43 -1
- datahub/upgrade/upgrade.py +13 -5
- {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.5rc8.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/bigquery_report.py
CHANGED
@@ -141,7 +141,7 @@ class BigQueryV2Report(
     profiling_skipped_invalid_partition_type: Dict[str, str] = field(
         default_factory=TopKDict
     )
-    profiling_skipped_partition_profiling_disabled:
+    profiling_skipped_partition_profiling_disabled: LossyList[str] = field(
         default_factory=LossyList
     )
     allow_pattern: Optional[str] = None
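Most of the report changes in this release follow the pattern in the hunk above: unbounded report fields are re-declared as LossyList from datahub.utilities.lossy_collections, presumably so that long lists of filtered entities do not bloat the ingestion report. A minimal sketch of that pattern with a hypothetical report class; the exact truncation behaviour of LossyList is an assumption, not something this diff states:

from dataclasses import dataclass, field

from datahub.utilities.lossy_collections import LossyList


@dataclass
class ExampleSourceReport:  # hypothetical report, not a class from this diff
    filtered: LossyList[str] = field(default_factory=LossyList)

    def report_dropped(self, name: str) -> None:
        # Append freely; LossyList is assumed to keep only a bounded sample
        # of the names rather than every filtered entity.
        self.filtered.append(name)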
datahub/ingestion/source/delta_lake/report.py
CHANGED
@@ -1,14 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field
-from typing import List
 
 from datahub.ingestion.api.source import SourceReport
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclasses.dataclass
 class DeltaLakeSourceReport(SourceReport):
     files_scanned = 0
-    filtered:
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_file_scanned(self) -> None:
         self.files_scanned += 1
datahub/ingestion/source/dynamodb/dynamodb.py
CHANGED
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
 
 MAX_ITEMS_TO_RETRIEVE = 100
@@ -120,7 +121,7 @@ class DynamoDBConfig(
 
 @dataclass
 class DynamoDBSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/elastic_search.py
CHANGED
@@ -62,6 +62,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.config_clean import remove_protocol
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
 logger = logging.getLogger(__name__)
@@ -189,7 +190,7 @@ class ElasticToSchemaFieldConverter:
 @dataclass
 class ElasticsearchSourceReport(SourceReport):
     index_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_index_scanned(self, index: str) -> None:
         self.index_scanned += 1
datahub/ingestion/source/ge_profiling_config.py
CHANGED
@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )
 
     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days.
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )
 
     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `
-        "Supported for `
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )
 
     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count.
-        "no limit on the row count of tables to profile. Supported only in
-        "Supported for `
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )
 
     profile_table_row_count_estimate_only: bool = Field(
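The reworded descriptions above document four profiling limits. A hedged sketch of setting them; the field names come from this diff and the import path follows the file path in this package, but the chosen values and the assumption that the remaining fields have usable defaults are illustrative only:

from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

profiling = GEProfilingConfig(
    max_number_of_fields_to_profile=50,  # cap on columns profiled per table
    profile_if_updated_since_days=7,     # skip tables not modified in the last week (snowflake/BigQuery)
    profile_table_size_limit=10,         # GB ceiling (Snowflake, BigQuery, Databricks)
    profile_table_row_limit=5_000_000,   # row-count ceiling (Snowflake, BigQuery)
)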
datahub/ingestion/source/iceberg/iceberg_common.py
CHANGED
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict,
+from typing import Any, Dict, Optional
 
 from humanfriendly import format_timespan
 from pydantic import Field, validator
@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
 
 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
datahub/ingestion/source/identity/azure_ad.py
CHANGED
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter, Retry
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mce_builder import make_group_urn, make_user_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -51,6 +52,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         description="regex patterns for groups to include in ingestion.",
     )
 
-
-    filtered_tracking: bool = Field(
-        default=True,
-        description="If enabled, report will contain names of filtered users and groups.",
-    )
+    _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")
 
     # Optional: Whether to mask sensitive information from workunit ID's. On by default.
     mask_group_id: bool = Field(
@@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclass
 class AzureADSourceReport(StaleEntityRemovalSourceReport):
-    filtered:
-    filtered_tracking: bool = field(default=True, repr=False)
-    filtered_count: int = field(default=0)
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_filtered(self, name: str) -> None:
-        self.
-        if self.filtered_tracking:
-            self.filtered.append(name)
+        self.filtered.append(name)
 
 
 # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API
@@ -266,9 +260,7 @@ class AzureADSource(StatefulIngestionSourceBase):
     def __init__(self, config: AzureADConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.config = config
-        self.report = AzureADSourceReport(
-            filtered_tracking=self.config.filtered_tracking
-        )
+        self.report = AzureADSourceReport()
         session = requests.Session()
         retries = Retry(
             total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
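This hunk and the mode.py hunk further down replace a dropped config option with pydantic_removed_field, so recipes that still set the old key keep loading. A minimal sketch of the pattern with a hypothetical config class; the assumption is that pydantic_removed_field silently discards the removed key rather than rejecting it:

from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_removal import pydantic_removed_field


class MyExampleConfig(ConfigModel):  # hypothetical, not part of this diff
    new_flag: bool = True

    # Older recipes that still contain `old_flag` should keep validating.
    _remove_old_flag = pydantic_removed_field("old_flag")


config = MyExampleConfig.parse_obj({"new_flag": False, "old_flag": True})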
datahub/ingestion/source/identity/okta.py
CHANGED
@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 nest_asyncio.apply()
@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
 
 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/kafka/kafka.py
CHANGED
@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum
@@ -190,7 +191,7 @@ def get_kafka_admin_client(
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1
datahub/ingestion/source/kafka_connect/common.py
CHANGED
@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1
datahub/ingestion/source/ldap.py
CHANGED
@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns:
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)
 
     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)
datahub/ingestion/source/looker/lookml_config.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
 from datetime import timedelta
-from typing import Any, Dict,
+from typing import Any, Dict, Literal, Optional, Union
 
 import pydantic
 from pydantic import root_validator, validator
@@ -48,13 +48,17 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
     models_discovered: int = 0
-    models_dropped:
+    models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
-    views_dropped:
-    views_dropped_unreachable:
+    views_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
+    views_dropped_unreachable: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     query_parse_attempts: int = 0
     query_parse_failures: int = 0
-    query_parse_failure_views:
+    query_parse_failure_views: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     _looker_api: Optional[LookerAPI] = None
 
     def report_models_scanned(self) -> None:
datahub/ingestion/source/mode.py
CHANGED
@@ -24,6 +24,7 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponenti
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
@@ -155,10 +156,7 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
     workspace: str = Field(
         description="The Mode workspace name. Find it in Settings > Workspace > Details."
     )
-
-        default="public",
-        description="Default schema to use when schema is not provided in an SQL query",
-    )
+    _default_schema = pydantic_removed_field("default_schema")
 
     space_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(
datahub/ingestion/source/mongodb.py
CHANGED
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     UnionTypeClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -143,7 +144,7 @@ class MongoDBConfig(
 
 @dataclass
 class MongoDBSourceReport(StaleEntityRemovalSourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/nifi.py
CHANGED
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
 )
 from datahub.specific.datajob import DataJobPatchBuilder
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 NIFI = "nifi"
@@ -452,7 +453,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 @dataclass
 class NifiSourceReport(SourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/powerbi/config.py
CHANGED
@@ -195,8 +195,8 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 
     dashboards_scanned: int = 0
     charts_scanned: int = 0
-    filtered_dashboards:
-    filtered_charts:
+    filtered_dashboards: LossyList[str] = dataclass_field(default_factory=LossyList)
+    filtered_charts: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
     m_query_parse_attempts: int = 0
datahub/ingestion/source/powerbi_report_server/report_server.py
CHANGED
@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     StatusClass,
 )
 from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.lossy_collections import LossyList
 
 LOGGER = logging.getLogger(__name__)
 
@@ -476,7 +477,7 @@ class Mapper:
 @dataclass
 class PowerBiReportServerDashboardSourceReport(SourceReport):
     scanned_report: int = 0
-    filtered_reports:
+    filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_scanned(self, count: int = 1) -> None:
         self.scanned_report += count
datahub/ingestion/source/redash.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 from packaging import version
@@ -39,7 +39,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -280,9 +280,9 @@ class RedashConfig(ConfigModel):
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
-    queries_problem_parsing:
-    queries_no_dataset:
-    charts_no_input:
+    queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
+    queries_no_dataset: LossySet[str] = field(default_factory=LossySet)
+    charts_no_input: LossySet[str] = field(default_factory=LossySet)
     total_queries: Optional[int] = field(
         default=None,
     )
datahub/ingestion/source/salesforce.py
CHANGED
@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional
@@ -60,6 +61,7 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -146,8 +148,9 @@ class SalesforceConfig(DatasetSourceConfigMixin):
         return config_clean.remove_trailing_slashes(v)
 
 
+@dataclass
 class SalesforceSourceReport(SourceReport):
-    filtered:
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/snowflake/snowflake_config.py
CHANGED
@@ -308,6 +308,13 @@ class SnowflakeV2Config(
         " assertions CLI in snowflake",
     )
 
+    pushdown_deny_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
+        "Only applicable if `use_queries_v2` is enabled.",
+    )
+
     @validator("convert_urns_to_lowercase")
     def validate_convert_urns_to_lowercase(cls, v):
         if not v:
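The new pushdown_deny_usernames option above excludes named Snowflake users from lineage/usage/queries extraction when use_queries_v2 is enabled. A hypothetical source-config fragment; every key other than the two taken from this diff is an assumption about the Snowflake source config:

snowflake_source_config = {
    "account_id": "my_account",        # assumed connection field
    "use_queries_v2": True,            # the deny list only applies with queries v2
    "pushdown_deny_usernames": [
        "LOOKER_SERVICE_ACCOUNT",      # hypothetical high-volume service users
        "AIRFLOW_BATCH_USER",
    ],
}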
datahub/ingestion/source/snowflake/snowflake_report.py
CHANGED
@@ -12,6 +12,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.lossy_collections import LossyDict
 from datahub.utilities.perf_timer import PerfTimer
 
 if TYPE_CHECKING:
@@ -66,7 +67,7 @@ class SnowflakeReport(SQLSourceReport, BaseTimeWindowReport):
     num_external_table_edges_scanned: int = 0
     ignore_start_time_lineage: Optional[bool] = None
     upstream_lineage_in_report: Optional[bool] = None
-    upstream_lineage:
+    upstream_lineage: LossyDict[str, List[str]] = field(default_factory=LossyDict)
 
     lineage_start_time: Optional[datetime] = None
     lineage_end_time: Optional[datetime] = None
datahub/ingestion/source/snowflake/snowflake_v2.py
CHANGED
@@ -567,6 +567,7 @@ class SnowflakeV2Source(
                 include_queries=self.config.include_queries,
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
+                pushdown_deny_usernames=self.config.pushdown_deny_usernames,
             ),
             structured_report=self.report,
             filters=self.filters,
datahub/ingestion/source/tableau/tableau.py
CHANGED
@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user:
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)
 
     last_authenticated_at: Optional[datetime] = None
 
datahub/ingestion/source/unity/ge_profiler.py
CHANGED
@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional
 
+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection
 
@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA
 
 
 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only
 
         dataset_name = table.ref.qualified_table_name
-
-
-
-
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )
 
         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None
 
+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@ def _get_dataset_size_in_bytes(
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
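The profiler above now probes delta tables with two lightweight queries before building a profiling request: DESCRIBE DETAIL for size and count(*) for row count. A standalone sketch of the same probes; the table name is hypothetical and conn is assumed to be a SQLAlchemy Connection to a Databricks SQL warehouse:

name = "`main`.`sales`.`orders`"  # hypothetical catalog.schema.table

detail = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()  # delta tables only
size_in_bytes = int(detail._asdict()["sizeInBytes"]) if detail else None

row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
rows_count = int(row._asdict()["numRows"]) if row else None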
datahub/ingestion/source/unity/report.py
CHANGED
@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0
 
datahub/ingestion/source_report/pulsar.py
CHANGED
@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import
+from typing import Optional
 
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered:
-    namespaces_filtered:
-    topics_filtered:
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version
datahub/metadata/schema.avsc
CHANGED
@@ -4730,16 +4730,16 @@
 {
   "Relationship": {
     "/*/destinationUrn": {
-      "createdActor": "
-      "createdOn": "
+      "createdActor": "dashboards/*/created/actor",
+      "createdOn": "dashboards/*/created/time",
       "entityTypes": [
        "dashboard"
       ],
       "isLineage": true,
       "name": "DashboardContainsDashboard",
-      "properties": "
-      "updatedActor": "
-      "updatedOn": "
+      "properties": "dashboards/*/properties",
+      "updatedActor": "dashboards/*/lastModified/actor",
+      "updatedOn": "dashboards/*/lastModified/time"
     }
   },
   "type": {