acryl-datahub 0.15.0.5rc9__py3-none-any.whl → 0.15.0.5rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release has been flagged as potentially problematic.
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/METADATA +2491 -2491
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/RECORD +33 -32
- datahub/_version.py +1 -1
- datahub/ingestion/graph/client.py +2 -1
- datahub/ingestion/graph/entity_versioning.py +201 -0
- datahub/ingestion/source/abs/report.py +2 -2
- datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
- datahub/ingestion/source/delta_lake/report.py +2 -2
- datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
- datahub/ingestion/source/elastic_search.py +2 -1
- datahub/ingestion/source/ge_profiling_config.py +11 -7
- datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
- datahub/ingestion/source/identity/okta.py +2 -1
- datahub/ingestion/source/kafka/kafka.py +2 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -1
- datahub/ingestion/source/ldap.py +2 -1
- datahub/ingestion/source/looker/lookml_config.py +9 -5
- datahub/ingestion/source/mongodb.py +2 -1
- datahub/ingestion/source/nifi.py +2 -1
- datahub/ingestion/source/powerbi/config.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
- datahub/ingestion/source/redash.py +5 -5
- datahub/ingestion/source/salesforce.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_report.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +2 -1
- datahub/ingestion/source/unity/ge_profiler.py +55 -4
- datahub/ingestion/source/unity/report.py +1 -0
- datahub/ingestion/source_report/pulsar.py +5 -4
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/kafka_connect/common.py
CHANGED
@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1
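The hunk above and most of the hunks below share one pattern: report fields that previously accumulated plain, unbounded collections are re-declared as LossyList (and, further down, LossyDict and LossySet) from datahub.utilities.lossy_collections, which keep only a bounded sample of what is appended so long ingestion runs do not inflate the report. A minimal sketch of the pattern, assuming only that LossyList behaves as a size-capped list (the exact cap and truncation behaviour live in the DataHub utility and are not shown in this diff); the report class and entity names here are illustrative:

from dataclasses import dataclass, field

from datahub.utilities.lossy_collections import LossyList


@dataclass
class ExampleSourceReport:
    # Illustrative report class mirroring the pattern in this diff:
    # LossyList keeps only a bounded sample of appended items.
    filtered: LossyList[str] = field(default_factory=LossyList)

    def report_dropped(self, name: str) -> None:
        self.filtered.append(name)


report = ExampleSourceReport()
for i in range(10_000):
    report.report_dropped(f"entity_{i}")
# Only a truncated sample of the 10,000 dropped names is retained in the report.
print(report.filtered)

The default_factory is needed because a mutable default would otherwise be shared across report instances.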
datahub/ingestion/source/ldap.py
CHANGED
@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
 
 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns:
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)
 
     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)
datahub/ingestion/source/looker/lookml_config.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
 from datetime import timedelta
-from typing import Any, Dict,
+from typing import Any, Dict, Literal, Optional, Union
 
 import pydantic
 from pydantic import root_validator, validator
@@ -48,13 +48,17 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
     models_discovered: int = 0
-    models_dropped:
+    models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
-    views_dropped:
-    views_dropped_unreachable:
+    views_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
+    views_dropped_unreachable: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     query_parse_attempts: int = 0
     query_parse_failures: int = 0
-    query_parse_failure_views:
+    query_parse_failure_views: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     _looker_api: Optional[LookerAPI] = None
 
     def report_models_scanned(self) -> None:
datahub/ingestion/source/mongodb.py
CHANGED
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     UnionTypeClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -143,7 +144,7 @@ class MongoDBConfig(
 
 @dataclass
 class MongoDBSourceReport(StaleEntityRemovalSourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/nifi.py
CHANGED
@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
 )
 from datahub.specific.datajob import DataJobPatchBuilder
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 NIFI = "nifi"
@@ -452,7 +453,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 @dataclass
 class NifiSourceReport(SourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/powerbi/config.py
CHANGED
@@ -195,8 +195,8 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 
     dashboards_scanned: int = 0
     charts_scanned: int = 0
-    filtered_dashboards:
-    filtered_charts:
+    filtered_dashboards: LossyList[str] = dataclass_field(default_factory=LossyList)
+    filtered_charts: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
     m_query_parse_attempts: int = 0
datahub/ingestion/source/powerbi_report_server/report_server.py
CHANGED
@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     StatusClass,
 )
 from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.lossy_collections import LossyList
 
 LOGGER = logging.getLogger(__name__)
 
@@ -476,7 +477,7 @@ class Mapper:
 @dataclass
 class PowerBiReportServerDashboardSourceReport(SourceReport):
     scanned_report: int = 0
-    filtered_reports:
+    filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_scanned(self, count: int = 1) -> None:
         self.scanned_report += count
datahub/ingestion/source/redash.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 from packaging import version
@@ -39,7 +39,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
@@ -280,9 +280,9 @@ class RedashConfig(ConfigModel):
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
-    queries_problem_parsing:
-    queries_no_dataset:
-    charts_no_input:
+    queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
+    queries_no_dataset: LossySet[str] = field(default_factory=LossySet)
+    charts_no_input: LossySet[str] = field(default_factory=LossySet)
     total_queries: Optional[int] = field(
         default=None,
     )
datahub/ingestion/source/salesforce.py
CHANGED
@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional
@@ -60,6 +61,7 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 
 logger = logging.getLogger(__name__)
 
@@ -146,8 +148,9 @@ class SalesforceConfig(DatasetSourceConfigMixin):
         return config_clean.remove_trailing_slashes(v)
 
 
+@dataclass
 class SalesforceSourceReport(SourceReport):
-    filtered:
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
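Besides switching filtered to a LossyList, the salesforce hunk adds the @dataclass decorator to SalesforceSourceReport. That decorator is what makes the field(default_factory=...) declaration take effect: on an undecorated class the attribute would stay bound to the raw dataclasses.Field object and appending to it would fail. A small standalone illustration of that general Python rule (class names here are made up, not taken from the DataHub codebase):

from dataclasses import dataclass, field
from typing import List


@dataclass
class BaseReport:
    events: List[str] = field(default_factory=list)


@dataclass  # without this decorator, "filtered" below stays a dataclasses.Field object
class DerivedReport(BaseReport):
    filtered: List[str] = field(default_factory=list)


r = DerivedReport()
r.filtered.append("dropped-object")  # works because DerivedReport is itself a dataclass
print(r.filtered)  # ['dropped-object']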
datahub/ingestion/source/snowflake/snowflake_report.py
CHANGED
@@ -12,6 +12,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.lossy_collections import LossyDict
 from datahub.utilities.perf_timer import PerfTimer
 
 if TYPE_CHECKING:
@@ -66,7 +67,7 @@ class SnowflakeReport(SQLSourceReport, BaseTimeWindowReport):
     num_external_table_edges_scanned: int = 0
     ignore_start_time_lineage: Optional[bool] = None
     upstream_lineage_in_report: Optional[bool] = None
-    upstream_lineage:
+    upstream_lineage: LossyDict[str, List[str]] = field(default_factory=LossyDict)
 
     lineage_start_time: Optional[datetime] = None
     lineage_end_time: Optional[datetime] = None
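The Snowflake report applies the same idea to a mapping: upstream_lineage becomes a LossyDict keyed by dataset, with each value a list of upstreams, so only a bounded number of entries survives into the report. A short usage sketch, assuming LossyDict caps how many keys it retains (the cap is internal to datahub.utilities.lossy_collections); the field name and URN values are illustrative:

from dataclasses import dataclass, field
from typing import List

from datahub.utilities.lossy_collections import LossyDict


@dataclass
class ExampleLineageReport:
    # dataset urn -> upstream dataset urns; LossyDict keeps a bounded number of keys.
    upstream_lineage: LossyDict[str, List[str]] = field(default_factory=LossyDict)


report = ExampleLineageReport()
for i in range(5_000):
    report.upstream_lineage[f"urn:li:dataset:table_{i}"] = [f"urn:li:dataset:source_{i}"]
# Only a bounded sample of the 5,000 mappings is kept in the report.
print(report.upstream_lineage)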
datahub/ingestion/source/tableau/tableau.py
CHANGED
@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user:
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)
 
     last_authenticated_at: Optional[datetime] = None
 
datahub/ingestion/source/unity/ge_profiler.py
CHANGED
@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional
 
+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection
 
@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA
 
 
 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only
 
         dataset_name = table.ref.qualified_table_name
-
-
-
-
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )
 
         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None
 
+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+        if table.rows_count is None:
+            self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@ def _get_dataset_size_in_bytes(
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
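The profiler above now gathers table-level stats only for Delta tables, using two cheap metadata queries: DESCRIBE DETAIL, which exposes sizeInBytes for Delta tables (per the Databricks docs linked in the hunk), and SELECT count(*), which the diff's comments note is efficient for Delta. A standalone sketch of issuing the same two queries over a SQLAlchemy connection; the connection URL and table name are placeholders, and the result-column names follow the Databricks documentation rather than anything guaranteed by this diff:

from typing import Optional, Tuple

from sqlalchemy import create_engine

# Placeholder URL; a real Databricks SQL warehouse connection string would go here.
engine = create_engine("databricks://token:<access_token>@<host>?http_path=<http_path>")


def table_size_and_rows(fully_qualified_name: str) -> Tuple[Optional[int], Optional[int]]:
    """Return (sizeInBytes, row count) for a Delta table, or None where unavailable."""
    with engine.connect() as conn:
        detail = conn.execute(f"DESCRIBE DETAIL {fully_qualified_name}").fetchone()
        size = int(detail._asdict()["sizeInBytes"]) if detail is not None else None

        row = conn.execute(f"SELECT count(*) AS numRows FROM {fully_qualified_name}").fetchone()
        rows = int(row._asdict()["numRows"]) if row is not None else None
    return size, rows


# Hypothetical usage:
# print(table_size_and_rows("main.analytics.orders"))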
datahub/ingestion/source/unity/report.py
CHANGED
@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0
 
datahub/ingestion/source_report/pulsar.py
CHANGED
@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import
+from typing import Optional
 
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered:
-    namespaces_filtered:
-    topics_filtered:
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version
{acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/LICENSE
RENAMED
File without changes
{acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/WHEEL
RENAMED
File without changes
{acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/entry_points.txt
RENAMED
File without changes
{acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/top_level.txt
RENAMED
File without changes