acryl-datahub 0.15.0.5rc9__py3-none-any.whl → 0.15.0.6rc1__py3-none-any.whl
This diff compares the contents of publicly available package versions as published to their supported public registries. It is provided for informational purposes only.
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/METADATA +2431 -2431
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/RECORD +46 -45
- datahub/_version.py +1 -1
- datahub/ingestion/graph/client.py +2 -1
- datahub/ingestion/graph/entity_versioning.py +201 -0
- datahub/ingestion/source/abs/report.py +2 -2
- datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/delta_lake/report.py +2 -2
- datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
- datahub/ingestion/source/elastic_search.py +2 -1
- datahub/ingestion/source/ge_profiling_config.py +11 -7
- datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
- datahub/ingestion/source/identity/okta.py +2 -1
- datahub/ingestion/source/kafka/kafka.py +2 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -1
- datahub/ingestion/source/ldap.py +2 -1
- datahub/ingestion/source/looker/lookml_config.py +9 -5
- datahub/ingestion/source/mongodb.py +2 -1
- datahub/ingestion/source/nifi.py +2 -1
- datahub/ingestion/source/powerbi/config.py +3 -2
- datahub/ingestion/source/powerbi/powerbi.py +28 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
- datahub/ingestion/source/redash.py +5 -5
- datahub/ingestion/source/salesforce.py +4 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
- datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
- datahub/ingestion/source/snowflake/snowflake_report.py +8 -1
- datahub/ingestion/source/snowflake/snowflake_schema.py +98 -4
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +294 -62
- datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
- datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
- datahub/ingestion/source/tableau/tableau.py +2 -1
- datahub/ingestion/source/unity/ge_profiler.py +55 -4
- datahub/ingestion/source/unity/report.py +1 -0
- datahub/ingestion/source_report/pulsar.py +5 -4
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -124,19 +124,20 @@ class SnowflakeFilter:
             SnowflakeObjectDomain.VIEW,
             SnowflakeObjectDomain.MATERIALIZED_VIEW,
             SnowflakeObjectDomain.ICEBERG_TABLE,
+            SnowflakeObjectDomain.STREAM,
         ):
             return False
         if _is_sys_table(dataset_name):
             return False
 
-        dataset_params = _split_qualified_name(dataset_name)
+        dataset_params = split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
                 message=f"Found a {dataset_type} with an unexpected number of parts. Database and schema filtering will not work as expected, but table filtering will still work.",
                 context=dataset_name,
             )
-            # We fall-through here so table/view filtering still works.
+            # We fall-through here so table/view/stream filtering still works.
 
         if (
             len(dataset_params) >= 1
@@ -169,6 +170,14 @@ class SnowflakeFilter:
         ):
             return False
 
+        if (
+            dataset_type.lower() == SnowflakeObjectDomain.STREAM
+            and not self.filter_config.stream_pattern.allowed(
+                _cleanup_qualified_name(dataset_name, self.structured_reporter)
+            )
+        ):
+            return False
+
         return True
 
 
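
The new stream_pattern behaves like the existing table/view filter fields. Assuming it is a standard DataHub AllowDenyPattern (the usual type for these patterns; the field itself is added in snowflake_config.py in this release), its allow/deny semantics over the cleaned "db.schema.stream" name look like this minimal sketch. The patterns shown are hypothetical:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical patterns for illustration; allow rules are regexes matched
# against the fully qualified stream name, and deny rules override allows.
stream_pattern = AllowDenyPattern(
    allow=[r"analytics\.public\..*"],
    deny=[r".*\.tmp_.*"],
)

print(stream_pattern.allowed("analytics.public.orders_stream"))      # True
print(stream_pattern.allowed("analytics.public.tmp_orders_stream"))  # False
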
@@ -183,17 +192,17 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
-def _split_qualified_name(qualified_name: str) -> List[str]:
+def split_qualified_name(qualified_name: str) -> List[str]:
     """
     Split a qualified name into its constituent parts.
 
-    >>> _split_qualified_name("db.my_schema.my_table")
+    >>> split_qualified_name("db.my_schema.my_table")
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    >>> split_qualified_name('"db"."my_schema"."my_table"')
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    >>> split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
     ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
-    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    >>> split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
     ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
     """
 
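
The function body is unchanged by this diff (only the rename from _split_qualified_name is shown), but a minimal quote-aware splitter that satisfies the doctests above could look like the following sketch. It is illustrative only, not the actual implementation:

from typing import List


def split_qualified_name_sketch(qualified_name: str) -> List[str]:
    # Split on dots that occur outside double quotes, so quoted segments
    # such as "SCHEMA.WITH.DOTS" stay intact; the quotes themselves are dropped.
    parts: List[str] = []
    current: List[str] = []
    in_quotes = False
    for char in qualified_name:
        if char == '"':
            in_quotes = not in_quotes
        elif char == "." and not in_quotes:
            parts.append("".join(current))
            current = []
        else:
            current.append(char)
    parts.append("".join(current))
    return parts
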
@@ -231,7 +240,7 @@ def _split_qualified_name(qualified_name: str) -> List[str]:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = _split_qualified_name(qualified_name)
+    name_parts = split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -539,15 +539,27 @@ class SnowflakeV2Source(
             for schema in db.schemas
             for table_name in schema.views
         ]
+        discovered_streams: List[str] = [
+            self.identifiers.get_dataset_identifier(stream_name, schema.name, db.name)
+            for db in databases
+            for schema in db.schemas
+            for stream_name in schema.streams
+        ]
 
-        if len(discovered_tables) == 0 and len(discovered_views) == 0:
+        if (
+            len(discovered_tables) == 0
+            and len(discovered_views) == 0
+            and len(discovered_streams) == 0
+        ):
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
-                "No tables/views found. Please check permissions.",
+                "No tables/views/streams found. Please check permissions.",
             )
             return
 
-        self.discovered_datasets = discovered_tables + discovered_views
+        self.discovered_datasets = (
+            discovered_tables + discovered_views + discovered_streams
+        )
 
         if self.config.use_queries_v2:
             with self.report.new_stage(f"*: {VIEW_PARSING}"):
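
Streams are folded into self.discovered_datasets using the same identifier scheme as tables and views. As a rough sketch (a hypothetical stand-in; the real logic lives in the source's identifier builder and makes casing configurable), the composed identifier is just the dotted three-part name:

# Hypothetical stand-in for self.identifiers.get_dataset_identifier(...):
# database, schema, and object name joined into one dotted identifier,
# lowercased here purely for illustration.
def get_dataset_identifier_sketch(name: str, schema: str, db: str) -> str:
    return f"{db}.{schema}.{name}".lower()


print(get_dataset_identifier_sketch("ORDERS_STREAM", "PUBLIC", "ANALYTICS"))
# -> analytics.public.orders_stream
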
datahub/ingestion/source/tableau/tableau.py

@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)
 
     last_authenticated_at: Optional[datetime] = None
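
Both this change and the pulsar one below swap unbounded lists for LossyList, DataHub's bounded-sample collection, so ingestion reports stay small no matter how many items get recorded. A minimal sketch of the behavior (the exact sample size is an implementation detail of the library):

from datahub.utilities.lossy_collections import LossyList

filtered: LossyList[str] = LossyList()
for i in range(10_000):
    filtered.append(f"item_{i}")

# Only a bounded sample of the 10,000 appended items is kept in memory;
# printing the list should show the sample rather than every element.
print(filtered)
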
datahub/ingestion/source/unity/ge_profiler.py

@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional
 
+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection
 
@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA
 
 
 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only
 
         dataset_name = table.ref.qualified_table_name
-
-
-
-
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )
 
         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None
 
+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@ def _get_dataset_size_in_bytes(
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
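
Both helpers lean on Delta-specific behavior: DESCRIBE DETAIL is a Delta-only command whose result set includes a sizeInBytes column, and count(*) can be answered cheaply on Delta tables from transaction-log metadata. A hedged standalone sketch of the same two lookups, where the connection URL and the catalog/schema/table names are placeholders:

from sqlalchemy import create_engine

# Placeholder Databricks SQLAlchemy URL; substitute a real token/host/http_path.
engine = create_engine("databricks://token:<token>@<host>?http_path=<http_path>")

with engine.connect() as conn:
    name = "`main`.`default`.`my_delta_table`"  # hypothetical table

    # Delta-only: table metadata, including total size on storage.
    detail = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
    if detail is not None:
        print(detail._asdict()["sizeInBytes"])

    # Cheap on Delta, since the count can come from table metadata.
    counted = conn.execute(f"select count(*) as numRows from {name}").fetchone()
    if counted is not None:
        print(counted._asdict()["numRows"])
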
datahub/ingestion/source/unity/report.py

@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0
datahub/ingestion/source_report/pulsar.py

@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional
 
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered: List[str] = field(default_factory=list)
-    namespaces_filtered: List[str] = field(default_factory=list)
-    topics_filtered: List[str] = field(default_factory=list)
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version
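
The default_factory indirection is required here: dataclasses reject mutable defaults (LossyList subclasses list, which is unhashable), and a factory guarantees each report instance gets its own collection. A quick sketch with a hypothetical report class:

from dataclasses import dataclass, field

from datahub.utilities.lossy_collections import LossyList


@dataclass
class ExampleReport:  # hypothetical, for illustration only
    topics_filtered: LossyList[str] = field(default_factory=LossyList)


r1, r2 = ExampleReport(), ExampleReport()
r1.topics_filtered.append("persistent://public/default/topic-1")
print(len(r2.topics_filtered))  # 0: instances do not share state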