acryl-datahub 0.15.0.4rc3__py3-none-any.whl → 0.15.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2507 -2470
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +95 -86
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
- datahub/__init__.py +1 -25
- datahub/_version.py +13 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
- datahub/cli/check_cli.py +1 -1
- datahub/cli/cli_utils.py +3 -3
- datahub/cli/container_cli.py +1 -64
- datahub/cli/iceberg_cli.py +707 -0
- datahub/cli/ingest_cli.py +2 -2
- datahub/emitter/composite_emitter.py +36 -0
- datahub/emitter/rest_emitter.py +1 -1
- datahub/entrypoints.py +26 -5
- datahub/ingestion/api/incremental_lineage_helper.py +4 -0
- datahub/ingestion/api/registry.py +1 -1
- datahub/ingestion/glossary/classification_mixin.py +6 -0
- datahub/ingestion/glossary/classifier.py +3 -2
- datahub/ingestion/graph/client.py +2 -1
- datahub/ingestion/graph/entity_versioning.py +201 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/run/connection.py +1 -1
- datahub/ingestion/run/pipeline.py +3 -3
- datahub/ingestion/source/abs/report.py +2 -2
- datahub/ingestion/source/apply/__init__.py +0 -0
- datahub/ingestion/source/apply/datahub_apply.py +223 -0
- datahub/ingestion/source/aws/glue.py +5 -2
- datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
- datahub/ingestion/source/dbt/dbt_core.py +1 -1
- datahub/ingestion/source/delta_lake/report.py +2 -2
- datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
- datahub/ingestion/source/elastic_search.py +2 -1
- datahub/ingestion/source/ge_profiling_config.py +11 -7
- datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
- datahub/ingestion/source/identity/azure_ad.py +6 -14
- datahub/ingestion/source/identity/okta.py +2 -1
- datahub/ingestion/source/kafka/kafka.py +2 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -1
- datahub/ingestion/source/ldap.py +2 -1
- datahub/ingestion/source/looker/looker_config.py +3 -1
- datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
- datahub/ingestion/source/looker/looker_file_loader.py +14 -3
- datahub/ingestion/source/looker/looker_template_language.py +104 -14
- datahub/ingestion/source/looker/lookml_config.py +29 -8
- datahub/ingestion/source/looker/lookml_source.py +110 -22
- datahub/ingestion/source/mode.py +2 -4
- datahub/ingestion/source/mongodb.py +2 -1
- datahub/ingestion/source/nifi.py +2 -1
- datahub/ingestion/source/powerbi/config.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
- datahub/ingestion/source/redash.py +5 -5
- datahub/ingestion/source/salesforce.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
- datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
- datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
- datahub/ingestion/source/sql/clickhouse.py +5 -43
- datahub/ingestion/source/sql/mssql/job_models.py +37 -8
- datahub/ingestion/source/sql/mssql/source.py +17 -0
- datahub/ingestion/source/sql/sql_config.py +0 -10
- datahub/ingestion/source/tableau/tableau.py +16 -13
- datahub/ingestion/source/tableau/tableau_common.py +1 -1
- datahub/ingestion/source/unity/ge_profiler.py +55 -4
- datahub/ingestion/source/unity/proxy.py +2 -2
- datahub/ingestion/source/unity/report.py +1 -0
- datahub/ingestion/source_config/operation_config.py +9 -0
- datahub/ingestion/source_report/pulsar.py +5 -4
- datahub/metadata/_schema_classes.py +304 -6
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
- datahub/metadata/schema.avsc +211 -12
- datahub/metadata/schemas/AssertionInfo.avsc +2 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
- datahub/metadata/schemas/DashboardInfo.avsc +5 -5
- datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +12 -0
- datahub/metadata/schemas/DisplayProperties.avsc +62 -0
- datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
- datahub/metadata/schemas/PostInfo.avsc +28 -2
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/specific/dashboard.py +43 -1
- datahub/telemetry/telemetry.py +4 -4
- datahub/testing/check_imports.py +28 -0
- datahub/upgrade/upgrade.py +17 -9
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_config.py

@@ -2,8 +2,6 @@ import logging
 from abc import abstractmethod
 from typing import Any, Dict, Optional
 
-import cachetools
-import cachetools.keys
 import pydantic
 from pydantic import Field
 from sqlalchemy.engine import URL
@@ -29,7 +27,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
 from datahub.ingestion.source_config.operation_config import is_profiling_enabled
-from datahub.utilities.cachetools_keys import self_methodkey
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -118,13 +115,6 @@ class SQLCommonConfig(
     # Custom Stateful Ingestion settings
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
 
-    # TRICKY: The operation_config is time-dependent. Because we don't want to change
-    # whether or not we're running profiling mid-ingestion, we cache the result of this method.
-    # TODO: This decorator should be moved to the is_profiling_enabled(operation_config) method.
-    @cachetools.cached(
-        cache=cachetools.LRUCache(maxsize=1),
-        key=self_methodkey,
-    )
     def is_profiling_enabled(self) -> bool:
         return self.profiling.enabled and is_profiling_enabled(
             self.profiling.operation_config
datahub/ingestion/source/tableau/tableau.py

@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user:
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)
 
     last_authenticated_at: Optional[datetime] = None
 
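The report field above switches to a LossyList. A minimal sketch of the pattern, assuming LossyList's default behavior of retaining only a bounded sample of appended items (the field name below is illustrative):

```python
from datahub.utilities.lossy_collections import LossyList

# LossyList behaves like a list but keeps only a bounded sample of appended
# items, so report fields cannot grow without limit on large Tableau sites.
seen_users: LossyList[str] = LossyList()
for i in range(10_000):
    seen_users.append(f"user_{i}")

# Rendering the report shows the retained sample rather than all 10k items.
print(seen_users)
```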
@@ -2428,10 +2429,12 @@ class TableauSiteSource:
             ]
         ],
     ) -> Optional["SqlParsingResult"]:
-
-
-
-
+        database_field = datasource.get(c.DATABASE) or {}
+        database_id: Optional[str] = database_field.get(c.ID)
+        database_name: Optional[str] = database_field.get(c.NAME) or c.UNKNOWN.lower()
+        database_connection_type: Optional[str] = database_field.get(
+            c.CONNECTION_TYPE
+        ) or datasource.get(c.CONNECTION_TYPE)
 
         if (
             datasource.get(c.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False)
@@ -2440,10 +2443,7 @@ class TableauSiteSource:
             logger.debug(f"datasource {datasource_urn} is not created from custom sql")
             return None
 
-        if
-            database_info.get(c.NAME) is None
-            or database_info.get(c.CONNECTION_TYPE) is None
-        ):
+        if database_connection_type is None:
             logger.debug(
                 f"database information is missing from datasource {datasource_urn}"
             )
@@ -2459,14 +2459,14 @@ class TableauSiteSource:
 
         logger.debug(f"Parsing sql={query}")
 
-        upstream_db =
+        upstream_db = database_name
 
         if func_overridden_info is not None:
             # Override the information as per configuration
             upstream_db, platform_instance, platform, _ = func_overridden_info(
-
-
-
+                database_connection_type,
+                database_name,
+                database_id,
                 self.config.platform_instance_map,
                 self.config.lineage_overrides,
                 self.config.database_hostname_to_platform_instance_map,
@@ -2534,6 +2534,9 @@ class TableauSiteSource:
             platform_instance=self.config.platform_instance,
             func_overridden_info=get_overridden_info,
        )
+        logger.debug(
+            f"_create_lineage_from_unsupported_csql parsed_result = {parsed_result}"
+        )
 
         if parsed_result is None:
             return
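The rewritten lookup above prefers fields on the nested database object and only then falls back to the datasource itself, defaulting a missing name to "unknown" instead of aborting lineage parsing. A hedged illustration with plain dicts, using literal keys in place of the c.DATABASE / c.ID / c.NAME / c.CONNECTION_TYPE constants (the literal values are assumptions):

```python
from typing import Optional

# Stand-in for a Tableau datasource payload with a partially populated
# nested database object.
datasource = {
    "connectionType": "snowflake",
    "database": {"id": "db-123", "name": None},
}

database_field = datasource.get("database") or {}
database_id: Optional[str] = database_field.get("id")
# A missing name collapses to "unknown" rather than skipping the datasource.
database_name: Optional[str] = database_field.get("name") or "unknown"
# The connection type prefers the database object, then the datasource itself.
database_connection_type: Optional[str] = database_field.get(
    "connectionType"
) or datasource.get("connectionType")

print(database_id, database_name, database_connection_type)
# db-123 unknown snowflake
```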
datahub/ingestion/source/unity/ge_profiler.py

@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional
 
+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection
 
@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA
 
 
 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only
 
         dataset_name = table.ref.qualified_table_name
-
-
-
-
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )
 
         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None
 
+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@ def _get_dataset_size_in_bytes(
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
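Both new helpers issue plain SQL through the profiler's SQLAlchemy connection. A standalone sketch of the same two queries, assuming the databricks-sql-connector SQLAlchemy dialect, placeholder connection details, and the 1.4-style string execution the source itself uses:

```python
from typing import Optional

from sqlalchemy import create_engine
from sqlalchemy.engine import Connection

# Placeholder URL; requires the databricks-sql-connector SQLAlchemy dialect.
engine = create_engine(
    "databricks://token:<access-token>@<workspace-host>?http_path=<warehouse-http-path>"
)


def delta_size_in_bytes(conn: Connection, name: str) -> Optional[int]:
    # DESCRIBE DETAIL is Delta-specific; it returns a single row of table
    # metadata that includes a sizeInBytes column.
    row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
    return None if row is None else int(row._asdict()["sizeInBytes"])


def delta_row_count(conn: Connection, name: str) -> Optional[int]:
    # Per the comment in the diff above, count(*) is efficient on Delta tables.
    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
    return None if row is None else int(row._asdict()["numRows"])


with engine.connect() as conn:
    print(delta_size_in_bytes(conn, "main.default.my_table"))
    print(delta_row_count(conn, "main.default.my_table"))
```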
datahub/ingestion/source/unity/proxy.py

@@ -26,7 +26,7 @@ from databricks.sdk.service.sql import (
 )
 from databricks.sdk.service.workspace import ObjectType
 
-import
+from datahub._version import nice_version_name
 from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
@@ -103,7 +103,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             host=workspace_url,
             token=personal_access_token,
             product="datahub",
-            product_version=
+            product_version=nice_version_name(),
         )
         self.warehouse_id = warehouse_id or ""
         self.report = report
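The product/product_version pair identifies the caller to the Databricks SDK (it is folded into the SDK's user-agent string), so the client now reports the actual DataHub release via nice_version_name(). A minimal sketch with placeholder credentials:

```python
from databricks.sdk import WorkspaceClient

from datahub._version import nice_version_name

# Placeholder host/token; product and product_version let Databricks
# attribute the SDK's API traffic to a specific DataHub release.
client = WorkspaceClient(
    host="https://<workspace-host>",
    token="<personal-access-token>",
    product="datahub",
    product_version=nice_version_name(),
)
```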
datahub/ingestion/source/unity/report.py

@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0
 
datahub/ingestion/source_config/operation_config.py

@@ -2,10 +2,12 @@ import datetime
 import logging
 from typing import Any, Dict, Optional
 
+import cachetools
 import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import ConfigModel
+from datahub.utilities.cachetools_keys import self_methodkey
 
 logger = logging.getLogger(__name__)
 
@@ -62,6 +64,13 @@ class OperationConfig(ConfigModel):
         return profile_date_of_month
 
 
+# TRICKY: The operation_config is time-dependent. Because we don't want to change
+# whether or not we're running profiling mid-ingestion, we cache the result of this method.
+# An additional benefit is that we only print the log lines on the first call.
+@cachetools.cached(
+    cache=cachetools.LRUCache(maxsize=1),
+    key=self_methodkey,
+)
 def is_profiling_enabled(operation_config: OperationConfig) -> bool:
     if operation_config.lower_freq_profile_enabled is False:
         return True
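This is the decorator that 0.15.0.4rc3 had attached to SQLCommonConfig.is_profiling_enabled (see the sql_config.py hunks above); it now memoizes the module-level check instead. A minimal sketch of the caching pattern, using a local key function as a stand-in for self_methodkey (its exact behavior is an assumption):

```python
import time
from typing import Any

import cachetools
import cachetools.keys


# Stand-in for datahub.utilities.cachetools_keys.self_methodkey; assumed to key
# on the identity of the first argument so distinct objects cache separately.
def _first_arg_identity_key(first: Any, *args: Any, **kwargs: Any) -> tuple:
    return cachetools.keys.hashkey(id(first), *args, **kwargs)


# An LRU cache of size 1 memoizes the most recent answer, so a time-dependent
# check cannot flip mid-run for the object that keeps calling it.
@cachetools.cached(
    cache=cachetools.LRUCache(maxsize=1),
    key=_first_arg_identity_key,
)
def is_even_minute(config: object) -> bool:
    return (int(time.time()) // 60) % 2 == 0


cfg = object()
first = is_even_minute(cfg)
assert is_even_minute(cfg) == first  # cached: stable for the whole run
```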
datahub/ingestion/source_report/pulsar.py

@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import
+from typing import Optional
 
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered:
-    namespaces_filtered:
-    topics_filtered:
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version