acryl-datahub 1.2.0.4rc1__py3-none-any.whl → 1.2.0.4rc3__py3-none-any.whl
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/METADATA +2397 -2396
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/RECORD +42 -41
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +3 -3
- datahub/api/entities/external/restricted_text.py +3 -3
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/quickstart_versioning.py +1 -1
- datahub/cli/specific/assertions_cli.py +37 -2
- datahub/cli/specific/datacontract_cli.py +54 -4
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
- datahub/ingestion/api/report.py +21 -2
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/aws/tag_entities.py +2 -2
- datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
- datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
- datahub/ingestion/source/grafana/models.py +6 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +4 -4
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/slack/slack.py +7 -14
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
- datahub/ingestion/source/tableau/tableau.py +1 -1
- datahub/ingestion/source/unity/config.py +36 -1
- datahub/ingestion/source/unity/proxy.py +332 -46
- datahub/ingestion/source/unity/proxy_types.py +12 -2
- datahub/ingestion/source/unity/source.py +91 -34
- datahub/ingestion/source/unity/tag_entities.py +2 -2
- datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/sdk/search_client.py +3 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataset.py +37 -59
- datahub/utilities/server_config_util.py +2 -1
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/data_lake_common/path_spec.py
+++ b/datahub/ingestion/source/data_lake_common/path_spec.py
@@ -62,7 +62,6 @@ class SortKey(ConfigModel):
 
     date_format: Optional[str] = Field(
         default=None,
-        type=str,
         description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
     )
 
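
The dropped `type=str` keyword was redundant at best: pydantic infers the field type from the annotation, and unknown keyword arguments to `Field()` are merely stashed as extra metadata in v1 and deprecated in v2. A minimal sketch of the cleaned-up field (the description is abbreviated here):

    from typing import Optional

    from pydantic import BaseModel, Field


    class SortKeySketch(BaseModel):
        # The annotation alone carries the type; no type= argument is needed.
        date_format: Optional[str] = Field(
            default=None,
            description="Java SimpleDateFormat pattern used to parse dates.",
        )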
@@ -260,7 +259,7 @@ class PathSpec(ConfigModel):
     ) -> Union[None, parse.Result, parse.Match]:
         return self.compiled_folder_include.parse(path)
 
-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def validate_no_double_stars(cls, values: Dict) -> Dict:
         if "include" not in values:
             return values
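
`skip_on_failure=True` is a pydantic v2 migration requirement: v2's backwards-compatibility shim rejects a values-stage `@root_validator()` that does not set it, and with it the validator is skipped whenever an earlier field fails, so the body never sees a partially populated `values` dict. A minimal sketch, with an illustrative check body rather than the real PathSpec logic:

    from typing import Dict

    import pydantic


    class IncludeSketch(pydantic.BaseModel):
        include: str

        @pydantic.root_validator(skip_on_failure=True)
        def validate_no_double_stars(cls, values: Dict) -> Dict:
            if "include" not in values:
                return values
            if values["include"].count("**") > 1:  # illustrative check
                raise ValueError("include cannot contain multiple '**'")
            return values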
@@ -456,7 +455,11 @@ class PathSpec(ConfigModel):
         partition = partition.rsplit("/", 1)[0]
         for partition_key in partition.split("/"):
             if partition_key.find("=") != -1:
-
+                key_value = partition_key.split(
+                    "=", 1
+                )  # Split into at most 2 parts
+                if len(key_value) == 2:
+                    partition_keys.append((key_value[0], key_value[1]))
             else:
                 partition_split = partition.rsplit("/", 1)
                 if len(partition_split) == 1:
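
`maxsplit=1` keeps partition values intact when they themselves contain `=`, and the `len(key_value) == 2` check guards malformed segments:

    print("ds=2024-01-01".split("=", 1))  # ['ds', '2024-01-01']
    print("token=abc=def".split("=", 1))  # ['token', 'abc=def'], value keeps its '='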
--- a/datahub/ingestion/source/dbt/dbt_cloud.py
+++ b/datahub/ingestion/source/dbt/dbt_cloud.py
@@ -370,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             name = node["alias"]
 
         comment = node.get("comment", "")
-
-
-
+
+        # In dbt sources, there are two types of descriptions:
+        # - description: table-level description (specific to the source table)
+        # - sourceDescription: schema-level description (describes the overall source schema)
+        # The table-level description should take precedence since it's more specific.
+        description = node["description"] or node.get("sourceDescription", "")
 
         if node["resourceType"] == "model":
             materialization = node["materializedType"]
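
The restored one-liner relies on `or` short-circuiting: the schema-level `sourceDescription` is used only when the table-level `description` is falsy (empty or missing), which is what gives the table-level text precedence. For example:

    node = {"description": "", "sourceDescription": "Raw events schema"}
    print(node["description"] or node.get("sourceDescription", ""))  # Raw events schema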
--- a/datahub/ingestion/source/fivetran/fivetran_log_api.py
+++ b/datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -69,9 +69,10 @@ class FivetranLogAPI:
             fivetran_log_query.set_schema(bigquery_destination_config.dataset)
 
             # The "database" should be the BigQuery project name.
-
-
-
+            result = engine.execute("SELECT @@project_id").fetchone()
+            if result is None:
+                raise ValueError("Failed to retrieve BigQuery project ID")
+            fivetran_log_database = result[0]
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."
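
The `None` check matters because `fetchone()` returns `None` when the query yields no rows, and indexing `None` would raise an opaque `TypeError` far from the real cause. A rough sketch of the same guard against SQLAlchemy 1.x's legacy `Engine.execute` (SQLite standing in for the BigQuery engine):

    from sqlalchemy import create_engine

    engine = create_engine("sqlite://")
    result = engine.execute("SELECT 'my-gcp-project'").fetchone()
    if result is None:
        raise ValueError("Failed to retrieve BigQuery project ID")
    print(result[0])  # my-gcp-project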
--- a/datahub/ingestion/source/grafana/models.py
+++ b/datahub/ingestion/source/grafana/models.py
@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional
 
 from pydantic import BaseModel, Field
 
+from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey
 
 # Grafana-specific type definitions for better type safety
@@ -106,6 +107,11 @@ class Folder(BaseModel):
     title: str
     description: Optional[str] = ""
 
+    if PYDANTIC_VERSION_2:
+        from pydantic import ConfigDict
+
+        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
+
 
 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""
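
This guards against pydantic v2's stricter coercion: v1 silently turned numeric JSON values into `str`, while v2 rejects an `int` for a `str` field unless `coerce_numbers_to_str=True` is set, presumably because Grafana payloads can carry numeric ids. A minimal sketch assuming pydantic v2:

    from pydantic import BaseModel, ConfigDict


    class FolderSketch(BaseModel):
        model_config = ConfigDict(coerce_numbers_to_str=True)

        id: str


    print(FolderSketch(id=123).id)  # "123" instead of a ValidationError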
--- a/datahub/ingestion/source/hex/hex.py
+++ b/datahub/ingestion/source/hex/hex.py
@@ -69,7 +69,7 @@ class HexSourceConfig(
     )
     include_components: bool = Field(
         default=True,
-
+        description="Include Hex Components in the ingestion",
     )
     page_size: int = Field(
         default=HEX_API_PAGE_SIZE_DEFAULT,
--- a/datahub/ingestion/source/iceberg/iceberg.py
+++ b/datahub/ingestion/source/iceberg/iceberg.py
@@ -524,11 +524,11 @@ class IcebergSource(StatefulIngestionSourceBase):
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
         last_modified: Optional[int] = table.metadata.last_updated_ms
-        if table.current_snapshot():
-            custom_properties["snapshot-id"] = str(
-            custom_properties["manifest-list"] =
+        if current_snapshot := table.current_snapshot():
+            custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
+            custom_properties["manifest-list"] = current_snapshot.manifest_list
             if not last_modified:
-                last_modified = int(
+                last_modified = int(current_snapshot.timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])
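
The walrus operator binds the snapshot once, so the truthiness test and the attribute reads all use the same object instead of calling `table.current_snapshot()` repeatedly. The pattern in isolation:

    def current_snapshot():  # stand-in for table.current_snapshot()
        return {"snapshot_id": 42, "manifest_list": "s3://bucket/manifest.avro"}

    if snapshot := current_snapshot():
        print(snapshot["snapshot_id"], snapshot["manifest_list"])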
--- a/datahub/ingestion/source/powerbi_report_server/report_server_domain.py
+++ b/datahub/ingestion/source/powerbi_report_server/report_server_domain.py
@@ -27,10 +27,8 @@ class CatalogItem(BaseModel):
     is_favorite: bool = Field(alias="IsFavorite")
     user_info: Any = Field(None, alias="UserInfo")
     display_name: Optional[str] = Field(None, alias="DisplayName")
-    has_data_sources: bool = Field(
-    data_sources: Optional[List["DataSource"]] = Field(
-        default_factory=list, alias="DataSources"
-    )
+    has_data_sources: bool = Field(False, alias="HasDataSources")
+    data_sources: Optional[List["DataSource"]] = Field(None, alias="DataSources")
 
     @validator("display_name", always=True)
     def validate_diplay_name(cls, value, values):
--- a/datahub/ingestion/source/redshift/datashares.py
+++ b/datahub/ingestion/source/redshift/datashares.py
@@ -26,7 +26,7 @@ from datahub.utilities.search_utils import LogicalOperator
 
 class OutboundSharePlatformResource(BaseModel):
     namespace: str
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None
     env: str
     source_database: str
     share_name: str
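
The added `= None` is a pydantic v2 migration detail: in v1, an `Optional[...]` annotation implied a default of `None`, while v2 treats such a field as required unless the default is explicit. The same fix shows up in the snowflake_lineage_v2.py hunks below. For example:

    from typing import Optional

    from pydantic import BaseModel


    class Share(BaseModel):
        namespace: str
        platform_instance: Optional[str] = None  # required under v2 without "= None"


    Share(namespace="x")  # ok; dropping "= None" makes this raise under pydantic v2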
--- a/datahub/ingestion/source/slack/slack.py
+++ b/datahub/ingestion/source/slack/slack.py
@@ -203,38 +203,31 @@ class SlackSourceConfig(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email`, `users.profile:read`, and `team:read` scopes.",
     )
     enrich_user_metadata: bool = Field(
-
-        default=True,
+        True,
         description="When enabled, will enrich provisioned DataHub users' metadata with information from Slack.",
     )
     ingest_users: bool = Field(
-
-        default=True,
+        True,
         description="Whether to ingest users. When set to true, will ingest all users in the Slack workspace (as platform resources) to simplify user enrichment after they are provisioned on DataHub.",
     )
     api_requests_per_min: int = Field(
-
-        default=10,
+        10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
     ingest_public_channels: bool = Field(
-
-        default=False,
+        False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
     channels_iteration_limit: int = Field(
-
-        default=200,
+        200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
     channel_min_members: int = Field(
-
-        default=2,
+        2,
         description="Ingest channels with at least this many members.",
     )
     should_ingest_archived_channels: bool = Field(
-
-        default=False,
+        False,
         description="Whether to ingest archived channels.",
     )
 
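
The replacement passes each default positionally; the first positional parameter of `Field()` is `default`, so `Field(True, description=...)` and `Field(default=True, description=...)` are equivalent:

    from pydantic import BaseModel, Field


    class Sketch(BaseModel):
        ingest_users: bool = Field(True, description="Whether to ingest users.")


    print(Sketch().ingest_users)  # True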
--- a/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
+++ b/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -72,7 +72,7 @@ class ColumnUpstreamJob(BaseModel):
 
 
 class ColumnUpstreamLineage(BaseModel):
-    column_name: Optional[str]
+    column_name: Optional[str] = None
     upstreams: List[ColumnUpstreamJob] = Field(default_factory=list)
 
 
@@ -91,9 +91,9 @@ class Query(BaseModel):
 class UpstreamLineageEdge(BaseModel):
     DOWNSTREAM_TABLE_NAME: str
     DOWNSTREAM_TABLE_DOMAIN: str
-    UPSTREAM_TABLES: Optional[List[UpstreamTableNode]]
-    UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]]
-    QUERIES: Optional[List[Query]]
+    UPSTREAM_TABLES: Optional[List[UpstreamTableNode]] = None
+    UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]] = None
+    QUERIES: Optional[List[Query]] = None
 
     _json_upstream_tables = pydantic_parse_json("UPSTREAM_TABLES")
     _json_upstream_columns = pydantic_parse_json("UPSTREAM_COLUMNS")
--- a/datahub/ingestion/source/tableau/tableau.py
+++ b/datahub/ingestion/source/tableau/tableau.py
@@ -1184,7 +1184,7 @@ class TableauSiteSource:
             self.report.warning(
                 title="Incomplete project hierarchy",
                 message="Project details missing. Child projects will be ingested without reference to their parent project. We generally need Site Administrator Explorer permissions to extract the complete project hierarchy.",
-                context=f"Missing {project.parent_id}, referenced by {project.id} {project.
+                context=f"Missing {project.parent_id}, referenced by {project.id} {project.name}",
             )
             project.parent_id = None
 
--- a/datahub/ingestion/source/unity/config.py
+++ b/datahub/ingestion/source/unity/config.py
@@ -8,7 +8,7 @@ import pydantic
 from pydantic import Field
 from typing_extensions import Literal
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigEnum, ConfigModel
 from datahub.configuration.source_common import (
     DatasetSourceConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -36,6 +36,12 @@ from datahub.utilities.global_warning_util import add_global_warning
 logger = logging.getLogger(__name__)
 
 
+class LineageDataSource(ConfigEnum):
+    AUTO = "AUTO"
+    SYSTEM_TABLES = "SYSTEM_TABLES"
+    API = "API"
+
+
 class UnityCatalogProfilerConfig(ConfigModel):
     method: str = Field(
         description=(
@@ -243,6 +249,21 @@ class UnityCatalogSourceConfig(
         description="Option to enable/disable lineage generation. Currently we have to call a rest call per column to get column level lineage due to the Databrick api which can slow down ingestion. ",
     )
 
+    lineage_data_source: LineageDataSource = pydantic.Field(
+        default=LineageDataSource.AUTO,
+        description=(
+            "Source for lineage data extraction. Options: "
+            f"'{LineageDataSource.AUTO.value}' - Use system tables when SQL warehouse is available, fallback to API; "
+            f"'{LineageDataSource.SYSTEM_TABLES.value}' - Force use of system.access.table_lineage and system.access.column_lineage tables (requires SQL warehouse); "
+            f"'{LineageDataSource.API.value}' - Force use of REST API endpoints for lineage data"
+        ),
+    )
+
+    ignore_start_time_lineage: bool = pydantic.Field(
+        default=False,
+        description="Option to ignore the start_time and retrieve all available lineage. When enabled, the start_time filter will be set to zero to extract all lineage events regardless of the configured time window.",
+    )
+
     column_lineage_column_limit: int = pydantic.Field(
         default=300,
         description="Limit the number of columns to get column level lineage. ",
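
A hypothetical recipe fragment exercising the new fields, shown as a Python dict; the `workspace_url`, `token`, and `warehouse_id` values are placeholders. The root validator added below rejects the `SYSTEM_TABLES` choice when `warehouse_id` is missing:

    unity_source_config = {
        "workspace_url": "https://example.cloud.databricks.com",  # placeholder
        "token": "<personal-access-token>",  # placeholder
        "warehouse_id": "abc123",  # required when forcing SYSTEM_TABLES
        "lineage_data_source": "SYSTEM_TABLES",
        "ignore_start_time_lineage": True,  # pull all lineage, ignore start_time
    }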
@@ -362,6 +383,20 @@ class UnityCatalogSourceConfig(
 
         return values
 
+    @pydantic.root_validator(skip_on_failure=True)
+    def validate_lineage_data_source_with_warehouse(
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        lineage_data_source = values.get("lineage_data_source", LineageDataSource.AUTO)
+        warehouse_id = values.get("warehouse_id")
+
+        if lineage_data_source == LineageDataSource.SYSTEM_TABLES and not warehouse_id:
+            raise ValueError(
+                f"lineage_data_source='{LineageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
+            )
+
+        return values
+
     @pydantic.validator("schema_pattern", always=True)
     def schema_pattern_should__always_deny_information_schema(
         cls, v: AllowDenyPattern