acryl-datahub 1.2.0.4rc1__py3-none-any.whl → 1.2.0.4rc3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.
Files changed (42)
  1. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/METADATA +2397 -2396
  2. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/RECORD +42 -41
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +3 -3
  5. datahub/api/entities/external/restricted_text.py +3 -3
  6. datahub/api/entities/forms/forms.py +3 -3
  7. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  8. datahub/cli/quickstart_versioning.py +1 -1
  9. datahub/cli/specific/assertions_cli.py +37 -2
  10. datahub/cli/specific/datacontract_cli.py +54 -4
  11. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
  12. datahub/ingestion/api/report.py +21 -2
  13. datahub/ingestion/source/abs/config.py +1 -1
  14. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  15. datahub/ingestion/source/aws/tag_entities.py +2 -2
  16. datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
  17. datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
  18. datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
  19. datahub/ingestion/source/grafana/models.py +6 -0
  20. datahub/ingestion/source/hex/hex.py +1 -1
  21. datahub/ingestion/source/iceberg/iceberg.py +4 -4
  22. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  23. datahub/ingestion/source/redshift/datashares.py +1 -1
  24. datahub/ingestion/source/slack/slack.py +7 -14
  25. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
  26. datahub/ingestion/source/tableau/tableau.py +1 -1
  27. datahub/ingestion/source/unity/config.py +36 -1
  28. datahub/ingestion/source/unity/proxy.py +332 -46
  29. datahub/ingestion/source/unity/proxy_types.py +12 -2
  30. datahub/ingestion/source/unity/source.py +91 -34
  31. datahub/ingestion/source/unity/tag_entities.py +2 -2
  32. datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
  33. datahub/ingestion/transformer/base_transformer.py +8 -5
  34. datahub/sdk/search_client.py +3 -0
  35. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  36. datahub/specific/datajob.py +15 -1
  37. datahub/specific/dataset.py +37 -59
  38. datahub/utilities/server_config_util.py +2 -1
  39. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/WHEEL +0 -0
  40. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/entry_points.txt +0 -0
  41. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/licenses/LICENSE +0 -0
  42. {acryl_datahub-1.2.0.4rc1.dist-info → acryl_datahub-1.2.0.4rc3.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/data_lake_common/path_spec.py
@@ -62,7 +62,6 @@ class SortKey(ConfigModel):
 
     date_format: Optional[str] = Field(
         default=None,
-        type=str,
         description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
     )
 
@@ -260,7 +259,7 @@ class PathSpec(ConfigModel):
     ) -> Union[None, parse.Result, parse.Match]:
         return self.compiled_folder_include.parse(path)
 
-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def validate_no_double_stars(cls, values: Dict) -> Dict:
         if "include" not in values:
             return values
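
Note on the validator change above: under pydantic v1 a bare `@root_validator()` still runs when a field validator has already failed, so `values` may be missing keys, and pydantic v2 refuses to define the class at all unless `skip_on_failure=True` is passed. A minimal standalone sketch of the pattern (hypothetical `PathConfig` model, not DataHub code):

# Sketch only: shows the skip_on_failure pattern on a toy model.
from typing import Dict

import pydantic


class PathConfig(pydantic.BaseModel):
    include: str

    @pydantic.root_validator(skip_on_failure=True)
    def no_double_stars(cls, values: Dict) -> Dict:
        # Only runs when field validation succeeded, so "include" is present.
        if "**" in values.get("include", ""):
            raise ValueError("double stars ('**') are not supported")
        return values


print(PathConfig(include="s3://bucket/*/table/{table}/"))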
@@ -456,7 +455,11 @@ class PathSpec(ConfigModel):
             partition = partition.rsplit("/", 1)[0]
             for partition_key in partition.split("/"):
                 if partition_key.find("=") != -1:
-                    partition_keys.append(tuple(partition_key.split("=")))
+                    key_value = partition_key.split(
+                        "=", 1
+                    )  # Split into at most 2 parts
+                    if len(key_value) == 2:
+                        partition_keys.append((key_value[0], key_value[1]))
                 else:
                     partition_split = partition.rsplit("/", 1)
                     if len(partition_split) == 1:
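
The switch from `split("=")` to `split("=", 1)` above guards against partition values that themselves contain an equals sign, which previously produced tuples with more than two elements. A quick illustration with a made-up partition folder name:

# Hive-style partition segment whose value contains '='.
partition_key = "filter=foo=bar"

# Old behaviour: an unexpected 3-tuple.
print(tuple(partition_key.split("=")))   # ('filter', 'foo', 'bar')

# New behaviour: split at most once and keep only well-formed (key, value) pairs.
key_value = partition_key.split("=", 1)
if len(key_value) == 2:
    print((key_value[0], key_value[1]))  # ('filter', 'foo=bar')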

datahub/ingestion/source/dbt/dbt_cloud.py
@@ -370,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
             name = node["alias"]
 
         comment = node.get("comment", "")
-        description = node["description"]
-        if node.get("sourceDescription"):
-            description = node["sourceDescription"]
+
+        # In dbt sources, there are two types of descriptions:
+        # - description: table-level description (specific to the source table)
+        # - sourceDescription: schema-level description (describes the overall source schema)
+        # The table-level description should take precedence since it's more specific.
+        description = node["description"] or node.get("sourceDescription", "")
 
         if node["resourceType"] == "model":
             materialization = node["materializedType"]
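
The rewritten block makes the table-level `description` win over the schema-level `sourceDescription`, reversing the old precedence. A small sketch of the new fallback behaviour with illustrative node dicts:

def pick_description(node: dict) -> str:
    # Table-level description first, schema-level sourceDescription as fallback.
    return node["description"] or node.get("sourceDescription", "")


print(pick_description({"description": "orders table", "sourceDescription": "raw schema"}))  # 'orders table'
print(pick_description({"description": "", "sourceDescription": "raw schema"}))              # 'raw schema'
print(pick_description({"description": ""}))                                                 # ''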

datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -69,9 +69,10 @@ class FivetranLogAPI:
             fivetran_log_query.set_schema(bigquery_destination_config.dataset)
 
             # The "database" should be the BigQuery project name.
-            fivetran_log_database = engine.execute(
-                "SELECT @@project_id"
-            ).fetchone()[0]
+            result = engine.execute("SELECT @@project_id").fetchone()
+            if result is None:
+                raise ValueError("Failed to retrieve BigQuery project ID")
+            fivetran_log_database = result[0]
         else:
             raise ConfigurationError(
                 f"Destination platform '{destination_platform}' is not yet supported."

datahub/ingestion/source/grafana/models.py
@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional
 
 from pydantic import BaseModel, Field
 
+from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey
 
 # Grafana-specific type definitions for better type safety
@@ -106,6 +107,11 @@ class Folder(BaseModel):
     title: str
     description: Optional[str] = ""
 
+    if PYDANTIC_VERSION_2:
+        from pydantic import ConfigDict
+
+        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
+
 
 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""

datahub/ingestion/source/hex/hex.py
@@ -69,7 +69,7 @@ class HexSourceConfig(
     )
     include_components: bool = Field(
         default=True,
-        desciption="Include Hex Components in the ingestion",
+        description="Include Hex Components in the ingestion",
     )
     page_size: int = Field(
         default=HEX_API_PAGE_SIZE_DEFAULT,

datahub/ingestion/source/iceberg/iceberg.py
@@ -524,11 +524,11 @@ class IcebergSource(StatefulIngestionSourceBase):
         custom_properties["format-version"] = str(table.metadata.format_version)
         custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
         last_modified: Optional[int] = table.metadata.last_updated_ms
-        if table.current_snapshot():
-            custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
-            custom_properties["manifest-list"] = table.current_snapshot().manifest_list
+        if current_snapshot := table.current_snapshot():
+            custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
+            custom_properties["manifest-list"] = current_snapshot.manifest_list
             if not last_modified:
-                last_modified = int(table.current_snapshot().timestamp_ms)
+                last_modified = int(current_snapshot.timestamp_ms)
         if "created-at" in custom_properties:
             try:
                 dt = dateutil_parser.isoparse(custom_properties["created-at"])

datahub/ingestion/source/powerbi_report_server/report_server_domain.py
@@ -27,10 +27,8 @@ class CatalogItem(BaseModel):
     is_favorite: bool = Field(alias="IsFavorite")
     user_info: Any = Field(None, alias="UserInfo")
    display_name: Optional[str] = Field(None, alias="DisplayName")
-    has_data_sources: bool = Field(default=False, alias="HasDataSources")
-    data_sources: Optional[List["DataSource"]] = Field(
-        default_factory=list, alias="DataSources"
-    )
+    has_data_sources: bool = Field(False, alias="HasDataSources")
+    data_sources: Optional[List["DataSource"]] = Field(None, alias="DataSources")
 
     @validator("display_name", always=True)
     def validate_diplay_name(cls, value, values):

datahub/ingestion/source/redshift/datashares.py
@@ -26,7 +26,7 @@ from datahub.utilities.search_utils import LogicalOperator
 
 class OutboundSharePlatformResource(BaseModel):
     namespace: str
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None
     env: str
     source_database: str
     share_name: str
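
This explicit `= None` default, like the identical changes in the Snowflake lineage models further down, matters for pydantic v2: v1 treats a bare `Optional[str]` annotation as optional with an implicit `None` default, while v2 treats it as a required field that merely accepts `None`. A minimal illustration, assuming pydantic v2 is installed:

from typing import Optional

from pydantic import BaseModel


class WithoutDefault(BaseModel):
    platform_instance: Optional[str]          # required under pydantic v2


class WithDefault(BaseModel):
    platform_instance: Optional[str] = None   # optional under both v1 and v2


print(WithDefault())          # fine on both versions
try:
    WithoutDefault()          # pydantic v2 raises ValidationError ("Field required")
except Exception as exc:
    print(exc)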

datahub/ingestion/source/slack/slack.py
@@ -203,38 +203,31 @@ class SlackSourceConfig(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email`, `users.profile:read`, and `team:read` scopes.",
     )
     enrich_user_metadata: bool = Field(
-        type=bool,
-        default=True,
+        True,
         description="When enabled, will enrich provisioned DataHub users' metadata with information from Slack.",
     )
     ingest_users: bool = Field(
-        type=bool,
-        default=True,
+        True,
         description="Whether to ingest users. When set to true, will ingest all users in the Slack workspace (as platform resources) to simplify user enrichment after they are provisioned on DataHub.",
     )
     api_requests_per_min: int = Field(
-        type=int,
-        default=10,
+        10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
     ingest_public_channels: bool = Field(
-        type=bool,
-        default=False,
+        False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
     channels_iteration_limit: int = Field(
-        type=int,
-        default=200,
+        200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
     channel_min_members: int = Field(
-        type=int,
-        default=2,
+        2,
         description="Ingest channels with at least this many members.",
     )
     should_ingest_archived_channels: bool = Field(
-        type=bool,
-        default=False,
+        False,
         description="Whether to ingest archived channels.",
     )
 
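
The Slack config cleanup above drops the `type=...` keyword, which `Field()` never uses to determine the field type (the annotation does that; unknown keywords are merely stashed as extra metadata in pydantic v1 and trigger deprecation warnings in v2), and passes the default positionally. Both spellings below are equivalent (illustrative model, not the real config class):

from pydantic import BaseModel, Field


class SlackishConfig(BaseModel):
    # The annotation defines the type; the default can be positional or keyword.
    ingest_users: bool = Field(True, description="Whether to ingest users.")
    api_requests_per_min: int = Field(
        default=10,
        description="Number of API requests per minute.",
    )


print(SlackishConfig())  # ingest_users=True api_requests_per_min=10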

datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -72,7 +72,7 @@ class ColumnUpstreamJob(BaseModel):
 
 
 class ColumnUpstreamLineage(BaseModel):
-    column_name: Optional[str]
+    column_name: Optional[str] = None
     upstreams: List[ColumnUpstreamJob] = Field(default_factory=list)
 
 
@@ -91,9 +91,9 @@ class Query(BaseModel):
 class UpstreamLineageEdge(BaseModel):
     DOWNSTREAM_TABLE_NAME: str
     DOWNSTREAM_TABLE_DOMAIN: str
-    UPSTREAM_TABLES: Optional[List[UpstreamTableNode]]
-    UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]]
-    QUERIES: Optional[List[Query]]
+    UPSTREAM_TABLES: Optional[List[UpstreamTableNode]] = None
+    UPSTREAM_COLUMNS: Optional[List[ColumnUpstreamLineage]] = None
+    QUERIES: Optional[List[Query]] = None
 
     _json_upstream_tables = pydantic_parse_json("UPSTREAM_TABLES")
     _json_upstream_columns = pydantic_parse_json("UPSTREAM_COLUMNS")

datahub/ingestion/source/tableau/tableau.py
@@ -1184,7 +1184,7 @@ class TableauSiteSource:
             self.report.warning(
                 title="Incomplete project hierarchy",
                 message="Project details missing. Child projects will be ingested without reference to their parent project. We generally need Site Administrator Explorer permissions to extract the complete project hierarchy.",
-                context=f"Missing {project.parent_id}, referenced by {project.id} {project.project_name}",
+                context=f"Missing {project.parent_id}, referenced by {project.id} {project.name}",
             )
             project.parent_id = None
 

datahub/ingestion/source/unity/config.py
@@ -8,7 +8,7 @@ import pydantic
 from pydantic import Field
 from typing_extensions import Literal
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigEnum, ConfigModel
 from datahub.configuration.source_common import (
     DatasetSourceConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -36,6 +36,12 @@ from datahub.utilities.global_warning_util import add_global_warning
 logger = logging.getLogger(__name__)
 
 
+class LineageDataSource(ConfigEnum):
+    AUTO = "AUTO"
+    SYSTEM_TABLES = "SYSTEM_TABLES"
+    API = "API"
+
+
 class UnityCatalogProfilerConfig(ConfigModel):
     method: str = Field(
         description=(
@@ -243,6 +249,21 @@ class UnityCatalogSourceConfig(
         description="Option to enable/disable lineage generation. Currently we have to call a rest call per column to get column level lineage due to the Databrick api which can slow down ingestion. ",
     )
 
+    lineage_data_source: LineageDataSource = pydantic.Field(
+        default=LineageDataSource.AUTO,
+        description=(
+            "Source for lineage data extraction. Options: "
+            f"'{LineageDataSource.AUTO.value}' - Use system tables when SQL warehouse is available, fallback to API; "
+            f"'{LineageDataSource.SYSTEM_TABLES.value}' - Force use of system.access.table_lineage and system.access.column_lineage tables (requires SQL warehouse); "
+            f"'{LineageDataSource.API.value}' - Force use of REST API endpoints for lineage data"
+        ),
+    )
+
+    ignore_start_time_lineage: bool = pydantic.Field(
+        default=False,
+        description="Option to ignore the start_time and retrieve all available lineage. When enabled, the start_time filter will be set to zero to extract all lineage events regardless of the configured time window.",
+    )
+
     column_lineage_column_limit: int = pydantic.Field(
         default=300,
         description="Limit the number of columns to get column level lineage. ",
@@ -362,6 +383,20 @@ class UnityCatalogSourceConfig(
 
         return values
 
+    @pydantic.root_validator(skip_on_failure=True)
+    def validate_lineage_data_source_with_warehouse(
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        lineage_data_source = values.get("lineage_data_source", LineageDataSource.AUTO)
+        warehouse_id = values.get("warehouse_id")
+
+        if lineage_data_source == LineageDataSource.SYSTEM_TABLES and not warehouse_id:
+            raise ValueError(
+                f"lineage_data_source='{LineageDataSource.SYSTEM_TABLES.value}' requires warehouse_id to be set"
+            )
+
+        return values
+
     @pydantic.validator("schema_pattern", always=True)
     def schema_pattern_should__always_deny_information_schema(
         cls, v: AllowDenyPattern
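
The new validator fails fast at config-parse time when `SYSTEM_TABLES` is selected without a `warehouse_id`, instead of failing later during lineage extraction. A standalone sketch of the same guard using simplified stand-in models (not the actual `UnityCatalogSourceConfig`):

from enum import Enum
from typing import Any, Dict, Optional

import pydantic


class LineageDataSource(str, Enum):
    AUTO = "AUTO"
    SYSTEM_TABLES = "SYSTEM_TABLES"
    API = "API"


class MiniUnityConfig(pydantic.BaseModel):
    warehouse_id: Optional[str] = None
    lineage_data_source: LineageDataSource = LineageDataSource.AUTO

    @pydantic.root_validator(skip_on_failure=True)
    def require_warehouse_for_system_tables(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        # SYSTEM_TABLES needs a SQL warehouse to query system.access.* tables.
        if (
            values.get("lineage_data_source") == LineageDataSource.SYSTEM_TABLES
            and not values.get("warehouse_id")
        ):
            raise ValueError("lineage_data_source='SYSTEM_TABLES' requires warehouse_id to be set")
        return values


print(MiniUnityConfig(lineage_data_source="AUTO"))      # ok, warehouse not required
try:
    MiniUnityConfig(lineage_data_source="SYSTEM_TABLES")
except pydantic.ValidationError as exc:
    print(exc)                                           # rejected: warehouse_id missing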