acryl-datahub 0.15.0.4rc2__py3-none-any.whl → 0.15.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (96)
  1. acryl_datahub-0.15.0.5.dist-info/LICENSE +202 -0
  2. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2444 -2404
  3. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +96 -86
  4. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
  5. datahub/__init__.py +1 -25
  6. datahub/_version.py +13 -0
  7. datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
  8. datahub/cli/check_cli.py +1 -1
  9. datahub/cli/cli_utils.py +3 -3
  10. datahub/cli/container_cli.py +1 -64
  11. datahub/cli/iceberg_cli.py +707 -0
  12. datahub/cli/ingest_cli.py +2 -2
  13. datahub/emitter/composite_emitter.py +36 -0
  14. datahub/emitter/rest_emitter.py +1 -1
  15. datahub/entrypoints.py +26 -5
  16. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  17. datahub/ingestion/api/registry.py +4 -2
  18. datahub/ingestion/glossary/classification_mixin.py +6 -0
  19. datahub/ingestion/glossary/classifier.py +3 -2
  20. datahub/ingestion/graph/client.py +2 -1
  21. datahub/ingestion/graph/entity_versioning.py +201 -0
  22. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  23. datahub/ingestion/run/connection.py +1 -1
  24. datahub/ingestion/run/pipeline.py +3 -3
  25. datahub/ingestion/source/abs/report.py +2 -2
  26. datahub/ingestion/source/apply/__init__.py +0 -0
  27. datahub/ingestion/source/apply/datahub_apply.py +223 -0
  28. datahub/ingestion/source/aws/glue.py +15 -6
  29. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  30. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  31. datahub/ingestion/source/dbt/dbt_core.py +1 -1
  32. datahub/ingestion/source/delta_lake/report.py +2 -2
  33. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  34. datahub/ingestion/source/elastic_search.py +2 -1
  35. datahub/ingestion/source/ge_profiling_config.py +11 -7
  36. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  37. datahub/ingestion/source/identity/azure_ad.py +6 -14
  38. datahub/ingestion/source/identity/okta.py +2 -1
  39. datahub/ingestion/source/kafka/kafka.py +2 -1
  40. datahub/ingestion/source/kafka_connect/common.py +2 -1
  41. datahub/ingestion/source/ldap.py +2 -1
  42. datahub/ingestion/source/looker/looker_config.py +3 -1
  43. datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
  44. datahub/ingestion/source/looker/looker_file_loader.py +14 -3
  45. datahub/ingestion/source/looker/looker_template_language.py +104 -14
  46. datahub/ingestion/source/looker/lookml_config.py +29 -8
  47. datahub/ingestion/source/looker/lookml_source.py +110 -22
  48. datahub/ingestion/source/mode.py +2 -4
  49. datahub/ingestion/source/mongodb.py +2 -1
  50. datahub/ingestion/source/nifi.py +2 -1
  51. datahub/ingestion/source/powerbi/config.py +2 -2
  52. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  53. datahub/ingestion/source/redash.py +5 -5
  54. datahub/ingestion/source/salesforce.py +4 -1
  55. datahub/ingestion/source/slack/slack.py +6 -0
  56. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  57. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  58. datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
  59. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
  61. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  62. datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
  63. datahub/ingestion/source/sql/clickhouse.py +5 -43
  64. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  65. datahub/ingestion/source/sql/mssql/source.py +17 -0
  66. datahub/ingestion/source/sql/sql_config.py +0 -10
  67. datahub/ingestion/source/tableau/tableau.py +16 -13
  68. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  69. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  70. datahub/ingestion/source/unity/proxy.py +2 -2
  71. datahub/ingestion/source/unity/report.py +1 -0
  72. datahub/ingestion/source_config/operation_config.py +9 -0
  73. datahub/ingestion/source_report/pulsar.py +5 -4
  74. datahub/metadata/_schema_classes.py +304 -6
  75. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  78. datahub/metadata/schema.avsc +211 -12
  79. datahub/metadata/schemas/AssertionInfo.avsc +2 -2
  80. datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
  81. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  82. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  83. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  84. datahub/metadata/schemas/Deprecation.avsc +12 -0
  85. datahub/metadata/schemas/DisplayProperties.avsc +62 -0
  86. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  87. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
  89. datahub/metadata/schemas/PostInfo.avsc +28 -2
  90. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  91. datahub/specific/dashboard.py +43 -1
  92. datahub/telemetry/telemetry.py +4 -4
  93. datahub/testing/check_imports.py +28 -0
  94. datahub/upgrade/upgrade.py +17 -9
  95. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
  96. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_config.py

@@ -2,8 +2,6 @@ import logging
 from abc import abstractmethod
 from typing import Any, Dict, Optional

-import cachetools
-import cachetools.keys
 import pydantic
 from pydantic import Field
 from sqlalchemy.engine import URL
@@ -29,7 +27,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
 from datahub.ingestion.source_config.operation_config import is_profiling_enabled
-from datahub.utilities.cachetools_keys import self_methodkey

 logger: logging.Logger = logging.getLogger(__name__)

@@ -118,13 +115,6 @@ class SQLCommonConfig(
     # Custom Stateful Ingestion settings
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None

-    # TRICKY: The operation_config is time-dependent. Because we don't want to change
-    # whether or not we're running profiling mid-ingestion, we cache the result of this method.
-    # TODO: This decorator should be moved to the is_profiling_enabled(operation_config) method.
-    @cachetools.cached(
-        cache=cachetools.LRUCache(maxsize=1),
-        key=self_methodkey,
-    )
     def is_profiling_enabled(self) -> bool:
         return self.profiling.enabled and is_profiling_enabled(
             self.profiling.operation_config
datahub/ingestion/source/tableau/tableau.py

@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)

     last_authenticated_at: Optional[datetime] = None

@@ -2428,10 +2429,12 @@ class TableauSiteSource:
             ]
         ],
     ) -> Optional["SqlParsingResult"]:
-        database_info = datasource.get(c.DATABASE) or {
-            c.NAME: c.UNKNOWN.lower(),
-            c.CONNECTION_TYPE: datasource.get(c.CONNECTION_TYPE),
-        }
+        database_field = datasource.get(c.DATABASE) or {}
+        database_id: Optional[str] = database_field.get(c.ID)
+        database_name: Optional[str] = database_field.get(c.NAME) or c.UNKNOWN.lower()
+        database_connection_type: Optional[str] = database_field.get(
+            c.CONNECTION_TYPE
+        ) or datasource.get(c.CONNECTION_TYPE)

         if (
             datasource.get(c.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False)
@@ -2440,10 +2443,7 @@ class TableauSiteSource:
             logger.debug(f"datasource {datasource_urn} is not created from custom sql")
             return None

-        if (
-            database_info.get(c.NAME) is None
-            or database_info.get(c.CONNECTION_TYPE) is None
-        ):
+        if database_connection_type is None:
             logger.debug(
                 f"database information is missing from datasource {datasource_urn}"
             )
@@ -2459,14 +2459,14 @@ class TableauSiteSource:

         logger.debug(f"Parsing sql={query}")

-        upstream_db = database_info.get(c.NAME)
+        upstream_db = database_name

         if func_overridden_info is not None:
             # Override the information as per configuration
             upstream_db, platform_instance, platform, _ = func_overridden_info(
-                database_info[c.CONNECTION_TYPE],
-                database_info.get(c.NAME),
-                database_info.get(c.ID),
+                database_connection_type,
+                database_name,
+                database_id,
                 self.config.platform_instance_map,
                 self.config.lineage_overrides,
                 self.config.database_hostname_to_platform_instance_map,
@@ -2534,6 +2534,9 @@ class TableauSiteSource:
             platform_instance=self.config.platform_instance,
             func_overridden_info=get_overridden_info,
         )
+        logger.debug(
+            f"_create_lineage_from_unsupported_csql parsed_result = {parsed_result}"
+        )

         if parsed_result is None:
             return
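The custom-SQL lineage refactor above replaces the ad-hoc database_info dict with three explicit values, each with its own fallback: the name falls back to the lowercased unknown constant and the connection type falls back to the datasource-level value, so parsing is now only skipped when no connection type can be found at all. A minimal sketch of that fallback pattern with hypothetical payloads (plain string keys stand in for the c.* constants used in tableau.py):

from typing import Optional, Tuple


def split_database_fields(datasource: dict) -> Tuple[Optional[str], str, Optional[str]]:
    # Illustrative only: "database", "id", "name", "connectionType" mirror the
    # c.DATABASE / c.ID / c.NAME / c.CONNECTION_TYPE constants.
    database_field = datasource.get("database") or {}
    database_id = database_field.get("id")
    database_name = database_field.get("name") or "unknown"
    database_connection_type = database_field.get("connectionType") or datasource.get(
        "connectionType"
    )
    return database_id, database_name, database_connection_type


# A datasource with an embedded database object vs. one that only carries a
# datasource-level connection type (both payload shapes are illustrative).
print(split_database_fields(
    {"database": {"id": "db-1", "name": "analytics", "connectionType": "snowflake"}}
))  # ('db-1', 'analytics', 'snowflake')
print(split_database_fields({"connectionType": "snowflake"}))  # (None, 'unknown', 'snowflake')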
datahub/ingestion/source/tableau/tableau_common.py

@@ -761,7 +761,7 @@ class TableauUpstreamReference:


 def get_overridden_info(
-    connection_type: Optional[str],
+    connection_type: str,
     upstream_db: Optional[str],
     upstream_db_id: Optional[str],
     platform_instance_map: Optional[Dict[str, str]],
datahub/ingestion/source/unity/ge_profiler.py

@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional

+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection

@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA


 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only

         dataset_name = table.ref.qualified_table_name
-        try:
-            table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
-        except Exception as e:
-            logger.warning(f"Failed to get table size for {dataset_name}: {e}")
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )

         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@
             self.report.report_dropped(dataset_name)
             return None

+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
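Both helpers above lean on Delta-specific behaviour: DESCRIBE DETAIL returns a single row whose sizeInBytes column gives the table size, and count(*) on a Delta table can typically be answered cheaply from transaction-log statistics. A rough standalone sketch combining the two lookups, assuming a SQLAlchemy connection to a Databricks SQL warehouse and a hypothetical fully qualified table name:

from typing import Optional, Tuple

from sqlalchemy import create_engine
from sqlalchemy.engine import Connection


def delta_table_stats(conn: Connection, name: str) -> Tuple[Optional[int], Optional[int]]:
    # DESCRIBE DETAIL is Delta-only; its result row includes a sizeInBytes column.
    detail = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
    size = int(detail._asdict()["sizeInBytes"]) if detail is not None else None

    # count(*) is inexpensive on Delta tables, which is why the profiler only
    # issues it when profile_table_level_only is set and the table is Delta.
    counted = conn.execute(f"select count(*) as numRows from {name}").fetchone()
    rows = int(counted._asdict()["numRows"]) if counted is not None else None
    return size, rows


# Hypothetical usage (the URL shape depends on the Databricks SQLAlchemy dialect):
# engine = create_engine("databricks://token:<pat>@<host>?http_path=<warehouse-http-path>")
# with engine.connect() as conn:
#     print(delta_table_stats(conn, "main.default.my_table"))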
datahub/ingestion/source/unity/proxy.py

@@ -26,7 +26,7 @@ from databricks.sdk.service.sql import (
 )
 from databricks.sdk.service.workspace import ObjectType

-import datahub
+from datahub._version import nice_version_name
 from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
@@ -103,7 +103,7 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
             host=workspace_url,
             token=personal_access_token,
             product="datahub",
-            product_version=datahub.nice_version_name(),
+            product_version=nice_version_name(),
         )
         self.warehouse_id = warehouse_id or ""
         self.report = report
datahub/ingestion/source/unity/report.py

@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0

datahub/ingestion/source_config/operation_config.py

@@ -2,10 +2,12 @@ import datetime
 import logging
 from typing import Any, Dict, Optional

+import cachetools
 import pydantic
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
+from datahub.utilities.cachetools_keys import self_methodkey

 logger = logging.getLogger(__name__)

@@ -62,6 +64,13 @@ class OperationConfig(ConfigModel):
         return profile_date_of_month


+# TRICKY: The operation_config is time-dependent. Because we don't want to change
+# whether or not we're running profiling mid-ingestion, we cache the result of this method.
+# An additional benefit is that we only print the log lines on the first call.
+@cachetools.cached(
+    cache=cachetools.LRUCache(maxsize=1),
+    key=self_methodkey,
+)
 def is_profiling_enabled(operation_config: OperationConfig) -> bool:
     if operation_config.lower_freq_profile_enabled is False:
         return True
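The decorator added here is the same LRU-cache pattern that sql_config.py drops above: the time-dependent profiling decision is computed once per config object and reused for the rest of the ingestion run. A minimal sketch of how cachetools.cached behaves with a key function keyed on the first argument (first_arg_key is a hypothetical stand-in for datahub's self_methodkey helper):

import cachetools
import cachetools.keys


def first_arg_key(first, *args, **kwargs):
    # Hypothetical stand-in for datahub.utilities.cachetools_keys.self_methodkey:
    # build the cache key purely from the identity of the first positional argument.
    return cachetools.keys.hashkey(id(first))


@cachetools.cached(cache=cachetools.LRUCache(maxsize=1), key=first_arg_key)
def is_profiling_enabled(operation_config) -> bool:
    print("evaluating profiling schedule")  # printed only on the first call
    return True  # stands in for the real date-based check


class FakeConfig:
    pass


config = FakeConfig()
is_profiling_enabled(config)  # computes and caches the decision
is_profiling_enabled(config)  # served from the LRU cache; no second log line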
datahub/ingestion/source_report/pulsar.py

@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional

 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList


 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered: List[str] = field(default_factory=list)
-    namespaces_filtered: List[str] = field(default_factory=list)
-    topics_filtered: List[str] = field(default_factory=list)
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)

     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version
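The Tableau report (logged_in_user) and the Pulsar report above make the same swap from plain lists to LossyList. A rough usage sketch, assuming LossyList acts as a drop-in list replacement that keeps only a bounded sample of appended items so large ingestion reports stay serialisable (the exact truncation behaviour is defined in datahub.utilities.lossy_collections):

from dataclasses import dataclass, field

from datahub.utilities.lossy_collections import LossyList


@dataclass
class ExampleSourceReport:
    # Hypothetical report: with LossyList, filtering millions of topics no longer
    # bloats the report object the way a plain list would.
    topics_filtered: LossyList[str] = field(default_factory=LossyList)


report = ExampleSourceReport()
for i in range(1_000_000):
    report.topics_filtered.append(f"persistent://tenant/ns/topic-{i}")

# Rendering the field is expected to yield a truncated sample plus an indication
# of how many items were dropped, rather than the full million entries.
print(report.topics_filtered)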