acryl-datahub 0.15.0.5rc9__py3-none-any.whl → 0.15.0.5rc10__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (33)
  1. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/METADATA +2491 -2491
  2. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/RECORD +33 -32
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/graph/client.py +2 -1
  5. datahub/ingestion/graph/entity_versioning.py +201 -0
  6. datahub/ingestion/source/abs/report.py +2 -2
  7. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  8. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  9. datahub/ingestion/source/delta_lake/report.py +2 -2
  10. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  11. datahub/ingestion/source/elastic_search.py +2 -1
  12. datahub/ingestion/source/ge_profiling_config.py +11 -7
  13. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  14. datahub/ingestion/source/identity/okta.py +2 -1
  15. datahub/ingestion/source/kafka/kafka.py +2 -1
  16. datahub/ingestion/source/kafka_connect/common.py +2 -1
  17. datahub/ingestion/source/ldap.py +2 -1
  18. datahub/ingestion/source/looker/lookml_config.py +9 -5
  19. datahub/ingestion/source/mongodb.py +2 -1
  20. datahub/ingestion/source/nifi.py +2 -1
  21. datahub/ingestion/source/powerbi/config.py +2 -2
  22. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  23. datahub/ingestion/source/redash.py +5 -5
  24. datahub/ingestion/source/salesforce.py +4 -1
  25. datahub/ingestion/source/snowflake/snowflake_report.py +2 -1
  26. datahub/ingestion/source/tableau/tableau.py +2 -1
  27. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  28. datahub/ingestion/source/unity/report.py +1 -0
  29. datahub/ingestion/source_report/pulsar.py +5 -4
  30. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/LICENSE +0 -0
  31. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/WHEEL +0 -0
  32. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/entry_points.txt +0 -0
  33. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.5rc10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/kafka_connect/common.py +2 -1

@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1
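
Most of the report changes in this release follow one pattern: fields that previously accumulated unbounded List, Set, or Dict values switch to the bounded collections in datahub.utilities.lossy_collections, so a large ingestion run cannot grow the report without limit. The following is only a minimal sketch of that idea under assumed names (BoundedList, max_elements); it is not DataHub's actual LossyList implementation.

# Illustrative sketch, not datahub.utilities.lossy_collections.LossyList:
# keep at most `max_elements` sampled entries and count everything else.
from typing import Generic, Iterator, List, TypeVar

T = TypeVar("T")


class BoundedList(Generic[T]):
    def __init__(self, max_elements: int = 20) -> None:
        self.max_elements = max_elements
        self._sample: List[T] = []
        self.total_elements = 0

    def append(self, item: T) -> None:
        self.total_elements += 1
        if len(self._sample) < self.max_elements:
            self._sample.append(item)  # keep only a bounded sample

    def __iter__(self) -> Iterator[T]:
        return iter(self._sample)

    def __repr__(self) -> str:
        if self.total_elements <= self.max_elements:
            return repr(self._sample)
        return f"{self._sample!r} (sampled from {self.total_elements} items)"

A field declared as filtered: BoundedList[str] = field(default_factory=BoundedList) then behaves like the plain list it replaces for appends, but its rendered form stays small.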
datahub/ingestion/source/ldap.py +2 -1

@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns: List[str] = dataclasses.field(default_factory=list)
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)

     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)
datahub/ingestion/source/looker/lookml_config.py +9 -5

@@ -1,7 +1,7 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
 from datetime import timedelta
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, Literal, Optional, Union

 import pydantic
 from pydantic import root_validator, validator
@@ -48,13 +48,17 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
     models_discovered: int = 0
-    models_dropped: List[str] = dataclass_field(default_factory=LossyList)
+    models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
-    views_dropped: List[str] = dataclass_field(default_factory=LossyList)
-    views_dropped_unreachable: List[str] = dataclass_field(default_factory=LossyList)
+    views_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
+    views_dropped_unreachable: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     query_parse_attempts: int = 0
     query_parse_failures: int = 0
-    query_parse_failure_views: List[str] = dataclass_field(default_factory=LossyList)
+    query_parse_failure_views: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     _looker_api: Optional[LookerAPI] = None

     def report_models_scanned(self) -> None:
datahub/ingestion/source/mongodb.py +2 -1

@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     UnionTypeClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -143,7 +144,7 @@ class MongoDBConfig(

 @dataclass
 class MongoDBSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/nifi.py +2 -1

@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
 )
 from datahub.specific.datajob import DataJobPatchBuilder
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)
 NIFI = "nifi"
@@ -452,7 +453,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:

 @dataclass
 class NifiSourceReport(SourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/powerbi/config.py +2 -2

@@ -195,8 +195,8 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):

     dashboards_scanned: int = 0
     charts_scanned: int = 0
-    filtered_dashboards: List[str] = dataclass_field(default_factory=list)
-    filtered_charts: List[str] = dataclass_field(default_factory=list)
+    filtered_dashboards: LossyList[str] = dataclass_field(default_factory=LossyList)
+    filtered_charts: LossyList[str] = dataclass_field(default_factory=LossyList)

     m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
     m_query_parse_attempts: int = 0
datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1

@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     StatusClass,
 )
 from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.lossy_collections import LossyList

 LOGGER = logging.getLogger(__name__)

@@ -476,7 +477,7 @@ class Mapper:
 @dataclass
 class PowerBiReportServerDashboardSourceReport(SourceReport):
     scanned_report: int = 0
-    filtered_reports: List[str] = dataclass_field(default_factory=list)
+    filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_scanned(self, count: int = 1) -> None:
         self.scanned_report += count
datahub/ingestion/source/redash.py +5 -5

@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set
+from typing import Dict, Iterable, List, Optional

 import dateutil.parser as dp
 from packaging import version
@@ -39,7 +39,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

@@ -280,9 +280,9 @@ class RedashConfig(ConfigModel):
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
-    queries_problem_parsing: Set[str] = field(default_factory=set)
-    queries_no_dataset: Set[str] = field(default_factory=set)
-    charts_no_input: Set[str] = field(default_factory=set)
+    queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
+    queries_no_dataset: LossySet[str] = field(default_factory=LossySet)
+    charts_no_input: LossySet[str] = field(default_factory=LossySet)
     total_queries: Optional[int] = field(
         default=None,
     )
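
The redash.py hunks apply the same treatment to set-valued fields: LossySet keeps set semantics (de-duplication, membership) but caps how many distinct members it retains. Again a rough illustration under assumed names (BoundedSet, dropped), not the library's real class.

# Illustrative sketch, not datahub's LossySet: set semantics with a cap on
# how many distinct members are kept; overflow is only counted.
from typing import Set


class BoundedSet:
    def __init__(self, max_elements: int = 20) -> None:
        self.max_elements = max_elements
        self._members: Set[str] = set()
        self.dropped = 0

    def add(self, item: str) -> None:
        if item in self._members or len(self._members) < self.max_elements:
            self._members.add(item)
        else:
            self.dropped += 1  # distinct items beyond the cap are just counted

    def __contains__(self, item: str) -> bool:
        return item in self._members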
datahub/ingestion/source/salesforce.py +4 -1

@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional
@@ -60,6 +61,7 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -146,8 +148,9 @@ class SalesforceConfig(DatasetSourceConfigMixin):
         return config_clean.remove_trailing_slashes(v)


+@dataclass
 class SalesforceSourceReport(SourceReport):
-    filtered: List[str] = []
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
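
The salesforce.py change is more than a type swap. `filtered: List[str] = []` on a plain (non-dataclass) class is a class attribute, so every SalesforceSourceReport instance appended into the same shared list; adding @dataclass together with a default_factory gives each instance its own collection. A minimal sketch of the pitfall, using hypothetical report classes for illustration:

from dataclasses import dataclass, field
from typing import List


class BuggyReport:
    filtered: List[str] = []  # class attribute: shared by every instance


@dataclass
class FixedReport:
    filtered: List[str] = field(default_factory=list)  # fresh list per instance


a, b = BuggyReport(), BuggyReport()
a.filtered.append("Account")
print(b.filtered)  # ['Account'] -- the drop leaked into an unrelated report

c, d = FixedReport(), FixedReport()
c.filtered.append("Account")
print(d.filtered)  # [] -- isolated, as intended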
datahub/ingestion/source/snowflake/snowflake_report.py +2 -1

@@ -12,6 +12,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.lossy_collections import LossyDict
 from datahub.utilities.perf_timer import PerfTimer

 if TYPE_CHECKING:
@@ -66,7 +67,7 @@ class SnowflakeReport(SQLSourceReport, BaseTimeWindowReport):
     num_external_table_edges_scanned: int = 0
     ignore_start_time_lineage: Optional[bool] = None
     upstream_lineage_in_report: Optional[bool] = None
-    upstream_lineage: Dict[str, List[str]] = field(default_factory=dict)
+    upstream_lineage: LossyDict[str, List[str]] = field(default_factory=LossyDict)

     lineage_start_time: Optional[datetime] = None
     lineage_end_time: Optional[datetime] = None
datahub/ingestion/source/tableau/tableau.py +2 -1

@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)

     last_authenticated_at: Optional[datetime] = None

datahub/ingestion/source/unity/ge_profiler.py +55 -4

@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional

+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection

@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA


 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only

         dataset_name = table.ref.qualified_table_name
-        try:
-            table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
-        except Exception as e:
-            logger.warning(f"Failed to get table size for {dataset_name}: {e}")
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )

         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None

+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+        if table.rows_count is None:
+            self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@
             return int(row._asdict()["sizeInBytes"])
         except Exception:
             return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
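
The new Unity Catalog profiler helpers rely on two Databricks SQL statements: DESCRIBE DETAIL, which works only for Delta tables and returns table metadata including sizeInBytes, and a plain count(*), which the in-diff comment notes is efficient for Delta tables. A hedged usage sketch follows; the connection URL shape and the fully qualified table name are placeholders I am assuming, not values taken from this release.

# Hedged sketch: assumes a SQLAlchemy engine backed by the Databricks dialect.
# All identifiers and the URL below are illustrative placeholders; plain-string
# execute() mirrors the helper functions above (SQLAlchemy 1.x style).
from sqlalchemy import create_engine

engine = create_engine(
    "databricks://token:<personal-access-token>@<workspace-host>"
    "?http_path=<warehouse-http-path>&catalog=main&schema=default"
)

with engine.connect() as conn:
    # Size lookup: DESCRIBE DETAIL is Delta-only, hence the is_delta_table guard above.
    detail = conn.execute("DESCRIBE DETAIL main.default.my_table").fetchone()
    size_in_bytes = int(detail._asdict()["sizeInBytes"]) if detail else None

    # Row count via count(*), which Delta handles efficiently.
    row = conn.execute("select count(*) as numRows from main.default.my_table").fetchone()
    rows_count = int(row._asdict()["numRows"]) if row else None

    print(size_in_bytes, rows_count)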
datahub/ingestion/source/unity/report.py +1 -0

@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0

datahub/ingestion/source_report/pulsar.py +5 -4

@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional

 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList


 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered: List[str] = field(default_factory=list)
-    namespaces_filtered: List[str] = field(default_factory=list)
-    topics_filtered: List[str] = field(default_factory=list)
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)

     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version