acryl-datahub 0.15.0.5rc9-py3-none-any.whl → 0.15.0.6rc1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (46)
  1. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/METADATA +2431 -2431
  2. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/RECORD +46 -45
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/graph/client.py +2 -1
  5. datahub/ingestion/graph/entity_versioning.py +201 -0
  6. datahub/ingestion/source/abs/report.py +2 -2
  7. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  8. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/delta_lake/report.py +2 -2
  11. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  12. datahub/ingestion/source/elastic_search.py +2 -1
  13. datahub/ingestion/source/ge_profiling_config.py +11 -7
  14. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  15. datahub/ingestion/source/identity/okta.py +2 -1
  16. datahub/ingestion/source/kafka/kafka.py +2 -1
  17. datahub/ingestion/source/kafka_connect/common.py +2 -1
  18. datahub/ingestion/source/ldap.py +2 -1
  19. datahub/ingestion/source/looker/lookml_config.py +9 -5
  20. datahub/ingestion/source/mongodb.py +2 -1
  21. datahub/ingestion/source/nifi.py +2 -1
  22. datahub/ingestion/source/powerbi/config.py +3 -2
  23. datahub/ingestion/source/powerbi/powerbi.py +28 -3
  24. datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
  25. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
  26. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
  27. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  28. datahub/ingestion/source/redash.py +5 -5
  29. datahub/ingestion/source/salesforce.py +4 -1
  30. datahub/ingestion/source/snowflake/constants.py +1 -0
  31. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
  33. datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
  34. datahub/ingestion/source/snowflake/snowflake_report.py +8 -1
  35. datahub/ingestion/source/snowflake/snowflake_schema.py +98 -4
  36. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +294 -62
  37. datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
  38. datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
  39. datahub/ingestion/source/tableau/tableau.py +2 -1
  40. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  41. datahub/ingestion/source/unity/report.py +1 -0
  42. datahub/ingestion/source_report/pulsar.py +5 -4
  43. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/LICENSE +0 -0
  44. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/WHEEL +0 -0
  45. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/entry_points.txt +0 -0
  46. {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -124,19 +124,20 @@ class SnowflakeFilter:
             SnowflakeObjectDomain.VIEW,
             SnowflakeObjectDomain.MATERIALIZED_VIEW,
             SnowflakeObjectDomain.ICEBERG_TABLE,
+            SnowflakeObjectDomain.STREAM,
         ):
             return False
         if _is_sys_table(dataset_name):
             return False

-        dataset_params = _split_qualified_name(dataset_name)
+        dataset_params = split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
                 message=f"Found a {dataset_type} with an unexpected number of parts. Database and schema filtering will not work as expected, but table filtering will still work.",
                 context=dataset_name,
             )
-            # We fall-through here so table/view filtering still works.
+            # We fall-through here so table/view/stream filtering still works.

         if (
             len(dataset_params) >= 1
@@ -169,6 +170,14 @@ class SnowflakeFilter:
         ):
             return False

+        if (
+            dataset_type.lower() == SnowflakeObjectDomain.STREAM
+            and not self.filter_config.stream_pattern.allowed(
+                _cleanup_qualified_name(dataset_name, self.structured_reporter)
+            )
+        ):
+            return False
+
         return True

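The new stream check reuses the same pattern machinery as the existing table/view filters: the stream's qualified name is cleaned of quotes and then matched against the source's stream_pattern filter config (added in snowflake_config.py in this release). A minimal sketch of that matching with DataHub's AllowDenyPattern, using a hypothetical allow rule:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical filter: only ingest streams under ANALYTICS.PUBLIC.
stream_pattern = AllowDenyPattern(allow=[r"analytics\.public\..*"])

# Names are matched after quote cleanup, e.g. '"DB"."SCHEMA"."STREAM"' -> 'DB.SCHEMA.STREAM'.
for name in ["analytics.public.orders_stream", "raw.staging.events_stream"]:
    print(name, stream_pattern.allowed(name))
# analytics.public.orders_stream True
# raw.staging.events_stream False
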
@@ -183,17 +192,17 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")


-def _split_qualified_name(qualified_name: str) -> List[str]:
+def split_qualified_name(qualified_name: str) -> List[str]:
     """
     Split a qualified name into its constituent parts.

-    >>> _split_qualified_name("db.my_schema.my_table")
+    >>> split_qualified_name("db.my_schema.my_table")
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    >>> split_qualified_name('"db"."my_schema"."my_table"')
     ['db', 'my_schema', 'my_table']
-    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    >>> split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
     ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
-    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    >>> split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
     ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
     """

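Only the renamed helper's signature and docstring appear in this hunk; its body is unchanged and not shown. As a rough, standalone illustration of the behaviour the doctests describe (double-quoted segments may contain dots and are returned unquoted), here is a sketch rather than the package's actual implementation:

from typing import List


def split_qualified_name_sketch(qualified_name: str) -> List[str]:
    """Split db.schema.object on dots, treating double-quoted segments as atomic."""
    parts: List[str] = []
    current = ""
    in_quotes = False
    for ch in qualified_name:
        if ch == '"':
            in_quotes = not in_quotes  # quotes delimit a segment and are dropped
        elif ch == "." and not in_quotes:
            parts.append(current)
            current = ""
        else:
            current += ch
    parts.append(current)
    return parts


assert split_qualified_name_sketch('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE') == [
    "TEST_DB",
    "SCHEMA.WITH.DOTS",
    "MY_TABLE",
]
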
@@ -231,7 +240,7 @@ def _split_qualified_name(qualified_name: str) -> List[str]:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = _split_qualified_name(qualified_name)
+    name_parts = split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -539,15 +539,27 @@ class SnowflakeV2Source(
             for schema in db.schemas
             for table_name in schema.views
         ]
+        discovered_streams: List[str] = [
+            self.identifiers.get_dataset_identifier(stream_name, schema.name, db.name)
+            for db in databases
+            for schema in db.schemas
+            for stream_name in schema.streams
+        ]

-        if len(discovered_tables) == 0 and len(discovered_views) == 0:
+        if (
+            len(discovered_tables) == 0
+            and len(discovered_views) == 0
+            and len(discovered_streams) == 0
+        ):
             self.structured_reporter.failure(
                 GENERIC_PERMISSION_ERROR_KEY,
-                "No tables/views found. Please check permissions.",
+                "No tables/views/streams found. Please check permissions.",
             )
             return

-        self.discovered_datasets = discovered_tables + discovered_views
+        self.discovered_datasets = (
+            discovered_tables + discovered_views + discovered_streams
+        )

         if self.config.use_queries_v2:
             with self.report.new_stage(f"*: {VIEW_PARSING}"):

datahub/ingestion/source/tableau/tableau.py
@@ -170,6 +170,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -798,7 +799,7 @@ class TableauSourceReport(
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
-    logged_in_user: List[UserInfo] = dataclass_field(default_factory=list)
+    logged_in_user: LossyList[UserInfo] = dataclass_field(default_factory=LossyList)

     last_authenticated_at: Optional[datetime] = None

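TableauSourceReport.logged_in_user (and the PulsarSourceReport fields further down) switches from a plain list to LossyList, so report fields keep a bounded sample instead of growing without limit. A minimal stand-in sketch of the idea, assuming a simple cutoff rather than the library's actual sampling behaviour:

from typing import Generic, List, TypeVar

T = TypeVar("T")


class BoundedSampleList(Generic[T]):
    """Illustrative stand-in for LossyList: keep at most N items, count the rest."""

    def __init__(self, max_elements: int = 10) -> None:
        self.max_elements = max_elements
        self.items: List[T] = []
        self.dropped = 0

    def append(self, item: T) -> None:
        if len(self.items) < self.max_elements:
            self.items.append(item)
        else:
            # The real LossyList keeps a sample of later items, not just the first N.
            self.dropped += 1

    def __repr__(self) -> str:
        suffix = f" ... ({self.dropped} more not shown)" if self.dropped else ""
        return f"{self.items}{suffix}"


users: BoundedSampleList[str] = BoundedSampleList(max_elements=2)
for name in ["alice", "bob", "carol", "dave"]:
    users.append(name)
print(users)  # ['alice', 'bob'] ... (2 more not shown)
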

datahub/ingestion/source/unity/ge_profiler.py
@@ -3,6 +3,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from typing import Iterable, List, Optional

+from databricks.sdk.service.catalog import DataSourceFormat
 from sqlalchemy import create_engine
 from sqlalchemy.engine import Connection

@@ -34,6 +35,11 @@ class UnityCatalogSQLGenericTable(BaseTable):
         self.size_in_bytes = None
         self.rows_count = None
         self.ddl = None
+        self.data_source_format = table.data_source_format
+
+    @property
+    def is_delta_table(self) -> bool:
+        return self.data_source_format == DataSourceFormat.DELTA


 class UnityCatalogGEProfiler(GenericProfiler):
@@ -110,13 +116,20 @@ class UnityCatalogGEProfiler(GenericProfiler):
         profile_table_level_only = self.profiling_config.profile_table_level_only

         dataset_name = table.ref.qualified_table_name
-        try:
-            table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
-        except Exception as e:
-            logger.warning(f"Failed to get table size for {dataset_name}: {e}")
+        if table.is_delta_table:
+            try:
+                table.size_in_bytes = _get_dataset_size_in_bytes(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table size",
+                    context=dataset_name,
+                    exc=e,
+                )

         if table.size_in_bytes is None:
             self.report.num_profile_missing_size_in_bytes += 1
+
         if not self.is_dataset_eligible_for_profiling(
             dataset_name,
             size_in_bytes=table.size_in_bytes,
@@ -143,6 +156,23 @@ class UnityCatalogGEProfiler(GenericProfiler):
             self.report.report_dropped(dataset_name)
             return None

+        if profile_table_level_only and table.is_delta_table:
+            # For requests with profile_table_level_only set, dataset profile is generated
+            # by looking at table.rows_count. For delta tables (a typical databricks table)
+            # count(*) is an efficient query to compute row count.
+            try:
+                table.rows_count = _get_dataset_row_count(table, conn)
+            except Exception as e:
+                self.report.warning(
+                    title="Incomplete Dataset Profile",
+                    message="Failed to get table row count",
+                    context=dataset_name,
+                    exc=e,
+                )
+
+            if table.rows_count is None:
+                self.report.num_profile_missing_row_count += 1
+
         self.report.report_entity_profiled(dataset_name)
         logger.debug(f"Preparing profiling request for {dataset_name}")
         return TableProfilerRequest(
@@ -160,6 +190,9 @@ def _get_dataset_size_in_bytes(
         conn.dialect.identifier_preparer.quote(c)
         for c in [table.ref.catalog, table.ref.schema, table.ref.table]
     )
+    # This query only works for delta table.
+    # Ref: https://docs.databricks.com/en/delta/table-details.html
+    # Note: Any change here should also update _get_dataset_row_count
     row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
     if row is None:
         return None
@@ -168,3 +201,21 @@ def _get_dataset_size_in_bytes(
         return int(row._asdict()["sizeInBytes"])
     except Exception:
         return None
+
+
+def _get_dataset_row_count(
+    table: UnityCatalogSQLGenericTable, conn: Connection
+) -> Optional[int]:
+    name = ".".join(
+        conn.dialect.identifier_preparer.quote(c)
+        for c in [table.ref.catalog, table.ref.schema, table.ref.table]
+    )
+    # This query only works efficiently for delta table
+    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
+    if row is None:
+        return None
+    else:
+        try:
+            return int(row._asdict()["numRows"])
+        except Exception:
+            return None
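Together with _get_dataset_size_in_bytes above, the profiler now has both lookups it needs for a table-level-only profile of a Delta table: DESCRIBE DETAIL for sizeInBytes and count(*) for the row count. A rough sketch of the combined lookups against an existing SQLAlchemy Connection (profile_delta_table_summary is an illustrative name, not a function in this package):

from typing import Dict, Optional

from sqlalchemy.engine import Connection


def profile_delta_table_summary(conn: Connection, name: str) -> Dict[str, Optional[int]]:
    # `name` is assumed to be an already-quoted catalog.schema.table string.
    size_in_bytes: Optional[int] = None
    rows_count: Optional[int] = None

    # DESCRIBE DETAIL is Delta-specific and returns a single metadata row
    # with columns such as sizeInBytes and numFiles.
    row = conn.execute(f"DESCRIBE DETAIL {name}").fetchone()
    if row is not None:
        size_in_bytes = int(row._asdict()["sizeInBytes"])

    # count(*) is cheap enough on Delta tables to supply the table-level row count.
    row = conn.execute(f"select count(*) as numRows from {name}").fetchone()
    if row is not None:
        rows_count = int(row._asdict()["numRows"])

    return {"size_in_bytes": size_in_bytes, "rows_count": rows_count}
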

datahub/ingestion/source/unity/report.py
@@ -52,6 +52,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
         default_factory=LossyDict
     )
     num_profile_missing_size_in_bytes: int = 0
+    num_profile_missing_row_count: int = 0
     num_profile_failed_unsupported_column_type: int = 0
     num_profile_failed_int_casts: int = 0


datahub/ingestion/source_report/pulsar.py
@@ -1,9 +1,10 @@
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import Optional

 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList


 @dataclass
@@ -12,9 +13,9 @@ class PulsarSourceReport(StaleEntityRemovalSourceReport):
     tenants_scanned: Optional[int] = None
     namespaces_scanned: Optional[int] = None
     topics_scanned: Optional[int] = None
-    tenants_filtered: List[str] = field(default_factory=list)
-    namespaces_filtered: List[str] = field(default_factory=list)
-    topics_filtered: List[str] = field(default_factory=list)
+    tenants_filtered: LossyList[str] = field(default_factory=LossyList)
+    namespaces_filtered: LossyList[str] = field(default_factory=LossyList)
+    topics_filtered: LossyList[str] = field(default_factory=LossyList)

     def report_pulsar_version(self, version: str) -> None:
         self.pulsar_version = version