acryl-datahub 1.1.0.3rc1__py3-none-any.whl → 1.1.0.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (68)
  1. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/METADATA +2474 -2474
  2. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/RECORD +68 -68
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +27 -0
  5. datahub/cli/delete_cli.py +117 -19
  6. datahub/emitter/rest_emitter.py +18 -1
  7. datahub/ingestion/api/source.py +2 -0
  8. datahub/ingestion/glossary/classification_mixin.py +5 -0
  9. datahub/ingestion/graph/client.py +42 -2
  10. datahub/ingestion/source/bigquery_v2/bigquery.py +18 -0
  11. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  12. datahub/ingestion/source/dbt/dbt_cloud.py +3 -0
  13. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  14. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  15. datahub/ingestion/source/dremio/dremio_api.py +98 -68
  16. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  17. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  18. datahub/ingestion/source/dremio/dremio_source.py +90 -77
  19. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  20. datahub/ingestion/source/file.py +3 -0
  21. datahub/ingestion/source/ge_data_profiler.py +48 -8
  22. datahub/ingestion/source/ge_profiling_config.py +11 -0
  23. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  24. datahub/ingestion/source/kafka/kafka.py +16 -0
  25. datahub/ingestion/source/looker/looker_source.py +1 -0
  26. datahub/ingestion/source/powerbi/powerbi.py +1 -0
  27. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  28. datahub/ingestion/source/redshift/redshift.py +21 -1
  29. datahub/ingestion/source/sac/sac.py +3 -1
  30. datahub/ingestion/source/sigma/sigma.py +1 -0
  31. datahub/ingestion/source/snowflake/snowflake_config.py +3 -6
  32. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  33. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  34. datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
  35. datahub/ingestion/source/sql/clickhouse.py +3 -1
  36. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  37. datahub/ingestion/source/sql/hana.py +3 -1
  38. datahub/ingestion/source/sql/hive_metastore.py +3 -1
  39. datahub/ingestion/source/sql/mariadb.py +0 -1
  40. datahub/ingestion/source/sql/mssql/source.py +8 -1
  41. datahub/ingestion/source/sql/mysql.py +0 -1
  42. datahub/ingestion/source/sql/postgres.py +0 -1
  43. datahub/ingestion/source/sql/sql_common.py +12 -0
  44. datahub/ingestion/source/superset.py +1 -1
  45. datahub/ingestion/source/tableau/tableau.py +1 -0
  46. datahub/ingestion/source/unity/source.py +1 -0
  47. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  48. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  49. datahub/metadata/_internal_schema_classes.py +25 -0
  50. datahub/metadata/schema.avsc +18 -1
  51. datahub/metadata/schemas/ContainerProperties.avsc +6 -0
  52. datahub/metadata/schemas/DataFlowInfo.avsc +6 -0
  53. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  54. datahub/metadata/schemas/DataJobInfo.avsc +6 -0
  55. datahub/metadata/schemas/DataProcessKey.avsc +6 -0
  56. datahub/metadata/schemas/DatasetKey.avsc +6 -0
  57. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +6 -0
  58. datahub/metadata/schemas/MLModelDeploymentKey.avsc +6 -0
  59. datahub/metadata/schemas/MLModelGroupKey.avsc +6 -0
  60. datahub/metadata/schemas/MLModelKey.avsc +6 -0
  61. datahub/metadata/schemas/MetadataChangeEvent.avsc +18 -1
  62. datahub/sdk/main_client.py +9 -10
  63. datahub/sql_parsing/sqlglot_lineage.py +22 -0
  64. datahub/utilities/stats_collections.py +4 -0
  65. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/WHEEL +0 -0
  66. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/entry_points.txt +0 -0
  67. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/licenses/LICENSE +0 -0
  68. {acryl_datahub-1.1.0.3rc1.dist-info → acryl_datahub-1.1.0.4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_config.py
@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
 class DremioSourceConfig(
     DremioConnectionConfig,
     StatefulIngestionConfigBase,
+    BaseTimeWindowConfig,
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 ):
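
The BaseTimeWindowConfig mixin added above gives DremioSourceConfig the same start_time/end_time window fields used by other time-windowed sources; the dremio_source.py hunk further down reads them as self.config.start_time / self.config.end_time. A minimal sketch of the window fields in isolation, assuming the standard BaseTimeWindowConfig field names and pydantic-style parse_obj construction:

from datetime import datetime, timezone

from datahub.configuration.time_window_config import BaseTimeWindowConfig

# Illustrative only: parse an explicit one-day window the way the Dremio recipe
# can now carry it; unset fields fall back to the mixin's defaults.
window = BaseTimeWindowConfig.parse_obj(
    {
        "start_time": datetime(2025, 1, 1, tzinfo=timezone.utc),
        "end_time": datetime(2025, 1, 2, tzinfo=timezone.utc),
    }
)
print(window.start_time, window.end_time)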
datahub/ingestion/source/dremio/dremio_reporting.py
@@ -1,22 +1,43 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Optional
 
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.stats_collections import (
+    TopKDict,
+    float_top_k_dict,
+    int_top_k_dict,
+)
 
 
 @dataclass
 class DremioSourceReport(
-    SQLSourceReport, StaleEntityRemovalSourceReport, IngestionStageReport
+    SQLSourceReport,
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+    BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
     containers_scanned: int = 0
     containers_filtered: int = 0
 
+    api_calls_total: int = 0
+    api_calls_by_method_and_path: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    api_call_secs_by_method_and_path: TopKDict[str, float] = field(
+        default_factory=float_top_k_dict
+    )
+
+    sql_aggregator: Optional[SqlAggregatorReport] = None
+
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
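
The new report fields track Dremio REST traffic: api_calls_by_method_and_path and api_call_secs_by_method_and_path are TopKDict counters (built by int_top_k_dict / float_top_k_dict), so only the most frequent keys are retained when the report is rendered. The dremio_api.py changes that populate them are not shown in this excerpt; a hypothetical sketch of how such counters might be updated around a REST call (the record_api_call helper and the "METHOD path" key shape are assumptions, not part of the diff):

import time

from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport

def record_api_call(report: DremioSourceReport, method: str, path: str, start: float) -> None:
    # Hypothetical helper: bucket calls by "METHOD path" and accumulate count and wall time.
    key = f"{method} {path}"
    report.api_calls_total += 1
    report.api_calls_by_method_and_path[key] += 1
    report.api_call_secs_by_method_and_path[key] += time.monotonic() - start

report = DremioSourceReport()
start = time.monotonic()
# ... issue the actual Dremio REST request here ...
record_api_call(report, "GET", "/api/v3/catalog", start)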
datahub/ingestion/source/dremio/dremio_source.py
@@ -51,7 +51,11 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import PROFILING
+from datahub.ingestion.source_report.ingestion_stage import (
+    LINEAGE_EXTRACTION,
+    METADATA_EXTRACTION,
+    PROFILING,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     UpstreamClass,
@@ -89,6 +93,7 @@ class DremioSourceMapEntry:
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 class DremioSource(StatefulIngestionSourceBase):
     """
     This plugin integrates with Dremio to extract and ingest metadata into DataHub.
@@ -126,6 +131,13 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
+
+        # Set time window for query lineage extraction
+        self.report.window_start_time, self.report.window_end_time = (
+            self.config.start_time,
+            self.config.end_time,
+        )
+
         self.source_map: Dict[str, DremioSourceMapEntry] = dict()
 
         # Initialize API operations
@@ -154,6 +166,7 @@ class DremioSource(StatefulIngestionSourceBase):
             generate_operations=True,
             usage_config=self.config.usage,
         )
+        self.report.sql_aggregator = self.sql_parsing_aggregator.report
 
         # For profiling
         self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -190,84 +203,85 @@ class DremioSource(StatefulIngestionSourceBase):
 
         self.source_map = self._build_source_map()
 
-        # Process Containers
-        containers = self.dremio_catalog.get_containers()
-        for container in containers:
-            try:
-                yield from self.process_container(container)
-                logger.info(
-                    f"Dremio container {container.container_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_containers_failed += 1  # Increment failed containers
-                self.report.report_failure(
-                    message="Failed to process Dremio container",
-                    context=f"{'.'.join(container.path)}.{container.container_name}",
-                    exc=exc,
-                )
+        with self.report.new_stage(METADATA_EXTRACTION):
+            # Process Containers
+            containers = self.dremio_catalog.get_containers()
+            for container in containers:
+                try:
+                    yield from self.process_container(container)
+                    logger.info(
+                        f"Dremio container {container.container_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_containers_failed += 1
+                    self.report.report_failure(
+                        message="Failed to process Dremio container",
+                        context=f"{'.'.join(container.path)}.{container.container_name}",
+                        exc=exc,
+                    )
 
-        # Process Datasets
-        datasets = self.dremio_catalog.get_datasets()
+            # Process Datasets
+            datasets = self.dremio_catalog.get_datasets()
 
-        for dataset_info in datasets:
-            try:
-                yield from self.process_dataset(dataset_info)
-                logger.info(
-                    f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_datasets_failed += 1  # Increment failed datasets
-                self.report.report_failure(
-                    message="Failed to process Dremio dataset",
-                    context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                    exc=exc,
-                )
+            for dataset_info in datasets:
+                try:
+                    yield from self.process_dataset(dataset_info)
+                    logger.info(
+                        f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_datasets_failed += 1  # Increment failed datasets
+                    self.report.report_failure(
+                        message="Failed to process Dremio dataset",
+                        context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                        exc=exc,
+                    )
 
-        # Optionally Process Query Lineage
-        if self.config.include_query_lineage:
-            self.get_query_lineage_workunits()
-
-        # Process Glossary Terms
-        glossary_terms = self.dremio_catalog.get_glossary_terms()
-
-        for glossary_term in glossary_terms:
-            try:
-                yield from self.process_glossary_term(glossary_term)
-            except Exception as exc:
-                self.report.report_failure(
-                    message="Failed to process Glossary terms",
-                    context=f"{glossary_term.glossary_term}",
-                    exc=exc,
-                )
+            # Process Glossary Terms
+            glossary_terms = self.dremio_catalog.get_glossary_terms()
 
-        # Generate workunit for aggregated SQL parsing results
-        for mcp in self.sql_parsing_aggregator.gen_metadata():
-            self.report.report_workunit(mcp.as_workunit())
-            yield mcp.as_workunit()
-
-        # Profiling
-        if self.config.is_profiling_enabled():
-            with ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
-                future_to_dataset = {
-                    executor.submit(self.generate_profiles, dataset): dataset
-                    for dataset in datasets
-                }
-
-                for future in as_completed(future_to_dataset):
-                    dataset_info = future_to_dataset[future]
-                    try:
-                        yield from future.result()
-                    except Exception as exc:
-                        self.report.profiling_skipped_other[
-                            dataset_info.resource_name
-                        ] += 1
-                        self.report.report_failure(
-                            message="Failed to profile dataset",
-                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                            exc=exc,
-                        )
+            for glossary_term in glossary_terms:
+                try:
+                    yield from self.process_glossary_term(glossary_term)
+                except Exception as exc:
+                    self.report.report_failure(
+                        message="Failed to process Glossary terms",
+                        context=f"{glossary_term.glossary_term}",
+                        exc=exc,
+                    )
+
+        # Optionally Process Query Lineage
+        if self.config.include_query_lineage:
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                self.get_query_lineage_workunits()
+
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()
+
+        # Profiling
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage(PROFILING), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(self.generate_profiles, dataset): dataset
+                    for dataset in datasets
+                }
+
+                for future in as_completed(future_to_dataset):
+                    dataset_info = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[
+                            dataset_info.resource_name
+                        ] += 1
+                        self.report.report_failure(
+                            message="Failed to profile dataset",
+                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                            exc=exc,
+                        )
 
     def process_container(
         self, container_info: DremioContainer
@@ -388,8 +402,7 @@ class DremioSource(StatefulIngestionSourceBase):
             env=self.config.env,
             platform_instance=self.config.platform_instance,
         )
-        with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+        yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
datahub/ingestion/source/dremio/dremio_sql_queries.py
@@ -1,3 +1,7 @@
+from datetime import datetime, timedelta
+from typing import Optional
+
+
 class DremioSQLQueries:
     QUERY_DATASETS_CE = """
     SELECT* FROM
@@ -235,28 +239,83 @@ class DremioSQLQueries:
         TABLE_NAME ASC
     """
 
-    # Dremio Documentation: https://docs.dremio.com/current/reference/sql/system-tables/jobs_recent/
-    # queried_datasets incorrectly documented as [varchar]. Observed as varchar.
-    # LENGTH used as opposed to ARRAY_SIZE
-    QUERY_ALL_JOBS = """
-        SELECT
-            job_id,
-            user_name,
-            submitted_ts,
-            query,
-            queried_datasets
-        FROM
-            SYS.JOBS_RECENT
-        WHERE
-            STATUS = 'COMPLETED'
-            AND LENGTH(queried_datasets)>0
-            AND user_name != '$dremio$'
-            AND query_type not like '%INTERNAL%'
-    """
+    @staticmethod
+    def _get_default_start_timestamp_millis() -> str:
+        """Get default start timestamp (1 day ago) in milliseconds precision format"""
+        one_day_ago = datetime.now() - timedelta(days=1)
+        return one_day_ago.strftime("%Y-%m-%d %H:%M:%S.%f")[
+            :-3
+        ]  # Truncate to milliseconds
+
+    @staticmethod
+    def _get_default_end_timestamp_millis() -> str:
+        """Get default end timestamp (now) in milliseconds precision format"""
+        now = datetime.now()
+        return now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]  # Truncate to milliseconds
+
+    @staticmethod
+    def get_query_all_jobs(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
+
+        return f"""
+        SELECT
+            job_id,
+            user_name,
+            submitted_ts,
+            query,
+            queried_datasets
+        FROM
+            SYS.JOBS_RECENT
+        WHERE
+            STATUS = 'COMPLETED'
+            AND LENGTH(queried_datasets)>0
+            AND user_name != '$dremio$'
+            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
+    """
+
+    @staticmethod
+    def get_query_all_jobs_cloud(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs in Dremio Cloud with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 7 days ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
 
-    # Dremio Documentation: https://docs.dremio.com/cloud/reference/sql/system-tables/jobs-historical
-    # queried_datasets correctly documented as [varchar]
-    QUERY_ALL_JOBS_CLOUD = """
+        return f"""
         SELECT
             job_id,
             user_name,
@@ -270,6 +329,8 @@ class DremioSQLQueries:
             AND ARRAY_SIZE(queried_datasets)>0
            AND user_name != '$dremio$'
            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
    """
 
     QUERY_TYPES = [
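
With the module-level QUERY_ALL_JOBS / QUERY_ALL_JOBS_CLOUD constants replaced by builder methods, callers now pass the time window as pre-formatted 'YYYY-MM-DD HH:MM:SS.mmm' strings. A small sketch of formatting a datetime window the same way the new defaults do and building the jobs query (the actual call site lives in dremio_api.py, which is not shown in this excerpt):

from datetime import datetime, timezone

from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries

def to_millis(ts: datetime) -> str:
    # Same truncation as the new defaults: microseconds cut down to milliseconds.
    return ts.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]

start = datetime(2025, 1, 1, tzinfo=timezone.utc)
end = datetime(2025, 1, 2, tzinfo=timezone.utc)

sql = DremioSQLQueries.get_query_all_jobs(
    start_timestamp_millis=to_millis(start),
    end_timestamp_millis=to_millis(end),
)
# The returned SQL filters SYS.JOBS_RECENT on submitted_ts between the two timestamps.
print(sql)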
datahub/ingestion/source/file.py
@@ -18,7 +18,9 @@ from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -187,6 +189,7 @@ class FileSourceReport(StaleEntityRemovalSourceReport):
 @platform_name("Metadata File")
 @config_class(FileSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin pulls metadata from a previously generated file.
datahub/ingestion/source/ge_data_profiler.py
@@ -120,7 +120,6 @@ SNOWFLAKE = "snowflake"
 BIGQUERY = "bigquery"
 REDSHIFT = "redshift"
 DATABRICKS = "databricks"
-TRINO = "trino"
 
 # Type names for Databricks, to match Title Case types in sqlalchemy
 ProfilerTypeMapping.INT_TYPE_NAMES.append("Integer")
@@ -206,6 +205,17 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
             )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
+    elif (
+        self.engine.dialect.name.lower() == GXSqlDialect.AWSATHENA
+        or self.engine.dialect.name.lower() == GXSqlDialect.TRINO
+    ):
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
     return convert_to_json_serializable(
         self.engine.execute(
             sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
@@ -734,11 +744,41 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     def _get_dataset_column_distinct_value_frequencies(
         self, column_profile: DatasetFieldProfileClass, column: str
     ) -> None:
-        if self.config.include_field_distinct_value_frequencies:
+        if not self.config.include_field_distinct_value_frequencies:
+            return
+        try:
+            results = self.dataset.engine.execute(
+                sa.select(
+                    [
+                        sa.column(column),
+                        sa.func.count(sa.column(column)),
+                    ]
+                )
+                .select_from(self.dataset._table)
+                .where(sa.column(column).is_not(None))
+                .group_by(sa.column(column))
+            ).fetchall()
+
             column_profile.distinctValueFrequencies = [
-                ValueFrequencyClass(value=str(value), frequency=count)
-                for value, count in self.dataset.get_column_value_counts(column).items()
+                ValueFrequencyClass(value=str(value), frequency=int(count))
+                for value, count in results
             ]
+            # sort so output is deterministic. don't do it in SQL because not all column
+            # types are sortable in SQL (such as JSON data types on Athena/Trino).
+            column_profile.distinctValueFrequencies = sorted(
+                column_profile.distinctValueFrequencies, key=lambda x: x.value
+            )
+        except Exception as e:
+            logger.debug(
+                f"Caught exception while attempting to get distinct value frequencies for column {column}. {e}"
+            )
+
+            self.report.report_warning(
+                title="Profiling: Unable to Calculate Distinct Value Frequencies",
+                message="Distinct value frequencies for the column will not be accessible",
+                context=f"{self.dataset_name}.{column}",
+                exc=e,
+            )
 
     @_run_with_query_combiner
     def _get_dataset_column_histogram(
@@ -1395,12 +1435,12 @@ class DatahubGEProfiler:
             )
             return None
         finally:
-            if batch is not None and self.base_engine.engine.name.upper() in [
-                "TRINO",
-                "AWSATHENA",
+            if batch is not None and self.base_engine.engine.name.lower() in [
+                GXSqlDialect.TRINO,
+                GXSqlDialect.AWSATHENA,
             ]:
                 if (
-                    self.base_engine.engine.name.upper() == "TRINO"
+                    self.base_engine.engine.name.lower() == GXSqlDialect.TRINO
                     or temp_view is not None
                 ):
                     self._drop_temp_table(batch)
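
The unique-count patch above now routes Athena and Trino through approx_distinct instead of an exact count(distinct ...). A standalone sketch of the statement that branch builds, using SQLAlchemy Core (1.4+ select() style) with an illustrative table and column; the real code selects from the profiled dataset's _table:

import sqlalchemy as sa

# Illustrative names; approx_distinct is compiled as a generic function and is
# understood by Trino/Athena on the server side.
orders = sa.table("orders", sa.column("customer_id"))

stmt = sa.select(sa.func.approx_distinct(sa.column("customer_id"))).select_from(orders)
print(stmt)  # roughly: SELECT approx_distinct(customer_id) ... FROM orders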
datahub/ingestion/source/ge_profiling_config.py
@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )
 
     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )
 
     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )
 
     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )
 
     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )
     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )
 
     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
datahub/ingestion/source/iceberg/iceberg.py
@@ -134,7 +134,9 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
     SourceCapability.OWNERSHIP,
     "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class IcebergSource(StatefulIngestionSourceBase):
     """
     ## Integration Details
datahub/ingestion/source/kafka/kafka.py
@@ -189,6 +189,22 @@ class KafkaConnectionTest:
     SourceCapability.SCHEMA_METADATA,
     "Schemas associated with each topic are extracted from the schema registry. Avro and Protobuf (certified), JSON (incubating). Schema references are supported.",
 )
+@capability(
+    SourceCapability.DATA_PROFILING,
+    "Not supported",
+    supported=False,
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Not supported. If you use Kafka Connect, the kafka-connect source can generate lineage.",
+    supported=False,
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Not supported",
+    supported=False,
+)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class KafkaSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following:
datahub/ingestion/source/looker/looker_source.py
@@ -126,6 +126,7 @@ logger = logging.getLogger(__name__)
     SourceCapability.USAGE_STATS,
     "Enabled by default, configured using `extract_usage_history`",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
datahub/ingestion/source/powerbi/powerbi.py
@@ -1253,6 +1253,7 @@ class Mapper:
     SourceCapability.DATA_PROFILING,
     "Optionally enabled via configuration profiling.enabled",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following:
datahub/ingestion/source/qlik_sense/qlik_sense.py
@@ -109,6 +109,7 @@ logger = logging.getLogger(__name__)
     "Enabled by default, configured using `ingest_owner`",
 )
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class QlikSenseSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following: