acryl-datahub 1.1.0.3rc2__py3-none-any.whl → 1.1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/METADATA +2542 -2542
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/RECORD +66 -66
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +27 -0
- datahub/cli/delete_cli.py +117 -19
- datahub/emitter/rest_emitter.py +18 -1
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +42 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +18 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +3 -0
- datahub/ingestion/source/dbt/dbt_common.py +3 -1
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/dremio/dremio_api.py +98 -68
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +90 -77
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/ge_data_profiler.py +48 -8
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/powerbi/powerbi.py +1 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/sigma/sigma.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +3 -6
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -1
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -1
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +12 -0
- datahub/ingestion/source/tableau/tableau.py +1 -0
- datahub/ingestion/source/unity/source.py +1 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/metadata/_internal_schema_classes.py +25 -0
- datahub/metadata/schema.avsc +18 -1
- datahub/metadata/schemas/ContainerProperties.avsc +6 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +6 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +6 -0
- datahub/metadata/schemas/DataProcessKey.avsc +6 -0
- datahub/metadata/schemas/DatasetKey.avsc +6 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +6 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +6 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +6 -0
- datahub/metadata/schemas/MLModelKey.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +18 -1
- datahub/sql_parsing/sqlglot_lineage.py +21 -6
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.3rc2.dist-info → acryl_datahub-1.1.0.4.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/dremio/dremio_config.py
CHANGED

@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
 class DremioSourceConfig(
     DremioConnectionConfig,
     StatefulIngestionConfigBase,
+    BaseTimeWindowConfig,
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 ):

datahub/ingestion/source/dremio/dremio_reporting.py
CHANGED

@@ -1,22 +1,43 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Optional
 
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.stats_collections import (
+    TopKDict,
+    float_top_k_dict,
+    int_top_k_dict,
+)
 
 
 @dataclass
 class DremioSourceReport(
-    SQLSourceReport,
+    SQLSourceReport,
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+    BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
     containers_scanned: int = 0
     containers_filtered: int = 0
 
+    api_calls_total: int = 0
+    api_calls_by_method_and_path: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    api_call_secs_by_method_and_path: TopKDict[str, float] = field(
+        default_factory=float_top_k_dict
+    )
+
+    sql_aggregator: Optional[SqlAggregatorReport] = None
+
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
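
The report now tracks per-endpoint API call counts and latencies in TopKDict containers so it stays bounded even against large Dremio deployments. The snippet below is only an illustrative sketch of the top-K-counter idea, not DataHub's TopKDict / int_top_k_dict implementation; the class name and the max_entries cutoff are hypothetical.

from collections import defaultdict


class BoundedTopKCounter(defaultdict):
    """Hypothetical stand-in: a counter that can be trimmed to its K largest entries."""

    def __init__(self, max_entries: int = 10) -> None:
        super().__init__(int)
        self.max_entries = max_entries

    def trimmed(self) -> dict:
        # Keep only the top-K keys by value (e.g. the busiest API endpoints),
        # so a report serialized at the end of ingestion stays small.
        ranked = sorted(self.items(), key=lambda kv: kv[1], reverse=True)
        return dict(ranked[: self.max_entries])


calls = BoundedTopKCounter(max_entries=3)
for endpoint in ["GET /catalog", "GET /catalog", "GET /job", "POST /sql", "GET /catalog"]:
    calls[endpoint] += 1

print(calls.trimmed())  # {'GET /catalog': 3, 'GET /job': 1, 'POST /sql': 1}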

datahub/ingestion/source/dremio/dremio_source.py
CHANGED

@@ -51,7 +51,11 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import
+from datahub.ingestion.source_report.ingestion_stage import (
+    LINEAGE_EXTRACTION,
+    METADATA_EXTRACTION,
+    PROFILING,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     UpstreamClass,
@@ -89,6 +93,7 @@ class DremioSourceMapEntry:
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 class DremioSource(StatefulIngestionSourceBase):
     """
     This plugin integrates with Dremio to extract and ingest metadata into DataHub.
@@ -126,6 +131,13 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
+
+        # Set time window for query lineage extraction
+        self.report.window_start_time, self.report.window_end_time = (
+            self.config.start_time,
+            self.config.end_time,
+        )
+
         self.source_map: Dict[str, DremioSourceMapEntry] = dict()
 
         # Initialize API operations
@@ -154,6 +166,7 @@ class DremioSource(StatefulIngestionSourceBase):
             generate_operations=True,
             usage_config=self.config.usage,
         )
+        self.report.sql_aggregator = self.sql_parsing_aggregator.report
 
         # For profiling
         self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -190,84 +203,85 @@ class DremioSource(StatefulIngestionSourceBase):
 
         self.source_map = self._build_source_map()
 
- [15 removed lines not shown]
+        with self.report.new_stage(METADATA_EXTRACTION):
+            # Process Containers
+            containers = self.dremio_catalog.get_containers()
+            for container in containers:
+                try:
+                    yield from self.process_container(container)
+                    logger.info(
+                        f"Dremio container {container.container_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_containers_failed += 1
+                    self.report.report_failure(
+                        message="Failed to process Dremio container",
+                        context=f"{'.'.join(container.path)}.{container.container_name}",
+                        exc=exc,
+                    )
 
-
-
+            # Process Datasets
+            datasets = self.dremio_catalog.get_datasets()
 
- [13 removed lines not shown]
+            for dataset_info in datasets:
+                try:
+                    yield from self.process_dataset(dataset_info)
+                    logger.info(
+                        f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_datasets_failed += 1  # Increment failed datasets
+                    self.report.report_failure(
+                        message="Failed to process Dremio dataset",
+                        context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                        exc=exc,
+                    )
 
-
-
-        self.get_query_lineage_workunits()
-
-        # Process Glossary Terms
-        glossary_terms = self.dremio_catalog.get_glossary_terms()
-
-        for glossary_term in glossary_terms:
-            try:
-                yield from self.process_glossary_term(glossary_term)
-            except Exception as exc:
-                self.report.report_failure(
-                    message="Failed to process Glossary terms",
-                    context=f"{glossary_term.glossary_term}",
-                    exc=exc,
-                )
+            # Process Glossary Terms
+            glossary_terms = self.dremio_catalog.get_glossary_terms()
 
- [28 removed lines not shown]
+            for glossary_term in glossary_terms:
+                try:
+                    yield from self.process_glossary_term(glossary_term)
+                except Exception as exc:
+                    self.report.report_failure(
+                        message="Failed to process Glossary terms",
+                        context=f"{glossary_term.glossary_term}",
+                        exc=exc,
+                    )
+
+        # Optionally Process Query Lineage
+        if self.config.include_query_lineage:
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                self.get_query_lineage_workunits()
+
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()
+
+        # Profiling
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage(PROFILING), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(self.generate_profiles, dataset): dataset
+                    for dataset in datasets
+                }
+
+                for future in as_completed(future_to_dataset):
+                    dataset_info = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[
+                            dataset_info.resource_name
+                        ] += 1
+                        self.report.report_failure(
+                            message="Failed to profile dataset",
+                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                            exc=exc,
+                        )
 
     def process_container(
         self, container_info: DremioContainer
@@ -388,8 +402,7 @@ class DremioSource(StatefulIngestionSourceBase):
             env=self.config.env,
             platform_instance=self.config.platform_instance,
         )
-
-        yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+        yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
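
The extraction path is now wrapped in self.report.new_stage(...) blocks (METADATA_EXTRACTION, LINEAGE_EXTRACTION, PROFILING), and profiling fans out over a ThreadPoolExecutor. The sketch below only illustrates the stage-timing context-manager pattern; StageTimingReport is a hypothetical stand-in, not the actual IngestionStageReport.new_stage implementation.

import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageTimingReport:
    # Seconds spent in each named stage, e.g. "METADATA_EXTRACTION".
    stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            self.stage_durations[stage] = time.perf_counter() - start


report = StageTimingReport()
with report.new_stage("METADATA_EXTRACTION"):
    time.sleep(0.01)  # stand-in for emitting containers, datasets, and glossary terms
with report.new_stage("PROFILING"):
    time.sleep(0.01)  # stand-in for the thread-pool profiling fan-out

print(report.stage_durations)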

datahub/ingestion/source/dremio/dremio_sql_queries.py
CHANGED

@@ -1,3 +1,7 @@
+from datetime import datetime, timedelta
+from typing import Optional
+
+
 class DremioSQLQueries:
     QUERY_DATASETS_CE = """
     SELECT* FROM
@@ -235,28 +239,83 @@ class DremioSQLQueries:
         TABLE_NAME ASC
     """
 
- [18 removed lines not shown]
+    @staticmethod
+    def _get_default_start_timestamp_millis() -> str:
+        """Get default start timestamp (1 day ago) in milliseconds precision format"""
+        one_day_ago = datetime.now() - timedelta(days=1)
+        return one_day_ago.strftime("%Y-%m-%d %H:%M:%S.%f")[
+            :-3
+        ]  # Truncate to milliseconds
+
+    @staticmethod
+    def _get_default_end_timestamp_millis() -> str:
+        """Get default end timestamp (now) in milliseconds precision format"""
+        now = datetime.now()
+        return now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]  # Truncate to milliseconds
+
+    @staticmethod
+    def get_query_all_jobs(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
+
+        return f"""
+        SELECT
+            job_id,
+            user_name,
+            submitted_ts,
+            query,
+            queried_datasets
+        FROM
+            SYS.JOBS_RECENT
+        WHERE
+            STATUS = 'COMPLETED'
+            AND LENGTH(queried_datasets)>0
+            AND user_name != '$dremio$'
+            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
+        """
+
+    @staticmethod
+    def get_query_all_jobs_cloud(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs in Dremio Cloud with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 7 days ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
 
-
-    # queried_datasets correctly documented as [varchar]
-    QUERY_ALL_JOBS_CLOUD = """
+        return f"""
         SELECT
             job_id,
             user_name,
@@ -270,6 +329,8 @@ class DremioSQLQueries:
             AND ARRAY_SIZE(queried_datasets)>0
            AND user_name != '$dremio$'
            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
         """
 
     QUERY_TYPES = [
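
With the job queries turned into the get_query_all_jobs / get_query_all_jobs_cloud builders, callers can bound the job-history scan to an explicit time window. A rough usage sketch based only on the signatures shown above (the import path follows the module path in the file list):

from datetime import datetime, timedelta

from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries

# No arguments: fall back to the builders' defaults (roughly the last day).
default_window_sql = DremioSQLQueries.get_query_all_jobs()

# Explicit window: 'YYYY-MM-DD HH:MM:SS.mmm' strings, truncated to milliseconds.
fmt = "%Y-%m-%d %H:%M:%S.%f"
start = (datetime.now() - timedelta(days=3)).strftime(fmt)[:-3]
end = datetime.now().strftime(fmt)[:-3]

software_sql = DremioSQLQueries.get_query_all_jobs(
    start_timestamp_millis=start,
    end_timestamp_millis=end,
)
cloud_sql = DremioSQLQueries.get_query_all_jobs_cloud(
    start_timestamp_millis=start,
    end_timestamp_millis=end,
)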
datahub/ingestion/source/file.py
CHANGED

@@ -18,7 +18,9 @@ from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -187,6 +189,7 @@ class FileSourceReport(StaleEntityRemovalSourceReport):
 @platform_name("Metadata File")
 @config_class(FileSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin pulls metadata from a previously generated file.

datahub/ingestion/source/ge_data_profiler.py
CHANGED

@@ -120,7 +120,6 @@ SNOWFLAKE = "snowflake"
 BIGQUERY = "bigquery"
 REDSHIFT = "redshift"
 DATABRICKS = "databricks"
-TRINO = "trino"
 
 # Type names for Databricks, to match Title Case types in sqlalchemy
 ProfilerTypeMapping.INT_TYPE_NAMES.append("Integer")
@@ -206,6 +205,17 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
             )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
+    elif (
+        self.engine.dialect.name.lower() == GXSqlDialect.AWSATHENA
+        or self.engine.dialect.name.lower() == GXSqlDialect.TRINO
+    ):
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
     return convert_to_json_serializable(
         self.engine.execute(
             sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
@@ -734,11 +744,41 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     def _get_dataset_column_distinct_value_frequencies(
         self, column_profile: DatasetFieldProfileClass, column: str
     ) -> None:
-        if self.config.include_field_distinct_value_frequencies:
+        if not self.config.include_field_distinct_value_frequencies:
+            return
+        try:
+            results = self.dataset.engine.execute(
+                sa.select(
+                    [
+                        sa.column(column),
+                        sa.func.count(sa.column(column)),
+                    ]
+                )
+                .select_from(self.dataset._table)
+                .where(sa.column(column).is_not(None))
+                .group_by(sa.column(column))
+            ).fetchall()
+
             column_profile.distinctValueFrequencies = [
-                ValueFrequencyClass(value=str(value), frequency=count)
-                for value, count in
+                ValueFrequencyClass(value=str(value), frequency=int(count))
+                for value, count in results
             ]
+            # sort so output is deterministic. don't do it in SQL because not all column
+            # types are sortable in SQL (such as JSON data types on Athena/Trino).
+            column_profile.distinctValueFrequencies = sorted(
+                column_profile.distinctValueFrequencies, key=lambda x: x.value
+            )
+        except Exception as e:
+            logger.debug(
+                f"Caught exception while attempting to get distinct value frequencies for column {column}. {e}"
+            )
+
+            self.report.report_warning(
+                title="Profiling: Unable to Calculate Distinct Value Frequencies",
+                message="Distinct value frequencies for the column will not be accessible",
+                context=f"{self.dataset_name}.{column}",
+                exc=e,
+            )
 
     @_run_with_query_combiner
     def _get_dataset_column_histogram(
@@ -1395,12 +1435,12 @@ class DatahubGEProfiler:
             )
             return None
         finally:
-            if batch is not None and self.base_engine.engine.name.
-
-
+            if batch is not None and self.base_engine.engine.name.lower() in [
+                GXSqlDialect.TRINO,
+                GXSqlDialect.AWSATHENA,
             ]:
                 if (
-                    self.base_engine.engine.name.
+                    self.base_engine.engine.name.lower() == GXSqlDialect.TRINO
                     or temp_view is not None
                 ):
                     self._drop_temp_table(batch)
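
The unique-count patch now routes Trino and Athena through approx_distinct instead of an exact COUNT(DISTINCT ...). Below is a minimal SQLAlchemy sketch of the two query shapes, separate from the profiler code and assuming SQLAlchemy 1.4-style constructs; the events table is made up for illustration.

import sqlalchemy as sa

events = sa.table("events", sa.column("user_id"))

# Exact cardinality: the shape the generic fallback builds.
exact_stmt = sa.select(
    sa.func.count(sa.func.distinct(sa.column("user_id")))
).select_from(events)

# Approximate cardinality: the cheaper function the Trino/Athena branch now uses.
approx_stmt = sa.select(sa.func.approx_distinct(sa.column("user_id"))).select_from(events)

# Compiling with the generic dialect shows the difference in the generated SQL.
print(exact_stmt)   # roughly: SELECT count(distinct(user_id)) ... FROM events
print(approx_stmt)  # roughly: SELECT approx_distinct(user_id) ... FROM events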

datahub/ingestion/source/ge_profiling_config.py
CHANGED

@@ -125,6 +125,7 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )
 
     profile_table_size_limit: Optional[int] = Field(
@@ -132,6 +133,9 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
+        schema_extra={
+            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
+        },
     )
 
     profile_table_row_limit: Optional[int] = Field(
@@ -139,12 +143,14 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
+        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )
 
     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
+        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )
 
     # The query combiner enables us to combine multiple queries into a single query,
@@ -161,27 +167,32 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
+        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
     partition_datetime: Optional[datetime.datetime] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
+        schema_extra={"supported_sources": ["bigquery"]},
     )
     use_sampling: bool = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
     sample_size: int = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
+        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )
 
     profile_external_tables: bool = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
+        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )
 
     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
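
Each of these profiling flags now carries a schema_extra={"supported_sources": [...]} hint on its pydantic Field. Under pydantic v1, extra keyword arguments to Field are carried into the field's generated JSON schema, which is presumably how downstream doc tooling can surface the supported-source list; the model below is a hypothetical stand-in, not the real GEProfilingConfig.

import json

from pydantic import BaseModel, Field  # pydantic v1 API


class ExampleProfilingConfig(BaseModel):
    # `schema_extra` is not a standard Field argument; pydantic v1 passes
    # unknown Field kwargs through into the field's JSON schema.
    use_sampling: bool = Field(
        default=True,
        description="Whether to profile column level stats on a sample of the table.",
        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
    )


field_schema = ExampleProfilingConfig.schema()["properties"]["use_sampling"]
print(json.dumps(field_schema, indent=2))  # the schema_extra payload should appear here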

datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -134,7 +134,9 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
     SourceCapability.OWNERSHIP,
     "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class IcebergSource(StatefulIngestionSourceBase):
     """
     ## Integration Details

datahub/ingestion/source/kafka/kafka.py
CHANGED

@@ -189,6 +189,22 @@ class KafkaConnectionTest:
     SourceCapability.SCHEMA_METADATA,
     "Schemas associated with each topic are extracted from the schema registry. Avro and Protobuf (certified), JSON (incubating). Schema references are supported.",
 )
+@capability(
+    SourceCapability.DATA_PROFILING,
+    "Not supported",
+    supported=False,
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Not supported. If you use Kafka Connect, the kafka-connect source can generate lineage.",
+    supported=False,
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Not supported",
+    supported=False,
+)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class KafkaSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following:

datahub/ingestion/source/looker/looker_source.py
CHANGED

@@ -126,6 +126,7 @@ logger = logging.getLogger(__name__)
     SourceCapability.USAGE_STATS,
     "Enabled by default, configured using `extract_usage_history`",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
     """
     This plugin extracts the following:

datahub/ingestion/source/powerbi/powerbi.py
CHANGED

@@ -1253,6 +1253,7 @@ class Mapper:
     SourceCapability.DATA_PROFILING,
     "Optionally enabled via configuration profiling.enabled",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following:

datahub/ingestion/source/qlik_sense/qlik_sense.py
CHANGED

@@ -109,6 +109,7 @@ logger = logging.getLogger(__name__)
     "Enabled by default, configured using `ingest_owner`",
 )
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class QlikSenseSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following: