acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +46 -9
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +18 -6
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/datahub_gc.py

@@ -65,18 +65,18 @@ class DataHubGcSourceConfig(ConfigModel):
         description="Sleep between truncation monitoring.",
     )

-    dataprocess_cleanup:
-
+    dataprocess_cleanup: DataProcessCleanupConfig = Field(
+        default_factory=DataProcessCleanupConfig,
         description="Configuration for data process cleanup",
     )

-    soft_deleted_entities_cleanup:
-
+    soft_deleted_entities_cleanup: SoftDeletedEntitiesCleanupConfig = Field(
+        default_factory=SoftDeletedEntitiesCleanupConfig,
         description="Configuration for soft deleted entities cleanup",
     )

-    execution_request_cleanup:
-
+    execution_request_cleanup: DatahubExecutionRequestCleanupConfig = Field(
+        default_factory=DatahubExecutionRequestCleanupConfig,
         description="Configuration for execution request cleanup",
     )


@@ -108,28 +108,22 @@ class DataHubGcSource(Source):
         self.ctx = ctx
         self.config = config
         self.report = DataHubGcSourceReport()
+        self.report.event_not_produced_warn = False
         self.graph = ctx.require_graph("The DataHubGc source")
-        self.dataprocess_cleanup
-
-
-
-
-        self.
-
-
-
-
-
-
-
-
-        )
-        if self.config.execution_request_cleanup:
-            self.execution_request_cleanup = DatahubExecutionRequestCleanup(
-                config=self.config.execution_request_cleanup,
-                graph=self.graph,
-                report=self.report,
-            )
+        self.dataprocess_cleanup = DataProcessCleanup(
+            ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
+        )
+        self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
+            ctx,
+            self.config.soft_deleted_entities_cleanup,
+            self.report,
+            self.config.dry_run,
+        )
+        self.execution_request_cleanup = DatahubExecutionRequestCleanup(
+            config=self.config.execution_request_cleanup,
+            graph=self.graph,
+            report=self.report,
+        )

     @classmethod
     def create(cls, config_dict, ctx):

@@ -144,15 +138,32 @@ class DataHubGcSource(Source):
         self,
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
-
+            try:
+                self.revoke_expired_tokens()
+            except Exception as e:
+                self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
-
-
-
-
-
-
-
+            try:
+                self.truncate_indices()
+            except Exception as e:
+                self.report.failure("While trying to truncate indices ", exc=e)
+        if self.config.soft_deleted_entities_cleanup.enabled:
+            try:
+                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+            except Exception as e:
+                self.report.failure(
+                    "While trying to cleanup soft deleted entities ", exc=e
+                )
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        if self.config.dataprocess_cleanup.enabled:
+            try:
+                yield from self.dataprocess_cleanup.get_workunits_internal()
+            except Exception as e:
+                self.report.failure("While trying to cleanup data process ", exc=e)
         yield from []

     def truncate_indices(self) -> None:
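The get_workunits change above wraps each cleanup phase in its own try/except so that a failing phase is reported and the remaining phases still run. A minimal standalone sketch of that pattern (the step names and the plain logger here are illustrative, not the source's report object):

# Minimal sketch (not DataHub code) of the pattern get_workunits now follows:
# run each cleanup step independently so one failure is reported without
# stopping the remaining steps.
import logging
from typing import Callable, Dict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("gc-sketch")


def run_cleanup_steps(steps: Dict[str, Callable[[], None]]) -> Dict[str, str]:
    results: Dict[str, str] = {}
    for name, step in steps.items():
        try:
            step()
            results[name] = "ok"
        except Exception as e:
            # Mirror the diff's report.failure("While trying to ...", exc=e) style.
            logger.warning("While trying to %s: %s", name, e)
            results[name] = f"failed: {e}"
    return results


def truncate_indices() -> None:
    pass


def cleanup_expired_tokens() -> None:
    raise RuntimeError("simulated failure")


print(run_cleanup_steps({"truncate indices": truncate_indices,
                         "cleanup expired tokens": cleanup_expired_tokens}))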
datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -98,6 +98,9 @@ query getDataJobRuns($dataJobUrn: String!, $start: Int!, $count: Int!) {


 class DataProcessCleanupConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True, description="Whether to do data process cleanup."
+    )
     retention_days: Optional[int] = Field(
         10,
         description="Number of days to retain metadata in DataHub",

@@ -114,11 +117,11 @@ class DataProcessCleanupConfig(ConfigModel):
     )

     delete_empty_data_jobs: bool = Field(
-
+        False, description="Whether to delete Data Jobs without runs"
     )

     delete_empty_data_flows: bool = Field(
-
+        False, description="Whether to delete Data Flows without runs"
     )

     hard_delete_entities: bool = Field(

@@ -128,7 +131,7 @@ class DataProcessCleanupConfig(ConfigModel):

     batch_size: int = Field(
         500,
-        description="The number of entities to get in a batch from
+        description="The number of entities to get in a batch from API",
     )

     max_workers: int = Field(

@@ -173,9 +176,9 @@ class DataProcessCleanup:
     """
     This source is a maintenance source which cleans up old/unused aspects.

-    Currently it only supports
+    Currently it only supports:
     - DataFlow
-    -DataJob
+    - DataJob
     - DataProcessInstance

     """

@@ -207,23 +210,34 @@ class DataProcessCleanup:
         assert self.ctx.graph
         dpis = []
         start = 0
+        # This graphql endpoint doesn't support scrolling and therefore after 10k DPIs it causes performance issues on ES
+        # Therefore, we are limiting the max DPIs to 9000
+        max_item = 9000
         while True:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                job_query_result = self.ctx.graph.execute_graphql(
+                    DATA_PROCESS_INSTANCES_QUERY,
+                    {"dataJobUrn": job_urn, "start": start, "count": batch_size},
+                )
+                job_data = job_query_result.get("dataJob")
+                if not job_data:
+                    logger.error(f"Error getting job {job_urn}")
+                    break
+
+                runs_data = job_data.get("runs")
+                if not runs_data:
+                    logger.error(f"Error getting runs for {job_urn}")
+                    break
+
+                runs = runs_data.get("runs")
+                dpis.extend(runs)
+                start += batch_size
+                if len(runs) < batch_size or start >= max_item:
+                    break
+            except Exception as e:
+                self.report.failure(
+                    f"Exception while fetching DPIs for job {job_urn}:", exc=e
+                )
                 break
         return dpis
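fetch_dpis above pages through runs with start/count and now stops at a hard cap (max_item = 9000), since the GraphQL endpoint cannot scroll past 10k results without hurting Elasticsearch. A standalone sketch of the same capped offset paging, with a stub in place of the GraphQL call:

# Standalone sketch (illustrative names, not DataHub's client): offset-based
# paging with a hard cap, mirroring the max_item guard added in fetch_dpis.
from typing import Dict, List


def fetch_page(start: int, count: int) -> List[Dict[str, int]]:
    # Stand-in for the GraphQL call; pretend the server holds 25 runs total.
    total = 25
    return [{"id": i} for i in range(start, min(start + count, total))]


def fetch_all(batch_size: int = 10, max_item: int = 20) -> List[Dict[str, int]]:
    items: List[Dict[str, int]] = []
    start = 0
    while True:
        page = fetch_page(start, batch_size)
        items.extend(page)
        start += batch_size
        # Stop on a short page or once the hard cap is reached, as the diff does.
        if len(page) < batch_size or start >= max_item:
            break
    return items


print(len(fetch_all()))  # 20: capped even though 25 items exist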
@@ -243,9 +257,14 @@ class DataProcessCleanup:
                     futures[future] = dpi

         for future in as_completed(futures):
-
-
-
+            try:
+                future.result()
+                deleted_count_last_n += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                self.report.report_failure(
+                    f"Exception while deleting DPI: {e}", exc=e
+                )
             if deleted_count_last_n % self.config.batch_size == 0:
                 logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                 if self.config.delay:

@@ -267,7 +286,7 @@ class DataProcessCleanup:

         if self.dry_run:
             logger.info(
-                f"Dry run is on otherwise it would have deleted {urn} with hard deletion is{self.config.hard_delete_entities}"
+                f"Dry run is on otherwise it would have deleted {urn} with hard deletion is {self.config.hard_delete_entities}"
             )
             return

@@ -277,7 +296,12 @@ class DataProcessCleanup:
         assert self.ctx.graph

         dpis = self.fetch_dpis(job.urn, self.config.batch_size)
-        dpis.sort(
+        dpis.sort(
+            key=lambda x: x["created"]["time"]
+            if x.get("created") and x["created"].get("time")
+            else 0,
+            reverse=True,
+        )

         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             if self.config.keep_last_n:

@@ -309,15 +333,23 @@ class DataProcessCleanup:
                 if dpi.get("deleted"):
                     continue

-                if
+                if (
+                    not dpi.get("created")
+                    or not dpi["created"].get("time")
+                    or dpi["created"]["time"] < retention_time * 1000
+                ):
                     future = executor.submit(
                         self.delete_entity, dpi["urn"], "dataprocessInstance"
                     )
                     futures[future] = dpi

         for future in as_completed(futures):
-
-
+            try:
+                future.result()
+                deleted_count_retention += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)

             if deleted_count_retention % self.config.batch_size == 0:
                 logger.info(

@@ -328,9 +360,12 @@ class DataProcessCleanup:
                     logger.info(f"Sleeping for {self.config.delay} seconds")
                     time.sleep(self.config.delay)

-
-
-
+        if deleted_count_retention > 0:
+            logger.info(
+                f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
+            )
+        else:
+            logger.debug(f"No DPIs to delete from {job.urn} due to retention")

     def get_data_flows(self) -> Iterable[DataFlowEntity]:
         assert self.ctx.graph

@@ -339,17 +374,26 @@ class DataProcessCleanup:
         previous_scroll_id: Optional[str] = None

         while True:
-            result =
-
-
-
-
-
-
-
+            result = None
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    DATAFLOW_QUERY,
+                    {
+                        "query": "*",
+                        "scrollId": scroll_id if scroll_id else None,
+                        "batchSize": self.config.batch_size,
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get dataflows with {scroll_id}", exc=e
+                )
+                break
+
             scrollAcrossEntities = result.get("scrollAcrossEntities")
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
+            logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")

             scroll_id = scrollAcrossEntities.get("nextScrollId")
             for flow in scrollAcrossEntities.get("searchResults"):

@@ -366,6 +410,8 @@ class DataProcessCleanup:
             previous_scroll_id = scroll_id

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.enabled:
+            return []
         assert self.ctx.graph

         dataFlows: Dict[str, DataFlowEntity] = {}

@@ -373,17 +419,26 @@ class DataProcessCleanup:
             dataFlows[flow.urn] = flow

         scroll_id: Optional[str] = None
+        previous_scroll_id: Optional[str] = None
+
         dataJobs: Dict[str, List[DataJobEntity]] = defaultdict(list)
         deleted_jobs: int = 0
+
         while True:
-
-
-
-
-
-
-
-
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    DATAJOB_QUERY,
+                    {
+                        "query": "*",
+                        "scrollId": scroll_id if scroll_id else None,
+                        "batchSize": self.config.batch_size,
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get data jobs with {scroll_id}", exc=e
+                )
+                break
             scrollAcrossEntities = result.get("scrollAcrossEntities")
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")

@@ -404,7 +459,9 @@ class DataProcessCleanup:
                     try:
                         self.delete_dpi_from_datajobs(datajob_entity)
                     except Exception as e:
-
+                        self.report.failure(
+                            f"While trying to delete {datajob_entity} ", exc=e
+                        )
                 if (
                     datajob_entity.total_runs == 0
                     and self.config.delete_empty_data_jobs

@@ -419,9 +476,11 @@ class DataProcessCleanup:
                 else:
                     dataJobs[datajob_entity.flow_urn].append(datajob_entity)

-            if not scroll_id:
+            if not scroll_id or previous_scroll_id == scroll_id:
                 break

+            previous_scroll_id = scroll_id
+
         logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:

@@ -436,4 +495,5 @@ class DataProcessCleanup:
                 if deleted_jobs % self.config.batch_size == 0:
                     logger.info(f"Deleted {deleted_data_flows} DataFlows")
         logger.info(f"Deleted {deleted_data_flows} DataFlows")
+
         return []
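The DataFlow/DataJob listing above pages with scrollAcrossEntities and now also tracks previous_scroll_id, so the loop ends when the server returns no scroll id or repeats one. A standalone sketch of that termination rule, with a stub paging function in place of the GraphQL call:

# Standalone sketch (illustrative): scroll-style paging that terminates when the
# server stops returning a scroll id or returns the same id twice, mirroring the
# previous_scroll_id guard added to get_workunits_internal.
from typing import List, Optional, Tuple


def scroll_page(scroll_id: Optional[str]) -> Tuple[List[str], Optional[str]]:
    # Stand-in for a scrollAcrossEntities call; a misbehaving server repeats "s2".
    pages = {None: (["a", "b"], "s1"), "s1": (["c"], "s2"), "s2": ([], "s2")}
    return pages[scroll_id]


def scroll_all() -> List[str]:
    results: List[str] = []
    scroll_id: Optional[str] = None
    previous_scroll_id: Optional[str] = None
    while True:
        page, scroll_id = scroll_page(scroll_id)
        results.extend(page)
        if not scroll_id or previous_scroll_id == scroll_id:
            break
        previous_scroll_id = scroll_id
    return results


print(scroll_all())  # ['a', 'b', 'c'] -- stops instead of looping on a repeated id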
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -20,6 +20,9 @@ logger = logging.getLogger(__name__)


 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True, description="Whether to do soft deletion cleanup."
+    )
     retention_days: Optional[int] = Field(
         10,
         description="Number of days to retain metadata in DataHub",

@@ -60,7 +63,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         description="Query to filter entities",
     )
     limit_entities_delete: Optional[int] = Field(
-
+        25000, description="Max number of entities to delete."
     )

     runtime_limit_seconds: Optional[int] = Field(

@@ -104,7 +107,7 @@ class SoftDeletedEntitiesCleanup:
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.
+        entity_urn = Urn.from_string(urn)
         self.report.num_soft_deleted_entity_removed += 1
         self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
             self.report.num_soft_deleted_entity_removed_by_type.get(

@@ -156,6 +159,8 @@ class SoftDeletedEntitiesCleanup:
             self.delete_entity(urn)

     def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
         assert self.ctx.graph
         start_time = time.time()
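delete_entity above parses the urn to bump a per-entity-type counter on the report. A standalone sketch of the same bookkeeping that pulls the type out of a "urn:li:<entityType>:..." string directly, without DataHub's Urn class:

# Standalone sketch (not DataHub's Urn class): tally removals per entity type the
# way the report counters above do.
from collections import defaultdict
from typing import Dict, List


def entity_type(urn: str) -> str:
    parts = urn.split(":")
    if len(parts) < 4 or parts[0] != "urn" or parts[1] != "li":
        raise ValueError(f"Not a DataHub urn: {urn}")
    return parts[2]


def count_by_type(urns: List[str]) -> Dict[str, int]:
    counts: Dict[str, int] = defaultdict(int)
    for urn in urns:
        counts[entity_type(urn)] += 1
    return dict(counts)


print(
    count_by_type(
        [
            "urn:li:dataset:(urn:li:dataPlatform:hive,db.tbl,PROD)",
            "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag,prod),task)",
            "urn:li:dataset:(urn:li:dataPlatform:kafka,topic,PROD)",
        ]
    )
)  # {'dataset': 2, 'dataJob': 1}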
datahub/ingestion/source/ge_data_profiler.py

@@ -7,6 +7,7 @@ import dataclasses
 import functools
 import json
 import logging
+import re
 import threading
 import traceback
 import unittest.mock

@@ -56,7 +57,11 @@ from datahub.ingestion.source.profiling.common import (
     convert_to_cardinality,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    EditableSchemaMetadata,
+    NumberType,
+)
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,

@@ -123,6 +128,8 @@ ProfilerTypeMapping.BINARY_TYPE_NAMES.append("LargeBinary")

 _datasource_connection_injection_lock = threading.Lock()

+NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")
+

 @contextlib.contextmanager
 def _inject_connection_into_datasource(conn: Connection) -> Iterator[None]:

@@ -165,11 +172,9 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> int:
         return convert_to_json_serializable(element_values.fetchone()[0])
     elif self.engine.dialect.name.lower() == BIGQUERY:
         element_values = self.engine.execute(
-            sa.select(
-
-
-            ]
-            ).select_from(self._table)
+            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
+                self._table
+            )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
     elif self.engine.dialect.name.lower() == SNOWFLAKE:

@@ -360,6 +365,8 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     platform: str
     env: str

+    column_types: Dict[str, str] = dataclasses.field(default_factory=dict)
+
     def _get_columns_to_profile(self) -> List[str]:
         if not self.config.any_field_level_metrics_enabled():
             return []

@@ -373,11 +380,15 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):

         for col_dict in self.dataset.columns:
             col = col_dict["name"]
+            self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
             if not self.config._allow_deny_patterns.allowed(
                 f"{self.dataset_name}.{col}"
             ):
                 ignored_columns_by_pattern.append(col)
+            # We try to ignore nested columns as well
+            elif not self.config.profile_nested_fields and "." in col:
+                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:

@@ -407,9 +418,18 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         return columns_to_profile

     def _should_ignore_column(self, sqlalchemy_type: sa.types.TypeEngine) -> bool:
-
-
-
+        # We don't profiles columns with None types
+        if str(sqlalchemy_type) == "NULL":
+            return True
+
+        sql_type = str(sqlalchemy_type)
+
+        match = re.match(NORMALIZE_TYPE_PATTERN, sql_type)
+
+        if match:
+            sql_type = match.group(1)
+
+        return sql_type in _get_column_types_to_ignore(self.dataset.engine.dialect.name)

     @_run_with_query_combiner
     def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
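_should_ignore_column now normalizes the SQLAlchemy type string with NORMALIZE_TYPE_PATTERN before comparing it against the ignore list, so parametrized types match their base names. A quick check of that behaviour (the regex is copied from the hunk above; normalize_sql_type is just an illustrative wrapper):

# Quick check of the NORMALIZE_TYPE_PATTERN added above: it strips type
# parameters so "DECIMAL(10,2)" or "ARRAY<STRING>" compare equal to their base names.
import re

NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")


def normalize_sql_type(sql_type: str) -> str:
    match = NORMALIZE_TYPE_PATTERN.match(sql_type)
    return match.group(1) if match else sql_type


for raw in ["VARCHAR", "DECIMAL(10,2)", "ARRAY<STRING>", "STRUCT<a INT64, b STRING>"]:
    print(raw, "->", normalize_sql_type(raw))
# VARCHAR -> VARCHAR
# DECIMAL(10,2) -> DECIMAL
# ARRAY<STRING> -> ARRAY
# STRUCT<a INT64, b STRING> -> STRUCT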
@@ -417,6 +437,21 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             self.dataset, column
         )

+        if column_spec.type_ == ProfilerDataType.UNKNOWN:
+            try:
+                datahub_field_type = resolve_sql_type(
+                    self.column_types[column], self.dataset.engine.dialect.name.lower()
+                )
+            except Exception as e:
+                logger.debug(
+                    f"Error resolving sql type {self.column_types[column]}: {e}"
+                )
+                datahub_field_type = None
+            if datahub_field_type is None:
+                return
+            if isinstance(datahub_field_type, NumberType):
+                column_spec.type_ = ProfilerDataType.NUMERIC
+
     @_run_with_query_combiner
     def _get_column_cardinality(
         self, column_spec: _SingleColumnSpec, column: str

@@ -1397,6 +1432,8 @@ class DatahubGEProfiler:
 def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
     if dialect_name.lower() == POSTGRESQL:
         return ["JSON"]
+    elif dialect_name.lower() == BIGQUERY:
+        return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]

     return []
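_get_column_type now falls back to resolve_sql_type when great_expectations reports a column as UNKNOWN, and only upgrades the column to NUMERIC if the resolver returns a number type. A standalone sketch of that fallback logic; resolve_type_stub is a hypothetical stand-in, not DataHub's resolve_sql_type:

# Standalone sketch of the UNKNOWN-type fallback added to _get_column_type.
# resolve_type_stub is a hypothetical stand-in for datahub's resolve_sql_type,
# which maps a raw SQL type string plus a dialect name to a DataHub field type.
from typing import Optional

NUMERIC_BASE_TYPES = {"NUMERIC", "BIGNUMERIC", "DECIMAL"}


def resolve_type_stub(sql_type: str, dialect: str) -> Optional[str]:
    # Reduced stand-in: real resolution is per-platform and far more complete.
    base = sql_type.split("(")[0].split("<")[0].strip().upper()
    return "NumberType" if base in NUMERIC_BASE_TYPES else None


def classify_column(current_type: str, sql_type: str, dialect: str) -> str:
    if current_type != "UNKNOWN":
        return current_type
    try:
        resolved = resolve_type_stub(sql_type, dialect)
    except Exception:
        resolved = None  # resolution errors leave the column as UNKNOWN
    if resolved == "NumberType":
        return "NUMERIC"
    return current_type


print(classify_column("UNKNOWN", "BIGNUMERIC(38,9)", "bigquery"))  # NUMERIC
print(classify_column("UNKNOWN", "GEOGRAPHY", "bigquery"))         # UNKNOWN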
datahub/ingestion/source/ge_profiling_config.py

@@ -188,6 +188,11 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         ),
     )

+    profile_nested_fields: bool = Field(
+        default=False,
+        description="Whether to profile complex types like structs, arrays and maps. ",
+    )
+
     @pydantic.root_validator(pre=True)
     def deprecate_bigquery_temp_table_schema(cls, values):
         # TODO: Update docs to remove mention of this field.
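The new profile_nested_fields flag pairs with the profiler change above: dotted (nested) column names are skipped from profiling unless the flag is set. A standalone illustration of that selection rule:

# Standalone illustration of the rule the new profile_nested_fields flag controls:
# by default, dotted (nested) column names are skipped during column selection.
from typing import List


def columns_to_profile(columns: List[str], profile_nested_fields: bool = False) -> List[str]:
    selected = []
    for col in columns:
        if not profile_nested_fields and "." in col:
            continue  # nested field, skipped unless explicitly enabled
        selected.append(col)
    return selected


cols = ["id", "payload.user.name", "payload.amount", "created_at"]
print(columns_to_profile(cols))                              # ['id', 'created_at']
print(columns_to_profile(cols, profile_nested_fields=True))  # all four columns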
datahub/ingestion/source/iceberg/iceberg.py

@@ -9,6 +9,7 @@ from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
     NoSuchNamespaceError,
     NoSuchPropertyException,
+    NoSuchTableError,
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table

@@ -104,7 +105,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default.")
 @capability(
     SourceCapability.OWNERSHIP,
-    "
+    "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class IcebergSource(StatefulIngestionSourceBase):

@@ -192,9 +193,7 @@ class IcebergSource(StatefulIngestionSourceBase):
                 table = thread_local.local_catalog.load_table(dataset_path)
                 time_taken = timer.elapsed_seconds()
                 self.report.report_table_load_time(time_taken)
-                LOGGER.debug(
-                    f"Loaded table: {table.identifier}, time taken: {time_taken}"
-                )
+                LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
                 yield from self._create_iceberg_workunit(dataset_name, table)
             except NoSuchPropertyException as e:
                 self.report.report_warning(

@@ -206,12 +205,20 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
             except NoSuchIcebergTableError as e:
                 self.report.report_warning(
-                    "
+                    "not-an-iceberg-table",
                     f"Failed to create workunit for {dataset_name}. {e}",
                 )
                 LOGGER.warning(
                     f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
                 )
+            except NoSuchTableError as e:
+                self.report.report_warning(
+                    "no-such-table",
+                    f"Failed to create workunit for {dataset_name}. {e}",
+                )
+                LOGGER.warning(
+                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                )
             except Exception as e:
                 self.report.report_failure("general", f"Failed to create workunit: {e}")
                 LOGGER.exception(
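The Iceberg source now catches pyiceberg's NoSuchTableError separately, so a table that disappears between listing and loading is logged and skipped rather than hitting the generic failure branch. A minimal sketch of that handling; the catalog name and REST URI are placeholders, and the exception ordering follows the diff (NoSuchIcebergTableError before NoSuchTableError):

# Minimal sketch of the new error handling in the Iceberg source: a missing table
# is logged and skipped instead of falling through to the generic failure branch.
# The catalog name and REST URI below are placeholders, not DataHub configuration.
import logging

from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NoSuchIcebergTableError, NoSuchTableError

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger("iceberg-sketch")

catalog = load_catalog("demo", uri="http://localhost:8181")  # placeholder catalog

for dataset_path in [("analytics", "events"), ("analytics", "does_not_exist")]:
    try:
        table = catalog.load_table(dataset_path)
        LOGGER.info("Loaded table %s", table.name())
    except NoSuchIcebergTableError as e:
        LOGGER.warning("%s is not an Iceberg table, skipping it: %s", dataset_path, e)
    except NoSuchTableError as e:
        LOGGER.warning("No such table %s, skipping it: %s", dataset_path, e)
    except Exception:
        LOGGER.exception("Failed to create workunit for %s", dataset_path)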
|