acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/gc/datahub_gc.py

@@ -65,18 +65,18 @@ class DataHubGcSourceConfig(ConfigModel):
         description="Sleep between truncation monitoring.",
     )
 
-    dataprocess_cleanup: Optional[DataProcessCleanupConfig] = Field(
-        default=None,
+    dataprocess_cleanup: DataProcessCleanupConfig = Field(
+        default_factory=DataProcessCleanupConfig,
         description="Configuration for data process cleanup",
     )
 
-    soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanupConfig] = Field(
-        default=None,
+    soft_deleted_entities_cleanup: SoftDeletedEntitiesCleanupConfig = Field(
+        default_factory=SoftDeletedEntitiesCleanupConfig,
         description="Configuration for soft deleted entities cleanup",
     )
 
-    execution_request_cleanup: Optional[DatahubExecutionRequestCleanupConfig] = Field(
-        default=None,
+    execution_request_cleanup: DatahubExecutionRequestCleanupConfig = Field(
+        default_factory=DatahubExecutionRequestCleanupConfig,
         description="Configuration for execution request cleanup",
     )
 
@@ -108,28 +108,22 @@ class DataHubGcSource(Source):
         self.ctx = ctx
         self.config = config
         self.report = DataHubGcSourceReport()
+        self.report.event_not_produced_warn = False
         self.graph = ctx.require_graph("The DataHubGc source")
-        self.dataprocess_cleanup: Optional[DataProcessCleanup] = None
-        self.soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanup] = None
-        self.execution_request_cleanup: Optional[DatahubExecutionRequestCleanup] = None
-
-        if self.config.dataprocess_cleanup:
-            self.dataprocess_cleanup = DataProcessCleanup(
-                ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
-            )
-        if self.config.soft_deleted_entities_cleanup:
-            self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
-                ctx,
-                self.config.soft_deleted_entities_cleanup,
-                self.report,
-                self.config.dry_run,
-            )
-        if self.config.execution_request_cleanup:
-            self.execution_request_cleanup = DatahubExecutionRequestCleanup(
-                config=self.config.execution_request_cleanup,
-                graph=self.graph,
-                report=self.report,
-            )
+        self.dataprocess_cleanup = DataProcessCleanup(
+            ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
+        )
+        self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
+            ctx,
+            self.config.soft_deleted_entities_cleanup,
+            self.report,
+            self.config.dry_run,
+        )
+        self.execution_request_cleanup = DatahubExecutionRequestCleanup(
+            config=self.config.execution_request_cleanup,
+            graph=self.graph,
+            report=self.report,
+        )
 
     @classmethod
     def create(cls, config_dict, ctx):
@@ -144,15 +138,32 @@ class DataHubGcSource(Source):
         self,
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
-            self.revoke_expired_tokens()
+            try:
+                self.revoke_expired_tokens()
+            except Exception as e:
+                self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
-            self.truncate_indices()
-        if self.dataprocess_cleanup:
-            yield from self.dataprocess_cleanup.get_workunits_internal()
-        if self.soft_deleted_entities_cleanup:
-            self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
-        if self.execution_request_cleanup:
-            self.execution_request_cleanup.run()
+            try:
+                self.truncate_indices()
+            except Exception as e:
+                self.report.failure("While trying to truncate indices ", exc=e)
+        if self.config.soft_deleted_entities_cleanup.enabled:
+            try:
+                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+            except Exception as e:
+                self.report.failure(
+                    "While trying to cleanup soft deleted entities ", exc=e
+                )
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        if self.config.dataprocess_cleanup.enabled:
+            try:
+                yield from self.dataprocess_cleanup.get_workunits_internal()
+            except Exception as e:
+                self.report.failure("While trying to cleanup data process ", exc=e)
         yield from []
 
     def truncate_indices(self) -> None:
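
For reference, a minimal sketch of the configuration pattern these hunks move to: sub-configs built via default_factory are always present, and each cleanup is gated by an explicit enabled flag instead of an Optional check. The CleanupConfig/GcConfig names below are invented, plain pydantic stand-ins, not the DataHub classes themselves.

from pydantic import BaseModel, Field


class CleanupConfig(BaseModel):
    # The sub-config always exists; an explicit flag gates the work instead of None.
    enabled: bool = Field(default=True, description="Whether to run this cleanup.")
    batch_size: int = 500


class GcConfig(BaseModel):
    # default_factory materializes a CleanupConfig even when the section is omitted,
    # so callers can construct helpers unconditionally and check .enabled at run time.
    cleanup: CleanupConfig = Field(default_factory=CleanupConfig)


config = GcConfig()            # no "cleanup" section supplied
assert config.cleanup.enabled  # defaults exist rather than being None
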
datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -98,6 +98,9 @@ query getDataJobRuns($dataJobUrn: String!, $start: Int!, $count: Int!) {
 
 
 class DataProcessCleanupConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True, description="Whether to do data process cleanup."
+    )
     retention_days: Optional[int] = Field(
         10,
         description="Number of days to retain metadata in DataHub",
@@ -114,11 +117,11 @@ class DataProcessCleanupConfig(ConfigModel):
     )
 
     delete_empty_data_jobs: bool = Field(
-        True, description="Wether to delete Data Jobs without runs"
+        False, description="Whether to delete Data Jobs without runs"
     )
 
     delete_empty_data_flows: bool = Field(
-        True, description="Wether to delete Data Flows without runs"
+        False, description="Whether to delete Data Flows without runs"
     )
 
     hard_delete_entities: bool = Field(
@@ -128,7 +131,7 @@ class DataProcessCleanupConfig(ConfigModel):
 
     batch_size: int = Field(
         500,
-        description="The number of entities to get in a batch from GraphQL",
+        description="The number of entities to get in a batch from API",
     )
 
     max_workers: int = Field(
@@ -173,9 +176,9 @@ class DataProcessCleanup:
     """
     This source is a maintenance source which cleans up old/unused aspects.
 
-    Currently it only supports:.
+    Currently it only supports:
     - DataFlow
-    -DataJob
+    - DataJob
     - DataProcessInstance
 
     """
@@ -207,23 +210,34 @@ class DataProcessCleanup:
         assert self.ctx.graph
         dpis = []
         start = 0
+        # This graphql endpoint doesn't support scrolling and therefore after 10k DPIs it causes performance issues on ES
+        # Therefore, we are limiting the max DPIs to 9000
+        max_item = 9000
         while True:
-            job_query_result = self.ctx.graph.execute_graphql(
-                DATA_PROCESS_INSTANCES_QUERY,
-                {"dataJobUrn": job_urn, "start": start, "count": batch_size},
-            )
-            job_data = job_query_result.get("dataJob")
-            if not job_data:
-                raise ValueError(f"Error getting job {job_urn}")
-
-            runs_data = job_data.get("runs")
-            if not runs_data:
-                raise ValueError(f"Error getting runs for {job_urn}")
-
-            runs = runs_data.get("runs")
-            dpis.extend(runs)
-            start += batch_size
-            if len(runs) < batch_size:
+            try:
+                job_query_result = self.ctx.graph.execute_graphql(
+                    DATA_PROCESS_INSTANCES_QUERY,
+                    {"dataJobUrn": job_urn, "start": start, "count": batch_size},
+                )
+                job_data = job_query_result.get("dataJob")
+                if not job_data:
+                    logger.error(f"Error getting job {job_urn}")
+                    break
+
+                runs_data = job_data.get("runs")
+                if not runs_data:
+                    logger.error(f"Error getting runs for {job_urn}")
+                    break
+
+                runs = runs_data.get("runs")
+                dpis.extend(runs)
+                start += batch_size
+                if len(runs) < batch_size or start >= max_item:
+                    break
+            except Exception as e:
+                self.report.failure(
+                    f"Exception while fetching DPIs for job {job_urn}:", exc=e
+                )
                 break
         return dpis
 
@@ -243,9 +257,14 @@ class DataProcessCleanup:
                 futures[future] = dpi
 
         for future in as_completed(futures):
-            deleted_count_last_n += 1
-            futures[future]["deleted"] = True
-
+            try:
+                future.result()
+                deleted_count_last_n += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                self.report.report_failure(
+                    f"Exception while deleting DPI: {e}", exc=e
+                )
             if deleted_count_last_n % self.config.batch_size == 0:
                 logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                 if self.config.delay:
@@ -267,7 +286,7 @@ class DataProcessCleanup:
 
         if self.dry_run:
             logger.info(
-                f"Dry run is on otherwise it would have deleted {urn} with hard deletion is{self.config.hard_delete_entities}"
+                f"Dry run is on otherwise it would have deleted {urn} with hard deletion is {self.config.hard_delete_entities}"
             )
             return
 
@@ -277,7 +296,12 @@ class DataProcessCleanup:
         assert self.ctx.graph
 
         dpis = self.fetch_dpis(job.urn, self.config.batch_size)
-        dpis.sort(key=lambda x: x["created"]["time"], reverse=True)
+        dpis.sort(
+            key=lambda x: x["created"]["time"]
+            if x.get("created") and x["created"].get("time")
+            else 0,
+            reverse=True,
+        )
 
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             if self.config.keep_last_n:
@@ -309,15 +333,23 @@ class DataProcessCleanup:
            if dpi.get("deleted"):
                continue
 
-            if dpi["created"]["time"] < retention_time * 1000:
+            if (
+                not dpi.get("created")
+                or not dpi["created"].get("time")
+                or dpi["created"]["time"] < retention_time * 1000
+            ):
                 future = executor.submit(
                     self.delete_entity, dpi["urn"], "dataprocessInstance"
                 )
                 futures[future] = dpi
 
         for future in as_completed(futures):
-            deleted_count_retention += 1
-            futures[future]["deleted"] = True
+            try:
+                future.result()
+                deleted_count_retention += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)
 
             if deleted_count_retention % self.config.batch_size == 0:
                 logger.info(
@@ -328,9 +360,12 @@ class DataProcessCleanup:
                     logger.info(f"Sleeping for {self.config.delay} seconds")
                     time.sleep(self.config.delay)
 
-        logger.info(
-            f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
-        )
+        if deleted_count_retention > 0:
+            logger.info(
+                f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
+            )
+        else:
+            logger.debug(f"No DPIs to delete from {job.urn} due to retention")
 
     def get_data_flows(self) -> Iterable[DataFlowEntity]:
         assert self.ctx.graph
@@ -339,17 +374,26 @@ class DataProcessCleanup:
         previous_scroll_id: Optional[str] = None
 
         while True:
-            result = self.ctx.graph.execute_graphql(
-                DATAFLOW_QUERY,
-                {
-                    "query": "*",
-                    "scrollId": scroll_id if scroll_id else None,
-                    "batchSize": self.config.batch_size,
-                },
-            )
+            result = None
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    DATAFLOW_QUERY,
+                    {
+                        "query": "*",
+                        "scrollId": scroll_id if scroll_id else None,
+                        "batchSize": self.config.batch_size,
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get dataflows with {scroll_id}", exc=e
+                )
+                break
+
             scrollAcrossEntities = result.get("scrollAcrossEntities")
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
+            logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")
 
             scroll_id = scrollAcrossEntities.get("nextScrollId")
             for flow in scrollAcrossEntities.get("searchResults"):
@@ -366,6 +410,8 @@ class DataProcessCleanup:
             previous_scroll_id = scroll_id
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.enabled:
+            return []
         assert self.ctx.graph
 
         dataFlows: Dict[str, DataFlowEntity] = {}
@@ -373,17 +419,26 @@ class DataProcessCleanup:
             dataFlows[flow.urn] = flow
 
         scroll_id: Optional[str] = None
+        previous_scroll_id: Optional[str] = None
+
         dataJobs: Dict[str, List[DataJobEntity]] = defaultdict(list)
         deleted_jobs: int = 0
+
         while True:
-            result = self.ctx.graph.execute_graphql(
-                DATAJOB_QUERY,
-                {
-                    "query": "*",
-                    "scrollId": scroll_id if scroll_id else None,
-                    "batchSize": self.config.batch_size,
-                },
-            )
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    DATAJOB_QUERY,
+                    {
+                        "query": "*",
+                        "scrollId": scroll_id if scroll_id else None,
+                        "batchSize": self.config.batch_size,
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get data jobs with {scroll_id}", exc=e
+                )
+                break
             scrollAcrossEntities = result.get("scrollAcrossEntities")
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
@@ -404,7 +459,9 @@ class DataProcessCleanup:
                 try:
                     self.delete_dpi_from_datajobs(datajob_entity)
                 except Exception as e:
-                    logger.error(f"While trying to delete {datajob_entity} got {e}")
+                    self.report.failure(
+                        f"While trying to delete {datajob_entity} ", exc=e
+                    )
                 if (
                     datajob_entity.total_runs == 0
                     and self.config.delete_empty_data_jobs
@@ -419,9 +476,11 @@ class DataProcessCleanup:
                 else:
                     dataJobs[datajob_entity.flow_urn].append(datajob_entity)
 
-            if not scroll_id:
+            if not scroll_id or previous_scroll_id == scroll_id:
                 break
 
+            previous_scroll_id = scroll_id
+
         logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
        if self.config.delete_empty_data_flows:
@@ -436,4 +495,5 @@ class DataProcessCleanup:
             if deleted_jobs % self.config.batch_size == 0:
                 logger.info(f"Deleted {deleted_data_flows} DataFlows")
         logger.info(f"Deleted {deleted_data_flows} DataFlows")
+
         return []
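
For reference, a standalone sketch of the None-safe ordering introduced in the dpis.sort hunk above; the sample DPI records are invented, and entries without a created.time value sort as 0 (oldest) instead of raising KeyError.

# Invented sample DPI records; the last two lack "created" / "created.time".
dpis = [
    {"urn": "urn:li:dataProcessInstance:a", "created": {"time": 1700000000000}},
    {"urn": "urn:li:dataProcessInstance:b", "created": {"time": 1690000000000}},
    {"urn": "urn:li:dataProcessInstance:c"},
    {"urn": "urn:li:dataProcessInstance:d", "created": {}},
]

# Same key as the hunk: undated entries sort as 0 instead of raising KeyError.
dpis.sort(
    key=lambda x: x["created"]["time"]
    if x.get("created") and x["created"].get("time")
    else 0,
    reverse=True,
)

print([d["urn"][-1] for d in dpis])  # ['a', 'b', 'c', 'd'] -- newest first, undated last
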
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -20,6 +20,9 @@ logger = logging.getLogger(__name__)
 
 
 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True, description="Whether to do soft deletion cleanup."
+    )
     retention_days: Optional[int] = Field(
         10,
         description="Number of days to retain metadata in DataHub",
@@ -60,7 +63,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         description="Query to filter entities",
     )
     limit_entities_delete: Optional[int] = Field(
-        10000, description="Max number of entities to delete."
+        25000, description="Max number of entities to delete."
     )
 
     runtime_limit_seconds: Optional[int] = Field(
@@ -104,7 +107,7 @@ class SoftDeletedEntitiesCleanup:
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
-        entity_urn = Urn.create_from_string(urn)
+        entity_urn = Urn.from_string(urn)
         self.report.num_soft_deleted_entity_removed += 1
         self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
             self.report.num_soft_deleted_entity_removed_by_type.get(
@@ -156,6 +159,8 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
 
     def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
         assert self.ctx.graph
         start_time = time.time()
 
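
For reference, a small sketch of the per-entity-type tally that delete_entity keeps, using the non-deprecated Urn.from_string accessor this diff switches to; the import path and sample URNs are assumptions, not taken from the diff.

from collections import Counter

from datahub.utilities.urns.urn import Urn  # assumed import path

removed_by_type: Counter = Counter()
for urn in [
    "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
    "urn:li:dashboard:(looker,dashboards.1)",
]:
    # entity_type is parsed out of the URN, e.g. "dataset" or "dashboard".
    removed_by_type[Urn.from_string(urn).entity_type] += 1

print(dict(removed_by_type))  # {'dataset': 1, 'dashboard': 1}
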
datahub/ingestion/source/ge_data_profiler.py

@@ -7,6 +7,7 @@ import dataclasses
 import functools
 import json
 import logging
+import re
 import threading
 import traceback
 import unittest.mock
@@ -56,7 +57,11 @@ from datahub.ingestion.source.profiling.common import (
     convert_to_cardinality,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.metadata.com.linkedin.pegasus2avro.schema import EditableSchemaMetadata
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    EditableSchemaMetadata,
+    NumberType,
+)
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -123,6 +128,8 @@ ProfilerTypeMapping.BINARY_TYPE_NAMES.append("LargeBinary")
 
 _datasource_connection_injection_lock = threading.Lock()
 
+NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")
+
 
 @contextlib.contextmanager
 def _inject_connection_into_datasource(conn: Connection) -> Iterator[None]:
@@ -165,11 +172,9 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
         return convert_to_json_serializable(element_values.fetchone()[0])
     elif self.engine.dialect.name.lower() == BIGQUERY:
         element_values = self.engine.execute(
-            sa.select(
-                [
-                    sa.func.coalesce(sa.text(f"APPROX_COUNT_DISTINCT(`{column}`)")),
-                ]
-            ).select_from(self._table)
+            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
+                self._table
+            )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
     elif self.engine.dialect.name.lower() == SNOWFLAKE:
@@ -360,6 +365,8 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     platform: str
     env: str
 
+    column_types: Dict[str, str] = dataclasses.field(default_factory=dict)
+
     def _get_columns_to_profile(self) -> List[str]:
         if not self.config.any_field_level_metrics_enabled():
             return []
@@ -373,11 +380,15 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 
         for col_dict in self.dataset.columns:
             col = col_dict["name"]
+            self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
             if not self.config._allow_deny_patterns.allowed(
                 f"{self.dataset_name}.{col}"
             ):
                 ignored_columns_by_pattern.append(col)
+            # We try to ignore nested columns as well
+            elif not self.config.profile_nested_fields and "." in col:
+                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -407,9 +418,18 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         return columns_to_profile
 
     def _should_ignore_column(self, sqlalchemy_type: sa.types.TypeEngine) -> bool:
-        return str(sqlalchemy_type) in _get_column_types_to_ignore(
-            self.dataset.engine.dialect.name
-        )
+        # We don't profiles columns with None types
+        if str(sqlalchemy_type) == "NULL":
+            return True
+
+        sql_type = str(sqlalchemy_type)
+
+        match = re.match(NORMALIZE_TYPE_PATTERN, sql_type)
+
+        if match:
+            sql_type = match.group(1)
+
+        return sql_type in _get_column_types_to_ignore(self.dataset.engine.dialect.name)
 
     @_run_with_query_combiner
     def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
@@ -417,6 +437,21 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             self.dataset, column
         )
 
+        if column_spec.type_ == ProfilerDataType.UNKNOWN:
+            try:
+                datahub_field_type = resolve_sql_type(
+                    self.column_types[column], self.dataset.engine.dialect.name.lower()
+                )
+            except Exception as e:
+                logger.debug(
+                    f"Error resolving sql type {self.column_types[column]}: {e}"
+                )
+                datahub_field_type = None
+            if datahub_field_type is None:
+                return
+            if isinstance(datahub_field_type, NumberType):
+                column_spec.type_ = ProfilerDataType.NUMERIC
+
     @_run_with_query_combiner
     def _get_column_cardinality(
         self, column_spec: _SingleColumnSpec, column: str
@@ -1397,6 +1432,8 @@ class DatahubGEProfiler:
 def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
     if dialect_name.lower() == POSTGRESQL:
         return ["JSON"]
+    elif dialect_name.lower() == BIGQUERY:
+        return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]
 
     return []
 
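
For reference, a standalone illustration of the NORMALIZE_TYPE_PATTERN regex added above: it trims the parameter or element portion of a SQL type string so parameterized BigQuery types compare against the new ignore list (ARRAY, STRUCT, GEOGRAPHY, JSON) by their base name. The sample type strings are invented.

import re

NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")

# Invented sample type strings covering <...>, (...) and unparameterized forms.
for sql_type in ["ARRAY<STRING>", "STRUCT<a INT64, b STRING>", "NUMERIC(38, 9)", "JSON"]:
    match = NORMALIZE_TYPE_PATTERN.match(sql_type)
    base_type = match.group(1) if match else sql_type
    print(f"{sql_type!r} -> {base_type!r}")
# 'ARRAY<STRING>' -> 'ARRAY'
# 'STRUCT<a INT64, b STRING>' -> 'STRUCT'
# 'NUMERIC(38, 9)' -> 'NUMERIC'
# 'JSON' -> 'JSON'
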
datahub/ingestion/source/ge_profiling_config.py

@@ -188,6 +188,11 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         ),
     )
 
+    profile_nested_fields: bool = Field(
+        default=False,
+        description="Whether to profile complex types like structs, arrays and maps. ",
+    )
+
     @pydantic.root_validator(pre=True)
     def deprecate_bigquery_temp_table_schema(cls, values):
         # TODO: Update docs to remove mention of this field.

datahub/ingestion/source/iceberg/iceberg.py

@@ -9,6 +9,7 @@ from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
     NoSuchNamespaceError,
     NoSuchPropertyException,
+    NoSuchTableError,
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
@@ -104,7 +105,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default.")
 @capability(
     SourceCapability.OWNERSHIP,
-    "Optionally enabled via configuration by specifying which Iceberg table property holds user or group ownership.",
+    "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class IcebergSource(StatefulIngestionSourceBase):
@@ -192,9 +193,7 @@ class IcebergSource(StatefulIngestionSourceBase):
                 table = thread_local.local_catalog.load_table(dataset_path)
                 time_taken = timer.elapsed_seconds()
                 self.report.report_table_load_time(time_taken)
-                LOGGER.debug(
-                    f"Loaded table: {table.identifier}, time taken: {time_taken}"
-                )
+                LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
                 yield from self._create_iceberg_workunit(dataset_name, table)
             except NoSuchPropertyException as e:
                 self.report.report_warning(
@@ -206,12 +205,20 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
             except NoSuchIcebergTableError as e:
                 self.report.report_warning(
-                    "no-iceberg-table",
+                    "not-an-iceberg-table",
                     f"Failed to create workunit for {dataset_name}. {e}",
                 )
                 LOGGER.warning(
                     f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
                 )
+            except NoSuchTableError as e:
+                self.report.report_warning(
+                    "no-such-table",
+                    f"Failed to create workunit for {dataset_name}. {e}",
+                )
+                LOGGER.warning(
+                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                )
             except Exception as e:
                 self.report.report_failure("general", f"Failed to create workunit: {e}")
                 LOGGER.exception(
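
For reference, a hedged sketch of the exception ordering the last hunk establishes: pyiceberg's NoSuchTableError gets its own handler so a listed path that no longer resolves to a table is reported and skipped rather than falling through to the generic failure branch. The load_table_or_skip helper and its catalog/report arguments are invented stand-ins, not the source's actual structure.

from pyiceberg.exceptions import NoSuchIcebergTableError, NoSuchTableError


def load_table_or_skip(catalog, dataset_path, report):
    """Return the loaded table, or None after reporting a warning."""
    try:
        return catalog.load_table(dataset_path)
    except NoSuchIcebergTableError as e:
        # Path resolves, but not to Iceberg metadata (assumed interpretation).
        report.report_warning("not-an-iceberg-table", f"{dataset_path}: {e}")
    except NoSuchTableError as e:
        # Listed path no longer resolves to a table at all (assumed interpretation).
        report.report_warning("no-such-table", f"{dataset_path}: {e}")
    return None
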