acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/datahub_gc.py

@@ -65,18 +65,18 @@ class DataHubGcSourceConfig(ConfigModel):
         description="Sleep between truncation monitoring.",
     )

-    dataprocess_cleanup: Optional[DataProcessCleanupConfig] = Field(
-        default=None,
+    dataprocess_cleanup: DataProcessCleanupConfig = Field(
+        default_factory=DataProcessCleanupConfig,
         description="Configuration for data process cleanup",
     )

-    soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanupConfig] = Field(
-        default=None,
+    soft_deleted_entities_cleanup: SoftDeletedEntitiesCleanupConfig = Field(
+        default_factory=SoftDeletedEntitiesCleanupConfig,
         description="Configuration for soft deleted entities cleanup",
     )

-    execution_request_cleanup: Optional[DatahubExecutionRequestCleanupConfig] = Field(
-        default=None,
+    execution_request_cleanup: DatahubExecutionRequestCleanupConfig = Field(
+        default_factory=DatahubExecutionRequestCleanupConfig,
         description="Configuration for execution request cleanup",
     )

@@ -108,28 +108,22 @@ class DataHubGcSource(Source):
         self.ctx = ctx
         self.config = config
         self.report = DataHubGcSourceReport()
+        self.report.event_not_produced_warn = False
         self.graph = ctx.require_graph("The DataHubGc source")
-        self.dataprocess_cleanup: Optional[DataProcessCleanup] = None
-        self.soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanup] = None
-        self.execution_request_cleanup: Optional[DatahubExecutionRequestCleanup] = None
-
-        if self.config.dataprocess_cleanup:
-            self.dataprocess_cleanup = DataProcessCleanup(
-                ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
-            )
-        if self.config.soft_deleted_entities_cleanup:
-            self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
-                ctx,
-                self.config.soft_deleted_entities_cleanup,
-                self.report,
-                self.config.dry_run,
-            )
-        if self.config.execution_request_cleanup:
-            self.execution_request_cleanup = DatahubExecutionRequestCleanup(
-                config=self.config.execution_request_cleanup,
-                graph=self.graph,
-                report=self.report,
-            )
+        self.dataprocess_cleanup = DataProcessCleanup(
+            ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
+        )
+        self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
+            ctx,
+            self.config.soft_deleted_entities_cleanup,
+            self.report,
+            self.config.dry_run,
+        )
+        self.execution_request_cleanup = DatahubExecutionRequestCleanup(
+            config=self.config.execution_request_cleanup,
+            graph=self.graph,
+            report=self.report,
+        )

     @classmethod
     def create(cls, config_dict, ctx):
@@ -144,15 +138,32 @@ class DataHubGcSource(Source):
         self,
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
-            self.revoke_expired_tokens()
+            try:
+                self.revoke_expired_tokens()
+            except Exception as e:
+                self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
-            self.truncate_indices()
-        if self.dataprocess_cleanup:
-            yield from self.dataprocess_cleanup.get_workunits_internal()
-        if self.soft_deleted_entities_cleanup:
-            self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
-        if self.execution_request_cleanup:
-            self.execution_request_cleanup.run()
+            try:
+                self.truncate_indices()
+            except Exception as e:
+                self.report.failure("While trying to truncate indices ", exc=e)
+        if self.config.soft_deleted_entities_cleanup.enabled:
+            try:
+                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+            except Exception as e:
+                self.report.failure(
+                    "While trying to cleanup soft deleted entities ", exc=e
+                )
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        if self.config.dataprocess_cleanup.enabled:
+            try:
+                yield from self.dataprocess_cleanup.get_workunits_internal()
+            except Exception as e:
+                self.report.failure("While trying to cleanup data process ", exc=e)
         yield from []

     def truncate_indices(self) -> None:
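The net effect of the datahub_gc.py hunks above: the three cleanup sub-configs are always present (built via default_factory), the cleanup helpers are always constructed, and whether each step actually runs is decided by per-step enabled flags added further down in this diff, with failures reported instead of aborting the whole run. A minimal sketch of that pydantic pattern, using simplified stand-in models rather than the real DataHub classes:

# Simplified stand-ins -- not the actual DataHub classes -- showing the
# default_factory + enabled-flag pattern adopted by DataHubGcSourceConfig above.
from pydantic import BaseModel, Field


class CleanupStepConfig(BaseModel):
    enabled: bool = True
    retention_days: int = 10


class GcConfig(BaseModel):
    # The sub-config is always instantiated, so callers no longer need None checks.
    dataprocess_cleanup: CleanupStepConfig = Field(default_factory=CleanupStepConfig)


cfg = GcConfig()  # omitting the section in a recipe now yields defaults, not None
assert cfg.dataprocess_cleanup.enabled is True

# A step is switched off via its enabled flag rather than by omitting its section.
cfg = GcConfig(dataprocess_cleanup={"enabled": False})
assert cfg.dataprocess_cleanup.enabled is False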
datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -98,6 +98,9 @@ query getDataJobRuns($dataJobUrn: String!, $start: Int!, $count: Int!) {


 class DataProcessCleanupConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True, description="Whether to do data process cleanup."
+    )
     retention_days: Optional[int] = Field(
         10,
         description="Number of days to retain metadata in DataHub",
@@ -114,11 +117,11 @@ class DataProcessCleanupConfig(ConfigModel):
     )

     delete_empty_data_jobs: bool = Field(
-        True, description="Wether to delete Data Jobs without runs"
+        False, description="Whether to delete Data Jobs without runs"
     )

     delete_empty_data_flows: bool = Field(
-        True, description="Wether to delete Data Flows without runs"
+        False, description="Whether to delete Data Flows without runs"
     )

     hard_delete_entities: bool = Field(
@@ -128,7 +131,7 @@ class DataProcessCleanupConfig(ConfigModel):

     batch_size: int = Field(
         500,
-        description="The number of entities to get in a batch from GraphQL",
+        description="The number of entities to get in a batch from API",
     )

     max_workers: int = Field(
@@ -173,9 +176,9 @@ class DataProcessCleanup:
     """
     This source is a maintenance source which cleans up old/unused aspects.

-    Currently it only supports:.
+    Currently it only supports:
         - DataFlow
-        -DataJob
+        - DataJob
         - DataProcessInstance

     """
@@ -207,23 +210,34 @@ class DataProcessCleanup:
         assert self.ctx.graph
         dpis = []
         start = 0
+        # This graphql endpoint doesn't support scrolling and therefore after 10k DPIs it causes performance issues on ES
+        # Therefore, we are limiting the max DPIs to 9000
+        max_item = 9000
         while True:
-            job_query_result = self.ctx.graph.execute_graphql(
-                DATA_PROCESS_INSTANCES_QUERY,
-                {"dataJobUrn": job_urn, "start": start, "count": batch_size},
-            )
-            job_data = job_query_result.get("dataJob")
-            if not job_data:
-                raise ValueError(f"Error getting job {job_urn}")
-
-            runs_data = job_data.get("runs")
-            if not runs_data:
-                raise ValueError(f"Error getting runs for {job_urn}")
-
-            runs = runs_data.get("runs")
-            dpis.extend(runs)
-            start += batch_size
-            if len(runs) < batch_size:
+            try:
+                job_query_result = self.ctx.graph.execute_graphql(
+                    DATA_PROCESS_INSTANCES_QUERY,
+                    {"dataJobUrn": job_urn, "start": start, "count": batch_size},
+                )
+                job_data = job_query_result.get("dataJob")
+                if not job_data:
+                    logger.error(f"Error getting job {job_urn}")
+                    break
+
+                runs_data = job_data.get("runs")
+                if not runs_data:
+                    logger.error(f"Error getting runs for {job_urn}")
+                    break
+
+                runs = runs_data.get("runs")
+                dpis.extend(runs)
+                start += batch_size
+                if len(runs) < batch_size or start >= max_item:
+                    break
+            except Exception as e:
+                self.report.failure(
+                    f"Exception while fetching DPIs for job {job_urn}:", exc=e
+                )
                 break
         return dpis

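fetch_dpis above pages through dataProcessInstance runs with an offset-based GraphQL query, so the rewrite stops as soon as a page comes back short or the 9000-item cap is reached, and turns per-job errors into report entries instead of exceptions. The same pagination pattern in isolation, where fetch_page is a hypothetical stand-in for the GraphQL call rather than DataHub code:

# Standalone sketch of the offset-based pagination pattern used in fetch_dpis above.
from typing import Callable, Dict, List


def fetch_all(
    fetch_page: Callable[[int, int], List[Dict]],
    batch_size: int = 500,
    max_item: int = 9000,  # stop before deep offsets start hurting Elasticsearch
) -> List[Dict]:
    results: List[Dict] = []
    start = 0
    while True:
        try:
            page = fetch_page(start, batch_size)
        except Exception:
            # On error, keep whatever was collected so far instead of failing the run.
            break
        results.extend(page)
        start += batch_size
        # Stop when a short page signals the end, or when the cap is reached.
        if len(page) < batch_size or start >= max_item:
            break
    return results


# Usage with a fake page source of 1200 items:
data = list(range(1200))
pages = fetch_all(lambda start, count: [{"i": i} for i in data[start : start + count]])
assert len(pages) == 1200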
@@ -243,9 +257,14 @@ class DataProcessCleanup:
                futures[future] = dpi

            for future in as_completed(futures):
-                deleted_count_last_n += 1
-                futures[future]["deleted"] = True
-
+                try:
+                    future.result()
+                    deleted_count_last_n += 1
+                    futures[future]["deleted"] = True
+                except Exception as e:
+                    self.report.report_failure(
+                        f"Exception while deleting DPI: {e}", exc=e
+                    )
                if deleted_count_last_n % self.config.batch_size == 0:
                    logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                    if self.config.delay:
@@ -267,7 +286,7 @@ class DataProcessCleanup:

        if self.dry_run:
            logger.info(
-                f"Dry run is on otherwise it would have deleted {urn} with hard deletion is{self.config.hard_delete_entities}"
+                f"Dry run is on otherwise it would have deleted {urn} with hard deletion is {self.config.hard_delete_entities}"
            )
            return

@@ -277,7 +296,12 @@ class DataProcessCleanup:
        assert self.ctx.graph

        dpis = self.fetch_dpis(job.urn, self.config.batch_size)
-        dpis.sort(key=lambda x: x["created"]["time"], reverse=True)
+        dpis.sort(
+            key=lambda x: x["created"]["time"]
+            if x.get("created") and x["created"].get("time")
+            else 0,
+            reverse=True,
+        )

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            if self.config.keep_last_n:
@@ -309,15 +333,23 @@ class DataProcessCleanup:
            if dpi.get("deleted"):
                continue

-            if dpi["created"]["time"] < retention_time * 1000:
+            if (
+                not dpi.get("created")
+                or not dpi["created"].get("time")
+                or dpi["created"]["time"] < retention_time * 1000
+            ):
                future = executor.submit(
                    self.delete_entity, dpi["urn"], "dataprocessInstance"
                )
                futures[future] = dpi

        for future in as_completed(futures):
-            deleted_count_retention += 1
-            futures[future]["deleted"] = True
+            try:
+                future.result()
+                deleted_count_retention += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)

            if deleted_count_retention % self.config.batch_size == 0:
                logger.info(
@@ -328,9 +360,12 @@ class DataProcessCleanup:
                    logger.info(f"Sleeping for {self.config.delay} seconds")
                    time.sleep(self.config.delay)

-        logger.info(
-            f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
-        )
+        if deleted_count_retention > 0:
+            logger.info(
+                f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
+            )
+        else:
+            logger.debug(f"No DPIs to delete from {job.urn} due to retention")

    def get_data_flows(self) -> Iterable[DataFlowEntity]:
        assert self.ctx.graph
@@ -339,17 +374,26 @@ class DataProcessCleanup:
        previous_scroll_id: Optional[str] = None

        while True:
-            result = self.ctx.graph.execute_graphql(
-                DATAFLOW_QUERY,
-                {
-                    "query": "*",
-                    "scrollId": scroll_id if scroll_id else None,
-                    "batchSize": self.config.batch_size,
-                },
-            )
+            result = None
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    DATAFLOW_QUERY,
+                    {
+                        "query": "*",
+                        "scrollId": scroll_id if scroll_id else None,
+                        "batchSize": self.config.batch_size,
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get dataflows with {scroll_id}", exc=e
+                )
+                break
+
            scrollAcrossEntities = result.get("scrollAcrossEntities")
            if not scrollAcrossEntities:
                raise ValueError("Missing scrollAcrossEntities in response")
+            logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")

            scroll_id = scrollAcrossEntities.get("nextScrollId")
            for flow in scrollAcrossEntities.get("searchResults"):
@@ -366,6 +410,8 @@ class DataProcessCleanup:
            previous_scroll_id = scroll_id

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.enabled:
+            return []
        assert self.ctx.graph

        dataFlows: Dict[str, DataFlowEntity] = {}
@@ -373,17 +419,26 @@ class DataProcessCleanup:
            dataFlows[flow.urn] = flow

        scroll_id: Optional[str] = None
+        previous_scroll_id: Optional[str] = None
+
        dataJobs: Dict[str, List[DataJobEntity]] = defaultdict(list)
        deleted_jobs: int = 0
+
        while True:
-            result = self.ctx.graph.execute_graphql(
-                DATAJOB_QUERY,
-                {
-                    "query": "*",
-                    "scrollId": scroll_id if scroll_id else None,
-                    "batchSize": self.config.batch_size,
-                },
-            )
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    DATAJOB_QUERY,
+                    {
+                        "query": "*",
+                        "scrollId": scroll_id if scroll_id else None,
+                        "batchSize": self.config.batch_size,
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get data jobs with {scroll_id}", exc=e
+                )
+                break
            scrollAcrossEntities = result.get("scrollAcrossEntities")
            if not scrollAcrossEntities:
                raise ValueError("Missing scrollAcrossEntities in response")
@@ -404,7 +459,9 @@ class DataProcessCleanup:
                try:
                    self.delete_dpi_from_datajobs(datajob_entity)
                except Exception as e:
-                    logger.error(f"While trying to delete {datajob_entity} got {e}")
+                    self.report.failure(
+                        f"While trying to delete {datajob_entity} ", exc=e
+                    )
                if (
                    datajob_entity.total_runs == 0
                    and self.config.delete_empty_data_jobs
@@ -419,9 +476,11 @@ class DataProcessCleanup:
                else:
                    dataJobs[datajob_entity.flow_urn].append(datajob_entity)

-            if not scroll_id:
+            if not scroll_id or previous_scroll_id == scroll_id:
                break

+            previous_scroll_id = scroll_id
+
        logger.info(f"Deleted {deleted_jobs} DataJobs")
        # Delete empty dataflows if needed
        if self.config.delete_empty_data_flows:
@@ -436,4 +495,5 @@ class DataProcessCleanup:
                if deleted_jobs % self.config.batch_size == 0:
                    logger.info(f"Deleted {deleted_data_flows} DataFlows")
        logger.info(f"Deleted {deleted_data_flows} DataFlows")
+
        return []
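The DataJob scroll loop above now tracks previous_scroll_id and bails out when the server hands back the same cursor twice, so a stuck scroll can no longer spin forever. The guard in isolation, where scroll_page is a hypothetical stand-in for execute_graphql plus the scrollAcrossEntities unpacking:

# Isolated sketch of the repeated-scroll-id guard added to the scroll loops above.
from typing import Callable, Dict, List, Optional, Tuple


def scroll_all(
    scroll_page: Callable[[Optional[str]], Tuple[Optional[str], List[Dict]]],
) -> List[Dict]:
    results: List[Dict] = []
    scroll_id: Optional[str] = None
    previous_scroll_id: Optional[str] = None
    while True:
        scroll_id, items = scroll_page(scroll_id)
        results.extend(items)
        # Stop on a missing scroll id *or* when the cursor stops advancing.
        if not scroll_id or previous_scroll_id == scroll_id:
            break
        previous_scroll_id = scroll_id
    return results


# Usage with a fake two-page source:
pages = {None: ("s1", [{"a": 1}]), "s1": (None, [{"a": 2}])}
assert scroll_all(lambda sid: pages[sid]) == [{"a": 1}, {"a": 2}]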
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -20,6 +20,9 @@ logger = logging.getLogger(__name__)


 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True, description="Whether to do soft deletion cleanup."
+    )
     retention_days: Optional[int] = Field(
         10,
         description="Number of days to retain metadata in DataHub",
@@ -60,7 +63,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         description="Query to filter entities",
     )
     limit_entities_delete: Optional[int] = Field(
-        10000, description="Max number of entities to delete."
+        25000, description="Max number of entities to delete."
     )

     runtime_limit_seconds: Optional[int] = Field(
@@ -104,7 +107,7 @@ class SoftDeletedEntitiesCleanup:
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.create_from_string(urn)
+        entity_urn = Urn.from_string(urn)
         self.report.num_soft_deleted_entity_removed += 1
         self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
             self.report.num_soft_deleted_entity_removed_by_type.get(
@@ -156,6 +159,8 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)

     def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
         assert self.ctx.graph
         start_time = time.time()

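delete_entity above switches from the older Urn.create_from_string spelling to Urn.from_string; both parse a urn string and expose entity_type, which the per-type counters rely on. A small hedged usage example, with a placeholder dataset urn:

# Hedged example of the Urn.from_string call used in delete_entity above;
# the urn value below is a placeholder.
from datahub.utilities.urns.urn import Urn

entity_urn = Urn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)"
)
print(entity_urn.entity_type)  # -> "dataset"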
datahub/ingestion/source/ge_data_profiler.py

@@ -57,7 +57,11 @@ from datahub.ingestion.source.profiling.common import (
     convert_to_cardinality,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.metadata.com.linkedin.pegasus2avro.schema import EditableSchemaMetadata
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    EditableSchemaMetadata,
+    NumberType,
+)
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -361,6 +365,8 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     platform: str
     env: str

+    column_types: Dict[str, str] = dataclasses.field(default_factory=dict)
+
     def _get_columns_to_profile(self) -> List[str]:
         if not self.config.any_field_level_metrics_enabled():
             return []
@@ -374,6 +380,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):

         for col_dict in self.dataset.columns:
             col = col_dict["name"]
+            self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
             if not self.config._allow_deny_patterns.allowed(
                 f"{self.dataset_name}.{col}"
@@ -430,6 +437,21 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             self.dataset, column
         )

+        if column_spec.type_ == ProfilerDataType.UNKNOWN:
+            try:
+                datahub_field_type = resolve_sql_type(
+                    self.column_types[column], self.dataset.engine.dialect.name.lower()
+                )
+            except Exception as e:
+                logger.debug(
+                    f"Error resolving sql type {self.column_types[column]}: {e}"
+                )
+                datahub_field_type = None
+            if datahub_field_type is None:
+                return
+            if isinstance(datahub_field_type, NumberType):
+                column_spec.type_ = ProfilerDataType.NUMERIC
+
     @_run_with_query_combiner
     def _get_column_cardinality(
         self, column_spec: _SingleColumnSpec, column: str
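The profiler change above keeps the raw SQL type string for every column and, when GE classifies a column as UNKNOWN, asks resolve_sql_type to re-resolve it, upgrading the column to NUMERIC only when a NumberType comes back. A self-contained sketch of that fallback idea, with a toy type table standing in for the real resolve_sql_type:

# Self-contained sketch of the UNKNOWN-type fallback added above; the real code
# calls datahub's resolve_sql_type() instead of this toy lookup table.
from enum import Enum
from typing import Optional


class ProfilerDataType(Enum):
    UNKNOWN = "unknown"
    NUMERIC = "numeric"


NUMERIC_SQL_TYPES = {"int", "integer", "bigint", "decimal", "numeric", "double", "float"}


def resolve_numeric_fallback(current: ProfilerDataType, raw_sql_type: str) -> ProfilerDataType:
    """Upgrade UNKNOWN to NUMERIC when the raw column type looks numeric."""
    if current is not ProfilerDataType.UNKNOWN:
        return current
    base_type: Optional[str] = raw_sql_type.split("(")[0].strip().lower() or None
    if base_type in NUMERIC_SQL_TYPES:
        return ProfilerDataType.NUMERIC
    return current


assert resolve_numeric_fallback(ProfilerDataType.UNKNOWN, "DECIMAL(10,2)") is ProfilerDataType.NUMERIC
assert resolve_numeric_fallback(ProfilerDataType.UNKNOWN, "VARIANT") is ProfilerDataType.UNKNOWN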
datahub/ingestion/source/iceberg/iceberg.py

@@ -9,6 +9,7 @@ from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
     NoSuchNamespaceError,
     NoSuchPropertyException,
+    NoSuchTableError,
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
@@ -104,7 +105,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default.")
 @capability(
     SourceCapability.OWNERSHIP,
-    "Optionally enabled via configuration by specifying which Iceberg table property holds user or group ownership.",
+    "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class IcebergSource(StatefulIngestionSourceBase):
@@ -192,9 +193,7 @@ class IcebergSource(StatefulIngestionSourceBase):
                 table = thread_local.local_catalog.load_table(dataset_path)
                 time_taken = timer.elapsed_seconds()
                 self.report.report_table_load_time(time_taken)
-                LOGGER.debug(
-                    f"Loaded table: {table.identifier}, time taken: {time_taken}"
-                )
+                LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
             yield from self._create_iceberg_workunit(dataset_name, table)
         except NoSuchPropertyException as e:
             self.report.report_warning(
@@ -206,12 +205,20 @@ class IcebergSource(StatefulIngestionSourceBase):
             )
         except NoSuchIcebergTableError as e:
             self.report.report_warning(
-                "no-iceberg-table",
+                "not-an-iceberg-table",
                 f"Failed to create workunit for {dataset_name}. {e}",
             )
             LOGGER.warning(
                 f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
             )
+        except NoSuchTableError as e:
+            self.report.report_warning(
+                "no-such-table",
+                f"Failed to create workunit for {dataset_name}. {e}",
+            )
+            LOGGER.warning(
+                f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+            )
         except Exception as e:
             self.report.report_failure("general", f"Failed to create workunit: {e}")
             LOGGER.exception(
datahub/ingestion/source/kafka/kafka.py

@@ -141,6 +141,10 @@ class KafkaSourceConfig(
         default=False,
         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
     )
+    ingest_schemas_as_entities: bool = pydantic.Field(
+        default=False,
+        description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+    )


 def get_kafka_consumer(
@@ -148,7 +152,7 @@ def get_kafka_consumer(
 ) -> confluent_kafka.Consumer:
     consumer = confluent_kafka.Consumer(
         {
-            "group.id": "test",
+            "group.id": "datahub-kafka-ingestion",
             "bootstrap.servers": connection.bootstrap,
             **connection.consumer_config,
         }
@@ -164,6 +168,25 @@ def get_kafka_consumer(
     return consumer


+def get_kafka_admin_client(
+    connection: KafkaConsumerConnectionConfig,
+) -> AdminClient:
+    client = AdminClient(
+        {
+            "group.id": "datahub-kafka-ingestion",
+            "bootstrap.servers": connection.bootstrap,
+            **connection.consumer_config,
+        }
+    )
+    if CallableConsumerConfig.is_callable_config(connection.consumer_config):
+        # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
+        # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+        logger.debug("Initiating polling for kafka admin client")
+        client.poll(timeout=30)
+        logger.debug("Initiated polling for kafka admin client")
+    return client
+
+
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
@@ -278,13 +301,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
     def init_kafka_admin_client(self) -> None:
         try:
             # TODO: Do we require separate config than existing consumer_config ?
-            self.admin_client = AdminClient(
-                {
-                    "group.id": "test",
-                    "bootstrap.servers": self.source_config.connection.bootstrap,
-                    **self.source_config.connection.consumer_config,
-                }
-            )
+            self.admin_client = get_kafka_admin_client(self.source_config.connection)
         except Exception as e:
             logger.debug(e, exc_info=e)
             self.report.report_warning(
@@ -330,17 +347,20 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 self.report.report_dropped(topic)

-        # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
-        for subject in self.schema_registry_client.get_subjects():
-            try:
-                yield from self._extract_record(
-                    subject, True, topic_detail=None, extra_topic_config=None
-                )
-            except Exception as e:
-                logger.warning(f"Failed to extract subject {subject}", exc_info=True)
-                self.report.report_warning(
-                    "subject", f"Exception while extracting topic {subject}: {e}"
-                )
+        if self.source_config.ingest_schemas_as_entities:
+            # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
+            for subject in self.schema_registry_client.get_subjects():
+                try:
+                    yield from self._extract_record(
+                        subject, True, topic_detail=None, extra_topic_config=None
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to extract subject {subject}", exc_info=True
+                    )
+                    self.report.report_warning(
+                        "subject", f"Exception while extracting topic {subject}: {e}"
+                    )

     def _extract_record(
         self,
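With the kafka.py changes above, schema-registry subjects are only emitted as their own dataset entities when the new ingest_schemas_as_entities flag is turned on, and both the consumer and the shared admin-client helper use the datahub-kafka-ingestion group id instead of "test". A hedged recipe sketch showing how the flag might be enabled through the programmatic Pipeline API; the broker, registry, and GMS endpoints below are placeholders:

# Hedged sketch: a minimal kafka ingestion recipe enabling the new
# ingest_schemas_as_entities flag. All URLs are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "kafka",
            "config": {
                "connection": {
                    "bootstrap": "localhost:9092",
                    "schema_registry_url": "http://localhost:8081",
                },
                # New in this release: also ingest registry subjects as entities.
                "ingest_schemas_as_entities": True,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()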