acryl-datahub 0.15.0.1rc11__py3-none-any.whl → 0.15.0.1rc13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (40)
  1. {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/METADATA +2320 -2324
  2. {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/RECORD +40 -39
  3. datahub/__init__.py +1 -1
  4. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  5. datahub/configuration/common.py +2 -5
  6. datahub/emitter/mce_builder.py +17 -1
  7. datahub/emitter/mcp_builder.py +2 -7
  8. datahub/emitter/mcp_patch_builder.py +2 -2
  9. datahub/emitter/rest_emitter.py +2 -2
  10. datahub/ingestion/api/closeable.py +3 -3
  11. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  12. datahub/ingestion/api/report.py +4 -1
  13. datahub/ingestion/api/sink.py +4 -3
  14. datahub/ingestion/api/source_helpers.py +2 -6
  15. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  16. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  17. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  18. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  19. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +159 -71
  20. datahub/ingestion/source/s3/source.py +1 -1
  21. datahub/ingestion/source/sql/hive.py +15 -0
  22. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  23. datahub/ingestion/source/sql/mssql/source.py +1 -1
  24. datahub/ingestion/source/sql/sql_common.py +41 -102
  25. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  26. datahub/ingestion/source/sql/sql_report.py +2 -0
  27. datahub/ingestion/source/state/checkpoint.py +2 -1
  28. datahub/ingestion/source/tableau/tableau.py +1 -4
  29. datahub/ingestion/source/unity/proxy.py +8 -27
  30. datahub/metadata/_schema_classes.py +61 -1
  31. datahub/metadata/_urns/urn_defs.py +168 -168
  32. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  33. datahub/metadata/schema.avsc +64 -29
  34. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  35. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  36. datahub/utilities/time.py +8 -3
  37. datahub/utilities/urns/_urn_base.py +5 -7
  38. {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/entry_points.txt +0 -0
  40. {acryl_datahub-0.15.0.1rc11.dist-info → acryl_datahub-0.15.0.1rc13.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
+from datetime import datetime
 from functools import lru_cache
 from typing import Any, Dict, FrozenSet, Iterable, Iterator, List, Optional

@@ -15,6 +15,7 @@ from google.cloud.bigquery.table import (
     TimePartitioningType,
 )

+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels
@@ -393,13 +394,7 @@ class BigQuerySchemaApi:
                 name=table.table_name,
                 created=table.created,
                 table_type=table.table_type,
-                last_altered=(
-                    datetime.fromtimestamp(
-                        table.get("last_altered") / 1000, tz=timezone.utc
-                    )
-                    if table.get("last_altered") is not None
-                    else None
-                ),
+                last_altered=parse_ts_millis(table.get("last_altered")),
                 size_in_bytes=table.get("bytes"),
                 rows_count=table.get("row_count"),
                 comment=table.comment,
@@ -460,11 +455,7 @@ class BigQuerySchemaApi:
         return BigqueryView(
             name=view.table_name,
             created=view.created,
-            last_altered=(
-                datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc)
-                if view.get("last_altered") is not None
-                else None
-            ),
+            last_altered=(parse_ts_millis(view.get("last_altered"))),
             comment=view.comment,
             view_definition=view.view_definition,
             materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,
@@ -705,13 +696,7 @@ class BigQuerySchemaApi:
         return BigqueryTableSnapshot(
             name=snapshot.table_name,
             created=snapshot.created,
-            last_altered=(
-                datetime.fromtimestamp(
-                    snapshot.get("last_altered") / 1000, tz=timezone.utc
-                )
-                if snapshot.get("last_altered") is not None
-                else None
-            ),
+            last_altered=parse_ts_millis(snapshot.get("last_altered")),
             comment=snapshot.comment,
             ddl=snapshot.ddl,
             snapshot_time=snapshot.snapshot_time,

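The three hunks above replace a repeated inline "epoch millis to UTC datetime, or None" conversion with the parse_ts_millis helper that this release adds to datahub.emitter.mce_builder (the +17 -1 change in the file list). A minimal sketch of what such a helper plausibly looks like, inferred from the inline code it replaces rather than copied from mce_builder:

from datetime import datetime, timezone
from typing import Optional


def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
    # Mirrors the removed inline expression: None passes through,
    # anything else is treated as epoch milliseconds in UTC.
    if ts is None:
        return None
    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)


print(parse_ts_millis(1700000000000))  # 2023-11-14 22:13:20+00:00
print(parse_ts_millis(None))           # None
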
datahub/ingestion/source/datahub/datahub_kafka_reader.py

@@ -12,6 +12,7 @@ from confluent_kafka.schema_registry import SchemaRegistryClient
 from confluent_kafka.schema_registry.avro import AvroDeserializer

 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
@@ -92,7 +93,7 @@ class DataHubKafkaReader(Closeable):
             if mcl.created and mcl.created.time > stop_time.timestamp() * 1000:
                 logger.info(
                     f"Stopped reading from kafka, reached MCL "
-                    f"with audit stamp {datetime.fromtimestamp(mcl.created.time / 1000)}"
+                    f"with audit stamp {parse_ts_millis(mcl.created.time)}"
                 )
                 break

datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -167,9 +167,11 @@ class DataJobEntity:
 class DataProcessCleanupReport(SourceReport):
     num_aspects_removed: int = 0
     num_aspect_removed_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
-    sample_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
+    sample_soft_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    num_data_flows_found: int = 0
+    num_data_jobs_found: int = 0


 class DataProcessCleanup:
@@ -265,13 +267,17 @@ class DataProcessCleanup:
                 self.report.report_failure(
                     f"Exception while deleting DPI: {e}", exc=e
                 )
-            if deleted_count_last_n % self.config.batch_size == 0:
+            if (
+                deleted_count_last_n % self.config.batch_size == 0
+                and deleted_count_last_n > 0
+            ):
                 logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                 if self.config.delay:
                     logger.info(f"Sleeping for {self.config.delay} seconds")
                     time.sleep(self.config.delay)

-        logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
+        if deleted_count_last_n > 0:
+            logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")

     def delete_entity(self, urn: str, type: str) -> None:
         assert self.ctx.graph
@@ -280,9 +286,9 @@ class DataProcessCleanup:
         self.report.num_aspect_removed_by_type[type] = (
             self.report.num_aspect_removed_by_type.get(type, 0) + 1
         )
-        if type not in self.report.sample_removed_aspects_by_type:
-            self.report.sample_removed_aspects_by_type[type] = LossyList()
-        self.report.sample_removed_aspects_by_type[type].append(urn)
+        if type not in self.report.sample_soft_deleted_aspects_by_type:
+            self.report.sample_soft_deleted_aspects_by_type[type] = LossyList()
+        self.report.sample_soft_deleted_aspects_by_type[type].append(urn)

         if self.dry_run:
             logger.info(
@@ -351,7 +357,10 @@ class DataProcessCleanup:
             except Exception as e:
                 self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)

-            if deleted_count_retention % self.config.batch_size == 0:
+            if (
+                deleted_count_retention % self.config.batch_size == 0
+                and deleted_count_retention > 0
+            ):
                 logger.info(
                     f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
                 )
@@ -393,6 +402,7 @@ class DataProcessCleanup:
             scrollAcrossEntities = result.get("scrollAcrossEntities")
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")
+            self.report.num_data_flows_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")

             scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -415,8 +425,9 @@ class DataProcessCleanup:
         assert self.ctx.graph

         dataFlows: Dict[str, DataFlowEntity] = {}
-        for flow in self.get_data_flows():
-            dataFlows[flow.urn] = flow
+        if self.config.delete_empty_data_flows:
+            for flow in self.get_data_flows():
+                dataFlows[flow.urn] = flow

         scroll_id: Optional[str] = None
         previous_scroll_id: Optional[str] = None
@@ -443,6 +454,7 @@ class DataProcessCleanup:
             if not scrollAcrossEntities:
                 raise ValueError("Missing scrollAcrossEntities in response")

+            self.report.num_data_jobs_found += scrollAcrossEntities.get("count")
             logger.info(f"Got {scrollAcrossEntities.get('count')} DataJob entities")

             scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -481,7 +493,8 @@ class DataProcessCleanup:

             previous_scroll_id = scroll_id

-        logger.info(f"Deleted {deleted_jobs} DataJobs")
+        if deleted_jobs > 0:
+            logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0

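The two batch-size guards above exist because 0 % batch_size == 0 is true, so the old condition logged "Deleted 0 DPIs ..." before anything had actually been deleted; requiring a positive count suppresses that noise. A small illustration of the behaviour change, using a hypothetical batch_size of 10:

batch_size = 10  # hypothetical value, for illustration only

for deleted_count in (0, 10, 20, 25):
    old = deleted_count % batch_size == 0
    new = deleted_count % batch_size == 0 and deleted_count > 0
    print(f"deleted_count={deleted_count:>2}  old logs: {old}  new logs: {new}")

# Expected output:
# deleted_count= 0  old logs: True  new logs: False
# deleted_count=10  old logs: True  new logs: True
# deleted_count=20  old logs: True  new logs: True
# deleted_count=25  old logs: False  new logs: False
# The first line is the noisy case the new guard removes.
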
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -1,9 +1,10 @@
 import logging
 import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import List, Optional
+from threading import Lock
+from typing import Dict, Iterable, List, Optional

 from pydantic import Field

@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn

 logger = logging.getLogger(__name__)

+QUERY_QUERY_ENTITY = """
+query listQueries($input: ScrollAcrossEntitiesInput!) {
+  scrollAcrossEntities(input: $input) {
+    nextScrollId
+    count
+    searchResults {
+      entity {
+        ... on QueryEntity {
+          urn
+        }
+      }
+    }
+  }
+}
+"""
+

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
         default=True, description="Whether to do soft deletion cleanup."
     )
-    retention_days: Optional[int] = Field(
+    retention_days: int = Field(
         10,
         description="Number of days to retain metadata in DataHub",
     )
@@ -62,23 +79,30 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         default=None,
         description="Query to filter entities",
     )
+
     limit_entities_delete: Optional[int] = Field(
         25000, description="Max number of entities to delete."
     )

-    runtime_limit_seconds: Optional[int] = Field(
-        None,
+    futures_max_at_time: int = Field(
+        1000, description="Max number of futures to have at a time."
+    )
+
+    runtime_limit_seconds: int = Field(
+        7200,  # 2 hours by default
         description="Runtime limit in seconds",
     )


 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-    num_soft_deleted_entity_removed: int = 0
-    num_soft_deleted_entity_removed_by_type: TopKDict[str, int] = field(
-        default_factory=TopKDict
-    )
-    sample_soft_deleted_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
+    num_queries_found: int = 0
+    num_soft_deleted_entity_processed: int = 0
+    num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_entity_removal_started: int = 0
+    num_hard_deleted: int = 0
+    num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+    sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )

@@ -103,48 +127,53 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
+        self.start_time = 0.0
+        self._report_lock: Lock = Lock()
+        self.last_print_time = 0.0
+
+    def _increment_retained_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age += 1
+
+    def _increment_removal_started_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_entity_removal_started += 1
+
+    def _update_report(self, urn: str, entity_type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_hard_deleted += 1
+
+            current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+            self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+            if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+                self.report.sample_hard_deleted_aspects_by_type[
+                    entity_type
+                ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph

         entity_urn = Urn.from_string(urn)
-        self.report.num_soft_deleted_entity_removed += 1
-        self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
-            self.report.num_soft_deleted_entity_removed_by_type.get(
-                entity_urn.entity_type, 0
-            )
-            + 1
-        )
-        if (
-            entity_urn.entity_type
-            not in self.report.sample_soft_deleted_removed_aspects_by_type
-        ):
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ] = LossyList()
-        self.report.sample_soft_deleted_removed_aspects_by_type[
-            entity_urn.entity_type
-        ].append(urn)
-
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
-
+        self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
             urn=urn,
             dry_run=False,
         )
+        self._update_report(urn, entity_urn.entity_type)

     def delete_soft_deleted_entity(self, urn: str) -> None:
         assert self.ctx.graph

-        if self.config.retention_days is None:
-            logger.info("Retention days is not set, skipping soft delete cleanup")
-            return
-
         retention_time = (
             int(datetime.now(timezone.utc).timestamp())
             - self.config.retention_days * 24 * 60 * 60
@@ -157,15 +186,85 @@ class SoftDeletedEntitiesCleanup:
         ]["created"]["time"] < (retention_time * 1000):
             logger.debug(f"Hard deleting {urn}")
             self.delete_entity(urn)
+        else:
+            self._increment_retained_count()
+
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
+    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+        futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+        for future in done:
+            self._print_report()
+            if future.exception():
+                logger.error(
+                    f"Failed to delete entity {futures[future]}: {future.exception()}"
+                )
+                self.report.failure(
+                    f"Failed to delete entity {futures[future]}",
+                    exc=future.exception(),
+                )
+            self.report.num_soft_deleted_entity_processed += 1
+            if (
+                self.report.num_soft_deleted_entity_processed % self.config.batch_size
+                == 0
+            ):
+                if self.config.delay:
+                    logger.debug(
+                        f"Sleeping for {self.config.delay} seconds before further processing batch"
+                    )
+                    time.sleep(self.config.delay)
+        return futures

-    def cleanup_soft_deleted_entities(self) -> None:
-        if not self.config.enabled:
-            return
+    def _get_soft_deleted_queries(self) -> Iterable[str]:
         assert self.ctx.graph
-        start_time = time.time()
-
-        deleted_count_retention = 0
-        urns = self.ctx.graph.get_urns_by_filter(
+        scroll_id: Optional[str] = None
+        while True:
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    QUERY_QUERY_ENTITY,
+                    {
+                        "input": {
+                            "types": ["QUERY"],
+                            "query": "*",
+                            "scrollId": scroll_id if scroll_id else None,
+                            "count": self.config.batch_size,
+                            "orFilters": [
+                                {
+                                    "and": [
+                                        {
+                                            "field": "removed",
+                                            "values": ["true"],
+                                            "condition": "EQUAL",
+                                        }
+                                    ]
+                                }
+                            ],
+                        }
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get queries with {scroll_id}", exc=e
+                )
+                break
+            scroll_across_entities = result.get("scrollAcrossEntities")
+            if not scroll_across_entities:
+                break
+            scroll_id = scroll_across_entities.get("nextScrollId")
+            self.report.num_queries_found += scroll_across_entities.get("count")
+            for query in scroll_across_entities.get("searchResults"):
+                yield query["entity"]["urn"]
+
+    def _get_urns(self) -> Iterable[str]:
+        assert self.ctx.graph
+        yield from self.ctx.graph.get_urns_by_filter(
             entity_types=self.config.entity_types,
             platform=self.config.platform,
             env=self.config.env,
@@ -173,52 +272,41 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
+        yield from self._get_soft_deleted_queries()
+
+    def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
+        self.start_time = time.time()

-        futures = {}
+        futures: Dict[Future, str] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
-            num_urns_submitted = 0
-            for urn in urns:
-                num_urns_submitted += 1
+            for urn in self._get_urns():
+                self._print_report()
+                while len(futures) >= self.config.futures_max_at_time:
+                    futures = self._process_futures(futures)
                 if (
                     self.config.limit_entities_delete
-                    and num_urns_submitted > self.config.limit_entities_delete
+                    and self.report.num_hard_deleted > self.config.limit_entities_delete
                 ):
                     logger.info(
-                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopping"
+                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more."
                     )
                     break
                 if (
                     self.config.runtime_limit_seconds
-                    and time.time() - start_time > self.config.runtime_limit_seconds
+                    and time.time() - self.start_time
+                    > self.config.runtime_limit_seconds
                 ):
                     logger.info(
-                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Stopping"
+                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures."
                     )
                     break

                 future = executor.submit(self.delete_soft_deleted_entity, urn)
                 futures[future] = urn

-            if not futures:
-                return
-            for future in as_completed(futures):
-                if future.exception():
-                    logger.error(
-                        f"Failed to delete entity {futures[future]}: {future.exception()}"
-                    )
-                    self.report.failure(
-                        f"Failed to delete entity {futures[future]}",
-                        exc=future.exception(),
-                    )
-                deleted_count_retention += 1
-
-                if deleted_count_retention % self.config.batch_size == 0:
-                    logger.info(
-                        f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
-                    )
-
-                if self.config.delay:
-                    logger.debug(
-                        f"Sleeping for {self.config.delay} seconds before getting next batch"
-                    )
-                    time.sleep(self.config.delay)
+            logger.info(f"Waiting for {len(futures)} futures to complete")
+            while len(futures) > 0:
+                self._print_report()
+                futures = self._process_futures(futures)

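The rewrite above replaces "submit every urn, then drain with as_completed" with a bounded pattern: at most futures_max_at_time deletions are in flight, and wait(..., return_when=FIRST_COMPLETED) retires finished futures while more work is submitted. A self-contained sketch of that pattern, with a hypothetical work() function and small limits standing in for the deletion calls and the real config values:

import time
from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
from typing import Dict

MAX_IN_FLIGHT = 4  # stand-in for config.futures_max_at_time


def work(item: int) -> int:
    """Hypothetical stand-in for the per-urn deletion call."""
    time.sleep(0.01)
    return item


def process_futures(futures: Dict[Future, int]) -> Dict[Future, int]:
    # Block until at least one future finishes, then drop the finished ones.
    done, not_done = wait(futures, return_when=FIRST_COMPLETED)
    remaining = {f: item for f, item in futures.items() if f in not_done}
    for f in done:
        if f.exception():
            print(f"item {futures[f]} failed: {f.exception()}")
        else:
            print(f"item {futures[f]} done -> {f.result()}")
    return remaining


futures: Dict[Future, int] = {}
with ThreadPoolExecutor(max_workers=2) as executor:
    for item in range(10):
        # Keep the number of outstanding futures bounded.
        while len(futures) >= MAX_IN_FLIGHT:
            futures = process_futures(futures)
        futures[executor.submit(work, item)] = item

    # Drain whatever is still in flight.
    while futures:
        futures = process_futures(futures)
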
datahub/ingestion/source/s3/source.py

@@ -225,7 +225,7 @@ class S3Source(StatefulIngestionSourceBase):
         self.init_spark()

     def init_spark(self):
-        os.environ.setdefault("SPARK_VERSION", "3.3")
+        os.environ.setdefault("SPARK_VERSION", "3.5")
         spark_version = os.environ["SPARK_VERSION"]

         # Importing here to avoid Deequ dependency for non profiling use cases

datahub/ingestion/source/sql/hive.py

@@ -838,3 +838,18 @@ class HiveSource(TwoTierSQLAlchemySource):
                 entityUrn=dataset_urn,
                 aspect=view_properties_aspect,
             ).as_workunit()
+
+            if view_definition and self.config.include_view_lineage:
+                default_db = None
+                default_schema = None
+                try:
+                    default_db, default_schema = self.get_db_schema(dataset_name)
+                except ValueError:
+                    logger.warning(f"Invalid view identifier: {dataset_name}")
+
+                self.aggregator.add_view_definition(
+                    view_urn=dataset_urn,
+                    view_definition=view_definition,
+                    default_db=default_db,
+                    default_schema=default_schema,
+                )

datahub/ingestion/source/sql/hive_metastore.py

@@ -123,6 +123,10 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )

+    include_view_lineage: bool = Field(
+        default=False, description="", hidden_from_docs=True
+    )
+
     include_catalog_name_in_ids: bool = Field(
         default=False,
         description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
@@ -160,6 +164,9 @@ class HiveMetastore(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
+@capability(
+    SourceCapability.LINEAGE_COARSE, "View lineage is not supported", supported=False
+)
 class HiveMetastoreSource(SQLAlchemySource):
     """
     This plugin extracts the following:

datahub/ingestion/source/sql/mssql/source.py

@@ -724,7 +724,7 @@ class SQLServerSource(SQLAlchemySource):
         ):
             yield from auto_workunit(
                 generate_procedure_lineage(
-                    schema_resolver=self.schema_resolver,
+                    schema_resolver=self.get_schema_resolver(),
                     procedure=procedure,
                     procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
                     is_temp_table=self.is_temp_table,