acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -1,9 +1,10 @@
 import logging
 import time
-from concurrent.futures import ThreadPoolExecutor,
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from
+from threading import Lock
+from typing import Dict, Iterable, List, Optional

 from pydantic import Field

@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn

 logger = logging.getLogger(__name__)

+QUERY_QUERY_ENTITY = """
+query listQueries($input: ScrollAcrossEntitiesInput!) {
+  scrollAcrossEntities(input: $input) {
+    nextScrollId
+    count
+    searchResults {
+      entity {
+        ... on QueryEntity {
+          urn
+        }
+      }
+    }
+  }
+}
+"""
+

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
         default=True, description="Whether to do soft deletion cleanup."
     )
-    retention_days:
+    retention_days: int = Field(
         10,
         description="Number of days to retain metadata in DataHub",
     )
@@ -62,25 +79,34 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         default=None,
         description="Query to filter entities",
     )
+
     limit_entities_delete: Optional[int] = Field(
         25000, description="Max number of entities to delete."
     )

-
-
+    futures_max_at_time: int = Field(
+        1000, description="Max number of futures to have at a time."
+    )
+
+    runtime_limit_seconds: int = Field(
+        7200,  # 2 hours by default
         description="Runtime limit in seconds",
     )


 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-
-
-
-
-
+    num_queries_found: int = 0
+    num_soft_deleted_entity_processed: int = 0
+    num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_entity_removal_started: int = 0
+    num_hard_deleted: int = 0
+    num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+    sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    runtime_limit_reached: bool = False
+    deletion_limit_reached: bool = False


 class SoftDeletedEntitiesCleanup:
@@ -103,48 +129,55 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
+        self.start_time = 0.0
+        self._report_lock: Lock = Lock()
+        self.last_print_time = 0.0
+
+    def _increment_retained_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age += 1
+
+    def _increment_removal_started_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_entity_removal_started += 1
+
+    def _update_report(self, urn: str, entity_type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_hard_deleted += 1
+
+            current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+            self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+            if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+                self.report.sample_hard_deleted_aspects_by_type[
+                    entity_type
+                ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph

         entity_urn = Urn.from_string(urn)
-        self.report.num_soft_deleted_entity_removed += 1
-        self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
-            self.report.num_soft_deleted_entity_removed_by_type.get(
-                entity_urn.entity_type, 0
-            )
-            + 1
-        )
-        if (
-            entity_urn.entity_type
-            not in self.report.sample_soft_deleted_removed_aspects_by_type
-        ):
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ] = LossyList()
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ].append(urn)
-
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
-
+        if self._deletion_limit_reached() or self._times_up():
+            return
+        self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
             urn=urn,
             dry_run=False,
         )
+        self._update_report(urn, entity_urn.entity_type)

     def delete_soft_deleted_entity(self, urn: str) -> None:
         assert self.ctx.graph

-        if self.config.retention_days is None:
-            logger.info("Retention days is not set, skipping soft delete cleanup")
-            return
-
         retention_time = (
             int(datetime.now(timezone.utc).timestamp())
             - self.config.retention_days * 24 * 60 * 60
@@ -157,15 +190,84 @@ class SoftDeletedEntitiesCleanup:
         ]["created"]["time"] < (retention_time * 1000):
             logger.debug(f"Hard deleting {urn}")
             self.delete_entity(urn)
+        else:
+            self._increment_retained_count()
+
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
+    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+        futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+        for future in done:
+            self._print_report()
+            if future.exception():
+                self.report.failure(
+                    title="Failed to delete entity",
+                    message="Failed to delete entity",
+                    context=futures[future],
+                    exc=future.exception(),
+                )
+            self.report.num_soft_deleted_entity_processed += 1
+            if (
+                self.report.num_soft_deleted_entity_processed % self.config.batch_size
+                == 0
+            ):
+                if self.config.delay:
+                    logger.debug(
+                        f"Sleeping for {self.config.delay} seconds before further processing batch"
+                    )
+                    time.sleep(self.config.delay)
+        return futures

-    def
-        if not self.config.enabled:
-            return
+    def _get_soft_deleted_queries(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
+        scroll_id: Optional[str] = None
+        while True:
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    QUERY_QUERY_ENTITY,
+                    {
+                        "input": {
+                            "types": ["QUERY"],
+                            "query": "*",
+                            "scrollId": scroll_id if scroll_id else None,
+                            "count": self.config.batch_size,
+                            "orFilters": [
+                                {
+                                    "and": [
+                                        {
+                                            "field": "removed",
+                                            "values": ["true"],
+                                            "condition": "EQUAL",
+                                        }
+                                    ]
+                                }
+                            ],
+                        }
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get queries with {scroll_id}", exc=e
+                )
+                break
+            scroll_across_entities = result.get("scrollAcrossEntities")
+            if not scroll_across_entities or not scroll_across_entities.get("count"):
+                break
+            scroll_id = scroll_across_entities.get("nextScrollId")
+            self.report.num_queries_found += scroll_across_entities.get("count")
+            for query in scroll_across_entities.get("searchResults"):
+                yield query["entity"]["urn"]
+
+    def _get_urns(self) -> Iterable[str]:
+        assert self.ctx.graph
+        yield from self.ctx.graph.get_urns_by_filter(
             entity_types=self.config.entity_types,
             platform=self.config.platform,
             env=self.config.env,
@@ -173,52 +275,45 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
+        yield from self._get_soft_deleted_queries()
+
+    def _times_up(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and time.time() - self.start_time > self.config.runtime_limit_seconds
+        ):
+            with self._report_lock:
+                self.report.runtime_limit_reached = True
+            return True
+        return False
+
+    def _deletion_limit_reached(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.num_hard_deleted > self.config.limit_entities_delete
+        ):
+            with self._report_lock:
+                self.report.deletion_limit_reached = True
+            return True
+        return False
+
+    def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
+        self.start_time = time.time()

-        futures =
+        futures: Dict[Future, str] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
-
-
-
-
-
-                    and num_urns_submitted > self.config.limit_entities_delete
-                ):
-                    logger.info(
-                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopping"
-                    )
+            for urn in self._get_urns():
+                self._print_report()
+                while len(futures) >= self.config.futures_max_at_time:
+                    futures = self._process_futures(futures)
+                if self._deletion_limit_reached() or self._times_up():
                     break
-                if (
-                    self.config.runtime_limit_seconds
-                    and time.time() - start_time > self.config.runtime_limit_seconds
-                ):
-                    logger.info(
-                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Stopping"
-                    )
-                    break
-
                 future = executor.submit(self.delete_soft_deleted_entity, urn)
                 futures[future] = urn

-
-
-
-
-                logger.error(
-                    f"Failed to delete entity {futures[future]}: {future.exception()}"
-                )
-                self.report.failure(
-                    f"Failed to delete entity {futures[future]}",
-                    exc=future.exception(),
-                )
-                deleted_count_retention += 1
-
-                if deleted_count_retention % self.config.batch_size == 0:
-                    logger.info(
-                        f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
-                    )
-
-                if self.config.delay:
-                    logger.debug(
-                        f"Sleeping for {self.config.delay} seconds before getting next batch"
-                    )
-                    time.sleep(self.config.delay)
+            logger.info(f"Waiting for {len(futures)} futures to complete")
+            while len(futures) > 0:
+                self._print_report()
+                futures = self._process_futures(futures)
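The cleanup config above gains new knobs (futures_max_at_time, runtime_limit_seconds), and retention_days is now a plain int defaulting to 10, so the old "retention days not set" skip path is gone. A minimal sketch of constructing it, assuming the remaining fields (batch_size, delay, max_workers, etc.) keep their defaults:

from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
    SoftDeletedEntitiesCleanupConfig,
)

# Values mirror the defaults shown in the diff; adjust per deployment.
config = SoftDeletedEntitiesCleanupConfig(
    retention_days=10,            # hard-delete soft-deleted entities older than this
    limit_entities_delete=25000,  # overall cap on hard deletions per run
    futures_max_at_time=1000,     # cap on in-flight deletion futures (new)
    runtime_limit_seconds=7200,   # stop submitting work after 2 hours (new)
)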
datahub/ingestion/source/iceberg/iceberg.py

@@ -10,6 +10,7 @@ from pyiceberg.exceptions import (
     NoSuchNamespaceError,
     NoSuchPropertyException,
     NoSuchTableError,
+    ServerError,
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
@@ -145,6 +146,13 @@ class IcebergSource(StatefulIngestionSourceBase):
         self.report.report_no_listed_namespaces(len(namespaces))
         tables_count = 0
         for namespace in namespaces:
+            namespace_repr = ".".join(namespace)
+            if not self.config.namespace_pattern.allowed(namespace_repr):
+                LOGGER.info(
+                    f"Namespace {namespace_repr} is not allowed by config pattern, skipping"
+                )
+                self.report.report_dropped(f"{namespace_repr}.*")
+                continue
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
@@ -181,6 +189,9 @@ class IcebergSource(StatefulIngestionSourceBase):
         if not self.config.table_pattern.allowed(dataset_name):
             # Dataset name is rejected by pattern, report as dropped.
             self.report.report_dropped(dataset_name)
+            LOGGER.debug(
+                f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+            )
             return
         try:
             if not hasattr(thread_local, "local_catalog"):
@@ -219,6 +230,22 @@ class IcebergSource(StatefulIngestionSourceBase):
             LOGGER.warning(
                 f"NoSuchTableError while processing table {dataset_path}, skipping it.",
             )
+        except FileNotFoundError as e:
+            self.report.report_warning(
+                "file-not-found",
+                f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
+            )
+            LOGGER.warning(
+                f"FileNotFoundError while processing table {dataset_path}, skipping it."
+            )
+        except ServerError as e:
+            self.report.report_warning(
+                "iceberg-rest-server-error",
+                f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
+            )
+            LOGGER.warning(
+                f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+            )
         except Exception as e:
             self.report.report_failure("general", f"Failed to create workunit: {e}")
             LOGGER.exception(
@@ -269,7 +296,6 @@ class IcebergSource(StatefulIngestionSourceBase):
             ] = table.current_snapshot().manifest_list
             dataset_properties = DatasetPropertiesClass(
                 name=table.name()[-1],
-                tags=[],
                 description=table.metadata.properties.get("comment", None),
                 customProperties=custom_properties,
             )
datahub/ingestion/source/iceberg/iceberg_common.py

@@ -68,6 +68,10 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for tables to filter in ingestion.",
     )
+    namespace_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for namespaces to filter in ingestion.",
+    )
     user_ownership_property: Optional[str] = Field(
         default="owner",
         description="Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. If property has no value, no owner information will be emitted.",
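The new namespace_pattern field on IcebergSourceConfig is a standard AllowDenyPattern, and iceberg.py (above) joins each namespace tuple with "." before checking it. A small sketch of that decision, with made-up namespace names and patterns:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical patterns: allow everything under "prod", drop scratch namespaces.
namespace_pattern = AllowDenyPattern(allow=["prod\\..*"], deny=[".*_scratch"])

for namespace in [("prod", "sales"), ("dev", "sandbox"), ("prod", "ml_scratch")]:
    namespace_repr = ".".join(namespace)
    if not namespace_pattern.allowed(namespace_repr):
        print(f"dropped {namespace_repr}.*")  # the source reports this via report_dropped
    else:
        print(f"ingesting {namespace_repr}")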
datahub/ingestion/source/kafka_connect/__init__.py

(File without changes)
datahub/ingestion/source/kafka_connect/common.py

@@ -0,0 +1,202 @@
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, Iterable, List, Optional
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+    PlatformInstanceConfigMixin,
+)
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)
+
+logger = logging.getLogger(__name__)
+
+KAFKA = "kafka"
+SOURCE = "source"
+SINK = "sink"
+CONNECTOR_CLASS = "connector.class"
+
+
+class ProvidedConfig(ConfigModel):
+    provider: str
+    path_key: str
+    value: str
+
+
+class GenericConnectorConfig(ConfigModel):
+    connector_name: str
+    source_dataset: str
+    source_platform: str
+
+
+class KafkaConnectSourceConfig(
+    PlatformInstanceConfigMixin,
+    DatasetLineageProviderConfigBase,
+    StatefulIngestionConfigBase,
+):
+    # See the Connect REST Interface for details
+    # https://docs.confluent.io/platform/current/connect/references/restapi.html#
+    connect_uri: str = Field(
+        default="http://localhost:8083/", description="URI to connect to."
+    )
+    username: Optional[str] = Field(default=None, description="Kafka Connect username.")
+    password: Optional[str] = Field(default=None, description="Kafka Connect password.")
+    cluster_name: Optional[str] = Field(
+        default="connect-cluster", description="Cluster to ingest from."
+    )
+    # convert lineage dataset's urns to lowercase
+    convert_lineage_urns_to_lowercase: bool = Field(
+        default=False,
+        description="Whether to convert the urns of ingested lineage dataset to lowercase",
+    )
+    connector_patterns: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="regex patterns for connectors to filter for ingestion.",
+    )
+    provided_configs: Optional[List[ProvidedConfig]] = Field(
+        default=None, description="Provided Configurations"
+    )
+    connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field(
+        default=None,
+        description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`',
+    )
+    platform_instance_map: Optional[Dict[str, str]] = Field(
+        default=None,
+        description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`',
+    )
+    generic_connectors: List[GenericConnectorConfig] = Field(
+        default=[],
+        description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector",
+    )
+
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
+
+@dataclass
+class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
+    connectors_scanned: int = 0
+    filtered: List[str] = field(default_factory=list)
+
+    def report_connector_scanned(self, connector: str) -> None:
+        self.connectors_scanned += 1
+
+    def report_dropped(self, connector: str) -> None:
+        self.filtered.append(connector)
+
+
+@dataclass
+class KafkaConnectLineage:
+    """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob"""
+
+    source_platform: str
+    target_dataset: str
+    target_platform: str
+    job_property_bag: Optional[Dict[str, str]] = None
+    source_dataset: Optional[str] = None
+
+
+@dataclass
+class ConnectorManifest:
+    """Each instance is potential DataFlow"""
+
+    name: str
+    type: str
+    config: Dict
+    tasks: Dict
+    url: Optional[str] = None
+    flow_property_bag: Optional[Dict[str, str]] = None
+    lineages: List[KafkaConnectLineage] = field(default_factory=list)
+    topic_names: Iterable[str] = field(default_factory=list)
+
+
+def remove_prefix(text: str, prefix: str) -> str:
+    if text.startswith(prefix):
+        index = len(prefix)
+        return text[index:]
+    return text
+
+
+def unquote(
+    string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None
+) -> str:
+    """
+    If string starts and ends with a quote, unquote it
+    """
+    trailing_quote = trailing_quote if trailing_quote else leading_quote
+    if string.startswith(leading_quote) and string.endswith(trailing_quote):
+        string = string[1:-1]
+    return string
+
+
+def get_dataset_name(
+    database_name: Optional[str],
+    source_table: str,
+) -> str:
+    if database_name:
+        dataset_name = database_name + "." + source_table
+    else:
+        dataset_name = source_table
+
+    return dataset_name
+
+
+def get_platform_instance(
+    config: KafkaConnectSourceConfig, connector_name: str, platform: str
+) -> Optional[str]:
+    instance_name = None
+    if (
+        config.connect_to_platform_map
+        and config.connect_to_platform_map.get(connector_name)
+        and config.connect_to_platform_map[connector_name].get(platform)
+    ):
+        instance_name = config.connect_to_platform_map[connector_name][platform]
+        if config.platform_instance_map and config.platform_instance_map.get(platform):
+            logger.warning(
+                f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map."
+                "Will prefer connector specific platform instance from connect_to_platform_map."
+            )
+    elif config.platform_instance_map and config.platform_instance_map.get(platform):
+        instance_name = config.platform_instance_map[platform]
+    logger.info(
+        f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}"
+    )
+    return instance_name
+
+
+def transform_connector_config(
+    connector_config: Dict, provided_configs: List[ProvidedConfig]
+) -> None:
+    """This method will update provided configs in connector config values, if any"""
+    lookupsByProvider = {}
+    for pconfig in provided_configs:
+        lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value
+    for k, v in connector_config.items():
+        for key, value in lookupsByProvider.items():
+            if key in v:
+                connector_config[k] = connector_config[k].replace(key, value)
+
+
+# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy.
+def has_three_level_hierarchy(platform: str) -> bool:
+    return platform in ["postgres", "trino", "redshift", "snowflake"]
+
+
+@dataclass
+class BaseConnector:
+    connector_manifest: ConnectorManifest
+    config: KafkaConnectSourceConfig
+    report: KafkaConnectSourceReport
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        return []
+
+    def extract_flow_property_bag(self) -> Optional[Dict[str, str]]:
+        return None
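The helpers in common.py are shared by the new kafka_connect, sink_connectors, and source_connectors modules. As a quick illustration of transform_connector_config (defined above), which expands ${provider:key} references in connector config values using the provided_configs entries; the config key and values below are made up:

from datahub.ingestion.source.kafka_connect.common import (
    ProvidedConfig,
    transform_connector_config,
)

# Hypothetical connector config using an externalized secret reference.
connector_config = {"connection.url": "${env:DB_URL}/finance"}

transform_connector_config(
    connector_config,
    [ProvidedConfig(provider="env", path_key="DB_URL", value="jdbc:postgresql://db:5432")],
)

print(connector_config["connection.url"])
# -> jdbc:postgresql://db:5432/finance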