acryl-datahub-cloud 0.3.12rc15__py3-none-any.whl → 0.3.12.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acryl_datahub_cloud/_codegen_config.json +1 -1
- acryl_datahub_cloud/lineage_features/source.py +221 -28
- acryl_datahub_cloud/metadata/_urns/urn_defs.py +1913 -1913
- acryl_datahub_cloud/metadata/schema.avsc +24446 -23971
- acryl_datahub_cloud/metadata/schema_classes.py +640 -634
- acryl_datahub_cloud/metadata/schemas/ContainerProperties.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataFlowInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataJobInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DataProcessKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/MLModelDeploymentKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +4 -0
- acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +4 -0
- {acryl_datahub_cloud-0.3.12rc15.dist-info → acryl_datahub_cloud-0.3.12.1rc1.dist-info}/METADATA +47 -46
- {acryl_datahub_cloud-0.3.12rc15.dist-info → acryl_datahub_cloud-0.3.12.1rc1.dist-info}/RECORD +20 -20
- {acryl_datahub_cloud-0.3.12rc15.dist-info → acryl_datahub_cloud-0.3.12.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub_cloud-0.3.12rc15.dist-info → acryl_datahub_cloud-0.3.12.1rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub_cloud-0.3.12rc15.dist-info → acryl_datahub_cloud-0.3.12.1rc1.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,26 @@
 import logging
 import os
+import time
 from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Set
+from typing import Any, Callable, Dict, Iterable, List, Set
 
 from opensearchpy import OpenSearch
+from opensearchpy.exceptions import (
+    ConnectionError as OpenSearchConnectionError,
+    ConnectionTimeout,
+    RequestError,
+    TransportError,
+)
+from pydantic import validator
+from tenacity import (
+    before_sleep_log,
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
 
 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.elasticsearch.graph_service import ElasticGraphRow
@@ -20,6 +35,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.schema_classes import AuditStampClass, LineageFeaturesClass
 
 logger = logging.getLogger(__name__)
@@ -28,13 +44,38 @@ SYSTEM_ACTOR = "urn:li:corpuser:__datahub_system"
 
 
 class LineageFeaturesSourceConfig(ConfigModel):
+    enabled: bool = True
     search_index: ElasticSearchClientConfig = ElasticSearchClientConfig()
     query_timeout: int = 30
-    extract_batch_size: int =
+    extract_batch_size: int = 3000
+    max_retries: int = 3
+    retry_delay_seconds: int = 5
+    retry_backoff_multiplier: float = 2.0
+
+    @validator("max_retries")
+    def validate_max_retries(cls, v: int) -> int:
+        if v < 1:
+            raise ValueError("max_retries must be at least 1")
+        return v
+
+    @validator("retry_delay_seconds")
+    def validate_retry_delay_seconds(cls, v: int) -> int:
+        if v < 1:
+            raise ValueError("retry_delay_seconds must be at least 1")
+        return v
+
+    @validator("retry_backoff_multiplier")
+    def validate_retry_backoff_multiplier(cls, v: float) -> float:
+        if v < 1.0:
+            raise ValueError("retry_backoff_multiplier must be at least 1.0")
+        return v
 
 
 @dataclass
-class LineageExtractGraphSourceReport(SourceReport):
+class LineageExtractGraphSourceReport(SourceReport, IngestionStageReport):
+    valid_urns_count: int = 0
+    upstream_count: int = 0
+    downstream_count: int = 0
     edges_scanned: int = 0
 
 
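The new retry settings are guarded by pydantic validators, so a misconfigured recipe fails at config-parse time rather than partway through extraction. Below is a minimal standalone sketch of the same pattern, using a plain pydantic BaseModel in place of DataHub's ConfigModel; the field names and bounds mirror the diff, everything else is illustrative.

    from pydantic import BaseModel, validator  # pydantic v1-style validators, as imported in the diff

    class RetrySettings(BaseModel):
        max_retries: int = 3
        retry_delay_seconds: int = 5
        retry_backoff_multiplier: float = 2.0

        @validator("max_retries", "retry_delay_seconds")
        def _at_least_one(cls, v: int) -> int:
            # Reject zero or negative retry counts/delays up front.
            if v < 1:
                raise ValueError("must be at least 1")
            return v

        @validator("retry_backoff_multiplier")
        def _at_least_one_point_zero(cls, v: float) -> float:
            if v < 1.0:
                raise ValueError("must be at least 1.0")
            return v

    # Valid values parse normally; invalid ones raise a ValidationError before any work starts.
    print(RetrySettings(max_retries=5).max_retries)  # 5
    try:
        RetrySettings(max_retries=0)
    except Exception as e:
        print(f"rejected: {e}")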
@@ -42,9 +83,15 @@ class LineageExtractGraphSourceReport(SourceReport):
 @config_class(LineageFeaturesSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 class DataHubLineageFeaturesSource(Source):
+    """
+    DataHub Lineage Features Source that extracts lineage information from Elasticsearch/OpenSearch.
+    """
+
     platform = "datahub"
 
-    def __init__(
+    def __init__(
+        self, config: LineageFeaturesSourceConfig, ctx: PipelineContext
+    ) -> None:
         super().__init__(ctx)
         self.config: LineageFeaturesSourceConfig = config
         self.report = LineageExtractGraphSourceReport()
@@ -53,9 +100,140 @@ class DataHubLineageFeaturesSource(Source):
         self.valid_urns: Set[str] = set()
         self.upstream_counts: Dict[str, int] = defaultdict(int)
         self.downstream_counts: Dict[str, int] = defaultdict(int)
+        self.last_print_time = time.time()
+
+    def _get_retry_decorator(
+        self,
+    ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+        """Create a retry decorator based on config parameters"""
+
+        def should_retry_exception(exception: Exception) -> bool:
+            """Custom retry predicate for OpenSearch exceptions"""
+            if isinstance(
+                exception,
+                (
+                    OpenSearchConnectionError,
+                    ConnectionTimeout,
+                    RequestError,
+                    TransportError,
+                ),
+            ):
+                return True
+            # Also retry on general connection and timeout errors
+            if isinstance(exception, (ConnectionError, TimeoutError)):
+                return True
+            return False
+
+        return retry(
+            retry=retry_if_exception_type(
+                (
+                    OpenSearchConnectionError,
+                    ConnectionTimeout,
+                    RequestError,
+                    TransportError,
+                    ConnectionError,
+                    TimeoutError,
+                )
+            ),
+            stop=stop_after_attempt(self.config.max_retries),
+            wait=wait_exponential(
+                multiplier=self.config.retry_backoff_multiplier,
+                min=self.config.retry_delay_seconds,
+                max=30,
+            ),
+            before_sleep=before_sleep_log(logger, logging.WARNING),
+            reraise=True,
+        )
+
+    def _create_pit_with_retry(self, server: OpenSearch, index: str) -> str:
+        """Create a Point-in-Time (PIT) with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _create_pit() -> str:
+            logger.debug(f"Creating PIT for index: {index}")
+            response = server.create_pit(index, keep_alive="10m")
+            pit = response.get("pit_id")
+            if not pit:
+                raise Exception("Failed to create PIT - no pit_id returned")
+            logger.debug(f"Successfully created PIT: {pit}")
+            return pit
+
+        return _create_pit()
+
+    def _search_with_retry(
+        self, server: OpenSearch, query: dict, batch_size: int
+    ) -> dict:
+        """Execute search with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _search() -> dict:
+            logger.debug(f"Executing search with batch size: {batch_size}")
+            return server.search(
+                body=query,
+                size=batch_size,
+                params={"timeout": self.config.query_timeout},
+            )
+
+        return _search()
+
+    def _delete_pit_with_retry(self, server: OpenSearch, pit: str) -> None:
+        """Delete Point-in-Time (PIT) with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _delete_pit() -> None:
+            logger.debug(f"Deleting PIT: {pit}")
+            server.delete_pit(body={"pit_id": pit})
+            logger.debug(f"Successfully deleted PIT: {pit}")
+
+        _delete_pit()
+
+    def _create_opensearch_client_with_retry(self) -> OpenSearch:
+        """Create OpenSearch client with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _create_client() -> OpenSearch:
+            logger.debug(
+                f"Creating OpenSearch client for endpoint: {self.config.search_index.endpoint}"
+            )
+            return OpenSearch(
+                [self.config.search_index.endpoint],
+                http_auth=(
+                    self.config.search_index.username,
+                    self.config.search_index.password,
+                ),
+                use_ssl=self.config.search_index.use_ssl,
+            )
+
+        return _create_client()
+
+    def _update_report(self) -> None:
+        """
+        Information to see whether we are close to hitting the memory limits
+        """
+        self.report.valid_urns_count = len(self.valid_urns)
+        self.report.upstream_count = len(self.upstream_counts.keys())
+        self.report.downstream_count = len(self.downstream_counts.keys())
+
+    def _print_report(self) -> None:
+        """
+        Printing is required like this because the report is only printed
+        when the workunits are yielded
+        In case of background processes we won't know the progress if this is not done
+        """
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self._update_report()
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
 
     def process_batch(self, results: Iterable[dict]) -> None:
         for doc in results:
+            self._print_report()
             row = ElasticGraphRow.from_elastic_doc(doc["_source"])
             self.report.edges_scanned += 1
             if (
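All of the helper methods above funnel through one tenacity-built decorator: transient OpenSearch failures (connection drops, timeouts, request/transport errors) are retried with exponential backoff, a warning is logged before each sleep, and the final failure is re-raised. A condensed, self-contained sketch of that pattern follows; flaky_search is a hypothetical stand-in, and the backoff knobs mirror the diff's defaults, so a real run sleeps several seconds between the simulated failures.

    import logging
    from tenacity import (
        before_sleep_log,
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    logger = logging.getLogger(__name__)

    # Hypothetical flaky call used only for illustration: fails twice, then succeeds.
    def flaky_search(attempt_counter: list) -> str:
        attempt_counter.append(1)
        if len(attempt_counter) < 3:
            raise TimeoutError("simulated transient failure")
        return "ok"

    # Same shape as _get_retry_decorator(): retry only transient error types,
    # stop after N attempts, back off exponentially between 5s and 30s,
    # log a warning before each sleep, and re-raise the last error if all attempts fail.
    retrying = retry(
        retry=retry_if_exception_type((ConnectionError, TimeoutError)),
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=2.0, min=5, max=30),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        reraise=True,
    )

    attempts: list = []
    print(retrying(flaky_search)(attempts))  # succeeds on the third attempt
    print(f"attempts made: {len(attempts)}")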
@@ -65,20 +243,18 @@ class DataHubLineageFeaturesSource(Source):
                 self.upstream_counts[row.source_urn] += 1
                 self.downstream_counts[row.destination_urn] += 1
 
-    def
+    def populate_valid_urns(self) -> None:
         graph = self.ctx.require_graph("Load non soft-deleted urns")
         for urn in graph.get_urns_by_filter(batch_size=self.config.extract_batch_size):
+            self._print_report()
             self.valid_urns.add(urn)
 
+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        with self.report.new_stage("Load valid URNs"):
+            self.populate_valid_urns()
+
         timestamp = datetime.now(tz=timezone.utc)
-        server = OpenSearch(
-            [self.config.search_index.endpoint],
-            http_auth=(
-                self.config.search_index.username,
-                self.config.search_index.password,
-            ),
-            use_ssl=self.config.search_index.use_ssl,
-        )
+        server = self._create_opensearch_client_with_retry()
 
         query = {
             "query": {
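get_workunits() is now split into explicit stages via DataHub's IngestionStageReport mixin, so the long-running URN-loading and extraction phases show up, with timings, in the ingestion report. A rough sketch of that reporting pattern is below, assuming the import path and the context-manager behaviour of new_stage() exactly as the diff uses them; the report class and counter are hypothetical.

    from dataclasses import dataclass

    from datahub.ingestion.api.source import SourceReport
    from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport


    @dataclass
    class MyLongRunningSourceReport(SourceReport, IngestionStageReport):
        items_seen: int = 0  # hypothetical counter for illustration


    report = MyLongRunningSourceReport()

    # Each stage is timed and recorded on the report, so progress stays visible
    # even while the source is still churning through a long phase.
    with report.new_stage("Load valid URNs"):
        report.items_seen += 1  # stand-in for the real per-URN work

    with report.new_stage("Extract lineage features"):
        pass  # stand-in for the search/pagination loop

    print(report.as_string())

The diff also calls new_stage() once more without a with block ("Extract lineage features End"), which closes out the previous stage's timing before the final counts are written into the report.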
@@ -113,27 +289,42 @@ class DataHubLineageFeaturesSource(Source):
         }
 
         index = f"{self.config.search_index.index_prefix}graph_service_v1"
-
+        pit = self._create_pit_with_retry(server, index)
 
         # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-        pit = response.get("pit_id")
         query.update({"pit": {"id": pit, "keep_alive": "10m"}})
 
         # TODO: Using slicing we can parallelize the ES calls below:
         # https://opensearch.org/docs/latest/search-plugins/searching-data/point-in-time/#search-slicing
         batch_size = self.config.extract_batch_size
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage("Extract lineage features"):
+            try:
+                while True:
+                    results = self._search_with_retry(server, query, batch_size)
+                    self.process_batch(results["hits"]["hits"])
+                    if len(results["hits"]["hits"]) < batch_size:
+                        break
+                    query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
+            except Exception as e:
+                logger.error(f"Error during lineage extraction: {e}")
+                self.report.report_failure(
+                    title="Lineage extraction failed",
+                    message="Failed to extract lineage features from Elasticsearch",
+                    context=f"Error: {str(e)}",
+                    exc=e,
+                )
+                # Ensure PIT is cleaned up even on error
+                try:
+                    self._delete_pit_with_retry(server, pit)
+                except Exception as cleanup_error:
+                    logger.warning(
+                        f"Failed to cleanup PIT after error: {cleanup_error}"
+                    )
+                raise
+        # So previous stage's calculations are done
+        self.report.new_stage("Extract lineage features End")
+        self._update_report()
+        self._delete_pit_with_retry(server, pit)
 
         # In Python 3.9, can be replaced by `self.self.upstream_counts.keys() | self.downstream_counts.keys()`
         for urn in set(self.upstream_counts.keys()).union(
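The extraction loop pages through the graph index with an OpenSearch point-in-time (PIT) plus search_after, so a consistent snapshot is scanned in fixed-size batches and the PIT is released when the scan ends or fails. A stripped-down sketch of that pagination pattern is below, reusing the same create_pit/search/delete_pit calls the diff makes; the endpoint, credentials, index name, query, and sort field are placeholders, not values from the package.

    from opensearchpy import OpenSearch

    # Placeholder connection details for illustration only.
    client = OpenSearch(
        ["https://localhost:9200"],
        http_auth=("user", "pass"),
        use_ssl=True,
    )
    index = "graph_service_v1"
    batch_size = 3000

    # Open a point-in-time so every page sees the same snapshot of the index.
    pit = client.create_pit(index, keep_alive="10m")["pit_id"]
    query = {
        "query": {"match_all": {}},              # the real source filters lineage edges here
        "sort": [{"some_sort_field": "asc"}],    # placeholder; search_after needs a deterministic sort
        "pit": {"id": pit, "keep_alive": "10m"},
    }

    try:
        while True:
            results = client.search(body=query, size=batch_size)
            hits = results["hits"]["hits"]
            for doc in hits:
                pass  # process doc["_source"], e.g. count upstream/downstream edges
            if len(hits) < batch_size:
                break  # a short page means the snapshot is exhausted
            # Resume the next page from the sort key of the last hit on this one.
            query["search_after"] = hits[-1]["sort"]
    finally:
        # Always release the PIT, even if a page fails.
        client.delete_pit(body={"pit_id": pit})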
@@ -142,7 +333,7 @@ class DataHubLineageFeaturesSource(Source):
             logger.debug(
                 f"{urn}: {self.upstream_counts[urn]}, {self.downstream_counts[urn]}"
             )
-
+            wu = MetadataChangeProposalWrapper(
                 entityUrn=urn,
                 aspect=LineageFeaturesClass(
                     upstreamCount=self.upstream_counts[urn],
@@ -153,6 +344,8 @@ class DataHubLineageFeaturesSource(Source):
                     ),
                 ),
             ).as_workunit()
+            self.report.report_workunit(wu)
+            yield wu
 
     def get_report(self) -> SourceReport:
         return self.report