acryl-datahub-cloud 0.3.12rc15__py3-none-any.whl → 0.3.12.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub-cloud has been flagged as potentially problematic; see the source registry for details.

@@ -1,6 +1,6 @@
 {
   "name": "acryl-datahub-cloud",
-  "version": "0.3.12rc15",
+  "version": "0.3.12.1rc1",
   "install_requires": [
     "avro-gen3==0.7.16",
     "acryl-datahub"
@@ -1,11 +1,26 @@
 import logging
 import os
+import time
 from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Set
+from typing import Any, Callable, Dict, Iterable, List, Set

 from opensearchpy import OpenSearch
+from opensearchpy.exceptions import (
+    ConnectionError as OpenSearchConnectionError,
+    ConnectionTimeout,
+    RequestError,
+    TransportError,
+)
+from pydantic import validator
+from tenacity import (
+    before_sleep_log,
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)

 from acryl_datahub_cloud.elasticsearch.config import ElasticSearchClientConfig
 from acryl_datahub_cloud.elasticsearch.graph_service import ElasticGraphRow
@@ -20,6 +35,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.schema_classes import AuditStampClass, LineageFeaturesClass

 logger = logging.getLogger(__name__)
@@ -28,13 +44,38 @@ SYSTEM_ACTOR = "urn:li:corpuser:__datahub_system"


 class LineageFeaturesSourceConfig(ConfigModel):
+    enabled: bool = True
     search_index: ElasticSearchClientConfig = ElasticSearchClientConfig()
     query_timeout: int = 30
-    extract_batch_size: int = 2000
+    extract_batch_size: int = 3000
+    max_retries: int = 3
+    retry_delay_seconds: int = 5
+    retry_backoff_multiplier: float = 2.0
+
+    @validator("max_retries")
+    def validate_max_retries(cls, v: int) -> int:
+        if v < 1:
+            raise ValueError("max_retries must be at least 1")
+        return v
+
+    @validator("retry_delay_seconds")
+    def validate_retry_delay_seconds(cls, v: int) -> int:
+        if v < 1:
+            raise ValueError("retry_delay_seconds must be at least 1")
+        return v
+
+    @validator("retry_backoff_multiplier")
+    def validate_retry_backoff_multiplier(cls, v: float) -> float:
+        if v < 1.0:
+            raise ValueError("retry_backoff_multiplier must be at least 1.0")
+        return v


 @dataclass
-class LineageExtractGraphSourceReport(SourceReport):
+class LineageExtractGraphSourceReport(SourceReport, IngestionStageReport):
+    valid_urns_count: int = 0
+    upstream_count: int = 0
+    downstream_count: int = 0
     edges_scanned: int = 0

@@ -42,9 +83,15 @@ class LineageExtractGraphSourceReport(SourceReport):
 @config_class(LineageFeaturesSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 class DataHubLineageFeaturesSource(Source):
+    """
+    DataHub Lineage Features Source that extracts lineage information from Elasticsearch/OpenSearch.
+    """
+
     platform = "datahub"

-    def __init__(self, config: LineageFeaturesSourceConfig, ctx: PipelineContext):
+    def __init__(
+        self, config: LineageFeaturesSourceConfig, ctx: PipelineContext
+    ) -> None:
         super().__init__(ctx)
         self.config: LineageFeaturesSourceConfig = config
         self.report = LineageExtractGraphSourceReport()
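
The new retry settings introduced above are validated at config-parse time, so a bad value fails fast. A minimal sketch of that behaviour (not part of the package; it assumes LineageFeaturesSourceConfig is importable and uses pydantic v1's ValidationError):

    from pydantic import ValidationError

    # Hypothetical usage: values below the validator thresholds are rejected
    # before any ingestion work starts.
    try:
        LineageFeaturesSourceConfig(max_retries=0)
    except ValidationError as err:
        print(err)  # mentions "max_retries must be at least 1"

    # Valid overrides are accepted; unset fields keep their defaults.
    config = LineageFeaturesSourceConfig(max_retries=5, retry_backoff_multiplier=2.0)
    print(config.extract_batch_size)  # 3000, the new default
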
@@ -53,9 +100,140 @@ class DataHubLineageFeaturesSource(Source):
         self.valid_urns: Set[str] = set()
         self.upstream_counts: Dict[str, int] = defaultdict(int)
         self.downstream_counts: Dict[str, int] = defaultdict(int)
+        self.last_print_time = time.time()
+
+    def _get_retry_decorator(
+        self,
+    ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+        """Create a retry decorator based on config parameters"""
+
+        def should_retry_exception(exception: Exception) -> bool:
+            """Custom retry predicate for OpenSearch exceptions"""
+            if isinstance(
+                exception,
+                (
+                    OpenSearchConnectionError,
+                    ConnectionTimeout,
+                    RequestError,
+                    TransportError,
+                ),
+            ):
+                return True
+            # Also retry on general connection and timeout errors
+            if isinstance(exception, (ConnectionError, TimeoutError)):
+                return True
+            return False
+
+        return retry(
+            retry=retry_if_exception_type(
+                (
+                    OpenSearchConnectionError,
+                    ConnectionTimeout,
+                    RequestError,
+                    TransportError,
+                    ConnectionError,
+                    TimeoutError,
+                )
+            ),
+            stop=stop_after_attempt(self.config.max_retries),
+            wait=wait_exponential(
+                multiplier=self.config.retry_backoff_multiplier,
+                min=self.config.retry_delay_seconds,
+                max=30,
+            ),
+            before_sleep=before_sleep_log(logger, logging.WARNING),
+            reraise=True,
+        )
+
+    def _create_pit_with_retry(self, server: OpenSearch, index: str) -> str:
+        """Create a Point-in-Time (PIT) with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _create_pit() -> str:
+            logger.debug(f"Creating PIT for index: {index}")
+            response = server.create_pit(index, keep_alive="10m")
+            pit = response.get("pit_id")
+            if not pit:
+                raise Exception("Failed to create PIT - no pit_id returned")
+            logger.debug(f"Successfully created PIT: {pit}")
+            return pit
+
+        return _create_pit()
+
+    def _search_with_retry(
+        self, server: OpenSearch, query: dict, batch_size: int
+    ) -> dict:
+        """Execute search with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _search() -> dict:
+            logger.debug(f"Executing search with batch size: {batch_size}")
+            return server.search(
+                body=query,
+                size=batch_size,
+                params={"timeout": self.config.query_timeout},
+            )
+
+        return _search()
+
+    def _delete_pit_with_retry(self, server: OpenSearch, pit: str) -> None:
+        """Delete Point-in-Time (PIT) with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _delete_pit() -> None:
+            logger.debug(f"Deleting PIT: {pit}")
+            server.delete_pit(body={"pit_id": pit})
+            logger.debug(f"Successfully deleted PIT: {pit}")
+
+        _delete_pit()
+
+    def _create_opensearch_client_with_retry(self) -> OpenSearch:
+        """Create OpenSearch client with retry logic"""
+        retry_decorator = self._get_retry_decorator()
+
+        @retry_decorator
+        def _create_client() -> OpenSearch:
+            logger.debug(
+                f"Creating OpenSearch client for endpoint: {self.config.search_index.endpoint}"
+            )
+            return OpenSearch(
+                [self.config.search_index.endpoint],
+                http_auth=(
+                    self.config.search_index.username,
+                    self.config.search_index.password,
+                ),
+                use_ssl=self.config.search_index.use_ssl,
+            )
+
+        return _create_client()
+
+    def _update_report(self) -> None:
+        """
+        Information to see whether we are close to hitting the memory limits
+        """
+        self.report.valid_urns_count = len(self.valid_urns)
+        self.report.upstream_count = len(self.upstream_counts.keys())
+        self.report.downstream_count = len(self.downstream_counts.keys())
+
+    def _print_report(self) -> None:
+        """
+        Printing is required like this because the report is only printed
+        when the workunits are yielded
+        In case of background processes we won't know the progress if this is not done
+        """
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self._update_report()
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")

     def process_batch(self, results: Iterable[dict]) -> None:
         for doc in results:
+            self._print_report()
             row = ElasticGraphRow.from_elastic_doc(doc["_source"])
             self.report.edges_scanned += 1
             if (
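
The pattern above builds one tenacity decorator from the source config and wraps each short-lived OpenSearch call in it. A standalone sketch of the same idea (not from the package; make_retry, flaky_call and the numbers are hypothetical, only the tenacity calls mirror the diff):

    import logging

    from tenacity import (
        before_sleep_log,
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    logger = logging.getLogger(__name__)

    def make_retry(max_retries: int, delay_seconds: int, backoff: float):
        # Same shape as _get_retry_decorator: retry only transient errors,
        # stop after N attempts, back off exponentially (capped at 30s),
        # log a warning before each sleep, and re-raise the final failure.
        return retry(
            retry=retry_if_exception_type((ConnectionError, TimeoutError)),
            stop=stop_after_attempt(max_retries),
            wait=wait_exponential(multiplier=backoff, min=delay_seconds, max=30),
            before_sleep=before_sleep_log(logger, logging.WARNING),
            reraise=True,
        )

    @make_retry(max_retries=3, delay_seconds=5, backoff=2.0)
    def flaky_call() -> str:
        # Hypothetical stand-in for server.search(...) or server.create_pit(...).
        raise ConnectionError("transient failure")
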
@@ -65,20 +243,18 @@ class DataHubLineageFeaturesSource(Source):
                 self.upstream_counts[row.source_urn] += 1
                 self.downstream_counts[row.destination_urn] += 1

-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+    def populate_valid_urns(self) -> None:
         graph = self.ctx.require_graph("Load non soft-deleted urns")
         for urn in graph.get_urns_by_filter(batch_size=self.config.extract_batch_size):
+            self._print_report()
             self.valid_urns.add(urn)

+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        with self.report.new_stage("Load valid URNs"):
+            self.populate_valid_urns()
+
         timestamp = datetime.now(tz=timezone.utc)
-        server = OpenSearch(
-            [self.config.search_index.endpoint],
-            http_auth=(
-                self.config.search_index.username,
-                self.config.search_index.password,
-            ),
-            use_ssl=self.config.search_index.use_ssl,
-        )
+        server = self._create_opensearch_client_with_retry()

         query = {
             "query": {
@@ -113,27 +289,42 @@ class DataHubLineageFeaturesSource(Source):
         }

         index = f"{self.config.search_index.index_prefix}graph_service_v1"
-        response = server.create_pit(index, keep_alive="10m")
+        pit = self._create_pit_with_retry(server, index)

         # TODO: Save PIT, we can resume processing based on <pit, search_after> tuple
-        pit = response.get("pit_id")
         query.update({"pit": {"id": pit, "keep_alive": "10m"}})

         # TODO: Using slicing we can parallelize the ES calls below:
         # https://opensearch.org/docs/latest/search-plugins/searching-data/point-in-time/#search-slicing
         batch_size = self.config.extract_batch_size
-        while True:
-            results = server.search(
-                body=query,
-                size=batch_size,
-                params={"timeout": self.config.query_timeout},
-            )
-            self.process_batch(results["hits"]["hits"])
-            if len(results["hits"]["hits"]) < batch_size:
-                break
-            query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
-
-        server.delete_pit(body={"pit_id": pit})
+        with self.report.new_stage("Extract lineage features"):
+            try:
+                while True:
+                    results = self._search_with_retry(server, query, batch_size)
+                    self.process_batch(results["hits"]["hits"])
+                    if len(results["hits"]["hits"]) < batch_size:
+                        break
+                    query.update({"search_after": results["hits"]["hits"][-1]["sort"]})
+            except Exception as e:
+                logger.error(f"Error during lineage extraction: {e}")
+                self.report.report_failure(
+                    title="Lineage extraction failed",
+                    message="Failed to extract lineage features from Elasticsearch",
+                    context=f"Error: {str(e)}",
+                    exc=e,
+                )
+                # Ensure PIT is cleaned up even on error
+                try:
+                    self._delete_pit_with_retry(server, pit)
+                except Exception as cleanup_error:
+                    logger.warning(
+                        f"Failed to cleanup PIT after error: {cleanup_error}"
+                    )
+                raise
+        # So previous stage's calculations are done
+        self.report.new_stage("Extract lineage features End")
+        self._update_report()
+        self._delete_pit_with_retry(server, pit)

         # In Python 3.9, can be replaced by `self.self.upstream_counts.keys() | self.downstream_counts.keys()`
         for urn in set(self.upstream_counts.keys()).union(
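
In isolation, the paging contract the loop above relies on is: open a point-in-time, page with search_after on a stable sort, stop on a short page, and always delete the PIT. A minimal sketch using the same opensearch-py calls as the diff (endpoint, credentials, index name and sort field are hypothetical; the real query body is elided here):

    from opensearchpy import OpenSearch

    client = OpenSearch(["https://localhost:9200"], http_auth=("user", "pass"), use_ssl=True)

    pit = client.create_pit("graph_service_v1", keep_alive="10m")["pit_id"]
    query = {
        "query": {"match_all": {}},
        "sort": [{"some_stable_field": "asc"}],  # placeholder; search_after needs a deterministic sort
        "pit": {"id": pit, "keep_alive": "10m"},
    }
    batch_size = 3000
    try:
        while True:
            hits = client.search(body=query, size=batch_size)["hits"]["hits"]
            # ... process hits ...
            if len(hits) < batch_size:  # a short page means the index is drained
                break
            query["search_after"] = hits[-1]["sort"]  # resume after the last doc's sort values
    finally:
        client.delete_pit(body={"pit_id": pit})  # PITs hold server resources; always clean up
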
@@ -142,7 +333,7 @@ class DataHubLineageFeaturesSource(Source):
             logger.debug(
                 f"{urn}: {self.upstream_counts[urn]}, {self.downstream_counts[urn]}"
             )
-            yield MetadataChangeProposalWrapper(
+            wu = MetadataChangeProposalWrapper(
                 entityUrn=urn,
                 aspect=LineageFeaturesClass(
                     upstreamCount=self.upstream_counts[urn],
@@ -153,6 +344,8 @@ class DataHubLineageFeaturesSource(Source):
                     ),
                 ),
             ).as_workunit()
+            self.report.report_workunit(wu)
+            yield wu

     def get_report(self) -> SourceReport:
         return self.report