acryl-datahub-cloud 0.3.8rc0__py3-none-any.whl → 0.3.8rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.

This version of acryl-datahub-cloud has been flagged as a potentially problematic release.

Files changed (33)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/source.py +0 -1
  3. acryl_datahub_cloud/datahub_metadata_sharing/__init__.py +0 -0
  4. acryl_datahub_cloud/datahub_metadata_sharing/metadata_sharing_source.py +262 -0
  5. acryl_datahub_cloud/datahub_metadata_sharing/query.py +7 -0
  6. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +0 -2
  7. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +0 -1
  8. acryl_datahub_cloud/datahub_reporting/extract_graph.py +0 -1
  9. acryl_datahub_cloud/datahub_reporting/extract_sql.py +0 -1
  10. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +163 -0
  11. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +29 -129
  12. acryl_datahub_cloud/metadata/_urns/urn_defs.py +1612 -1567
  13. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  14. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executor/__init__.py +15 -0
  15. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  16. acryl_datahub_cloud/metadata/schema.avsc +25096 -25347
  17. acryl_datahub_cloud/metadata/schema_classes.py +807 -503
  18. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -0
  19. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +2 -1
  20. acryl_datahub_cloud/metadata/schemas/DataTransformLogic.avsc +63 -0
  21. acryl_datahub_cloud/metadata/schemas/EntityTypeKey.avsc +1 -0
  22. acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +9 -0
  23. acryl_datahub_cloud/metadata/schemas/ExecutionRequestResult.avsc +14 -0
  24. acryl_datahub_cloud/metadata/schemas/PostInfo.avsc +23 -0
  25. acryl_datahub_cloud/metadata/schemas/RemoteExecutorKey.avsc +21 -0
  26. acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +80 -0
  27. acryl_datahub_cloud/metadata/schemas/VersionProperties.avsc +4 -0
  28. acryl_datahub_cloud/metadata/schemas/__init__.py +3 -3
  29. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/METADATA +34 -33
  30. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/RECORD +33 -25
  31. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/WHEEL +1 -1
  32. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/entry_points.txt +1 -0
  33. {acryl_datahub_cloud-0.3.8rc0.dist-info → acryl_datahub_cloud-0.3.8rc1.dist-info}/top_level.txt +0 -0
acryl_datahub_cloud/_codegen_config.json
@@ -1,6 +1,6 @@
 {
     "name": "acryl-datahub-cloud",
-    "version": "0.3.8rc0",
+    "version": "0.3.8rc1",
     "install_requires": [
         "avro-gen3==0.7.16",
         "acryl-datahub"
acryl_datahub_cloud/acryl_cs_issues/source.py
@@ -100,7 +100,6 @@ class AcrylCSIssuesSource(Source):
     def _provision_platform(
         self, platform: str, logo_url: str, graph: DataHubGraph
     ) -> None:
-
         platform_urn = make_data_platform_urn(platform)
         if not graph.exists(platform_urn):
             platform_info = DataPlatformInfoClass(
acryl_datahub_cloud/datahub_metadata_sharing/metadata_sharing_source.py (new file)
@@ -0,0 +1,262 @@
+import logging
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+from pydantic import BaseModel
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from acryl_datahub_cloud.datahub_metadata_sharing.query import (
+    GRAPHQL_SCROLL_SHARED_ENTITIES,
+    GRAPHQL_SHARE_ENTITY,
+)
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.graph.client import DataHubGraph
+
+logger = logging.getLogger(__name__)
+
+
+class GraphQLError(Exception):
+    """Custom exception for GraphQL-specific errors"""
+
+    pass
+
+
+class DataHubMetadataSharingSourceConfig(BaseModel):
+    batch_size: int = 100
+    batch_delay_ms: int = 100
+    max_retries: int = 3
+    initial_retry_delay_ms: int = 1000
+
+
+@dataclass
+class DataHubMetadataSharingSourceReport(SourceReport):
+    entities_shared: int = 0
+    entities_failed: int = 0
+    implicit_entities_skipped: int = 0
+    batches_processed: int = 0
+
+
+@platform_name(id="datahub", platform_name="DataHub")
+@config_class(DataHubMetadataSharingSourceConfig)
+@support_status(SupportStatus.INCUBATING)
+class DataHubMetadataSharingSource(Source):
+    """MetadataSharing Source that reshares entities across DataHub instances"""
+
+    def __init__(
+        self, config: DataHubMetadataSharingSourceConfig, ctx: PipelineContext
+    ):
+        super().__init__(ctx)
+        self.config: DataHubMetadataSharingSourceConfig = config
+        self.report = DataHubMetadataSharingSourceReport()
+        self.graph: Optional[DataHubGraph] = None
+
+    @retry(
+        retry=retry_if_exception_type((GraphQLError, ConnectionError)),
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        reraise=True,
+    )
+    def execute_graphql_with_retry(
+        self, query: str, variables: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """Execute GraphQL query with retry logic"""
+        if self.graph is None:
+            raise ValueError("Graph client not initialized")
+        response = self.graph.execute_graphql(query, variables=variables)
+        error = response.get("error")
+        if error:
+            raise GraphQLError(f"GraphQL error: {error}")
+        return response
+
+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        self.graph = self.ctx.require_graph("Loading default graph coordinates.")
+
+        self.reshare_entities()
+
+        # This source doesn't produce any work units
+        return []
+
+    def reshare_entities(self) -> None:
+        scroll_id: Optional[str] = None
+        current_batch_number: int = 1
+
+        try:
+            while True:
+                next_scroll_id, results = self.scroll_shared_entities(
+                    scroll_id, self.config.batch_size
+                )
+
+                for result in results:
+                    self._process_single_entity(result)
+
+                self.report.batches_processed = current_batch_number
+                self.report.info(
+                    message="Completed sharing batch of entities.",
+                    context=f"{current_batch_number} of size {self.config.batch_size}!",
+                )
+                current_batch_number += 1
+
+                if next_scroll_id is None:
+                    break
+                scroll_id = next_scroll_id  # advance the scroll cursor to the next page
+                time.sleep(self.config.batch_delay_ms / 1000.0)
+
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to process batches",
+                message="Error occurred while processing one or more batches!",
+                context=f"message = {str(e)}",
+                exc=e,
+            )
+            return
+
+        self.report.info(
+            message="Completed sharing all entities.",
+            context=f"Successfully shared {self.report.entities_shared} entities, "
+            f"failed to share {self.report.entities_failed} entities.",
+        )
+
+    # Rest of the methods remain the same...
+
+    def _process_single_entity(self, result: Dict[str, Any]) -> None:
+        """Process a single entity result"""
+        entity_urn = result.get("entity", {}).get("urn", None)
+        share_results = (
+            result.get("entity", {}).get("share", {}).get("lastShareResults", [])
+        )
+
+        if entity_urn is None:
+            self.report.report_warning(
+                message="Failed to resolve entity urn for shared asset! Skipping...",
+                context=f"Response: {str(result)}",
+            )
+            return
+
+        for share_result in share_results:
+            try:
+                destination_data = share_result.get("destination", {})
+                destination_urn = destination_data.get("urn", "")
+                previous_status = share_result.get("status")
+                share_config = share_result.get("shareConfig", {})
+
+                # Important: If there is an implicit entity, we should skip this urn.
+                # This means the entity was not EXPLICITLY shared, so we do not want to explicitly share here.
+                implicit_shared_entity = share_result.get("implicitShareEntity")
+                is_implicitly_shared = (
+                    implicit_shared_entity is not None
+                    and "urn" in implicit_shared_entity
+                )
+
+                if is_implicitly_shared:
+                    self.report.implicit_entities_skipped += 1
+                    continue
+
+                if previous_status != "SUCCESS":
+                    self.report.report_warning(
+                        message="Attempting to share a previously unsuccessful shared entity!",
+                        context=f"entity urn: {entity_urn}, destination urn: {destination_urn}",
+                    )
+
+                lineage_direction = self._determine_lineage_direction(share_config)
+
+                shared = self.share_entity(
+                    entity_urn=entity_urn,
+                    destination_urn=destination_urn,
+                    lineage_direction=lineage_direction,
+                )
+
+                if shared:
+                    self.report.entities_shared += 1
+                else:
+                    self.report.entities_failed += 1
+
+            except Exception:
+                self.report.report_warning(
+                    message="Failed to share single entity!",
+                    context=f"entity urn: {entity_urn}",
+                )
+                logger.exception(f"Error processing entity {entity_urn}")
+                self.report.entities_failed += 1
+
+    def _determine_lineage_direction(
+        self, share_config: Dict[str, Any]
+    ) -> Optional[str]:
+        """Determine lineage direction based on share config"""
+        include_upstreams = share_config.get("enableUpstreamLineage", False)
+        include_downstreams = share_config.get(
+            "enableDownstreamLineage", False
+        )  # Fixed typo
+
+        if include_upstreams and include_downstreams:
+            return "BOTH"
+        if include_upstreams:
+            return "UPSTREAM"
+        if include_downstreams:
+            return "DOWNSTREAM"
+        return None
+
+    def scroll_shared_entities(
+        self, scroll_id: Optional[str], count: int
+    ) -> Tuple[Optional[str], List[Dict[str, Any]]]:
+        """Scroll through shared entities with retry logic"""
+        response = self.execute_graphql_with_retry(
+            GRAPHQL_SCROLL_SHARED_ENTITIES,
+            variables={
+                "scrollId": scroll_id,
+                "count": count,
+            },
+        )
+
+        result = response.get("scrollAcrossEntities", {})
+        return result.get("nextScrollId"), result.get("searchResults", [])
+
+    def share_entity(
+        self, entity_urn: str, destination_urn: str, lineage_direction: Optional[str]
+    ) -> bool:
+        """Share entity with retry logic"""
+        try:
+            response = self.execute_graphql_with_retry(
+                GRAPHQL_SHARE_ENTITY,
+                variables={
+                    "entityUrn": entity_urn,
+                    "destinationUrn": destination_urn,
+                    "lineageDirection": lineage_direction,
+                },
+            )
+
+            result = response.get("shareEntity", {})
+            if not result.get("succeeded", False):
+                self.report.report_failure(
+                    title="Failed to Share Entity",
+                    message="Response reported an unsuccessful share for entity and destination!",
+                    context=f"entity urn: {entity_urn}, destination urn: {destination_urn}",
+                )
+                return False
+
+            return True
+
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to Share Entity",
+                message="Exception occurred while sharing entity",
+                context=f"entity urn: {entity_urn}, destination urn: {destination_urn}",
+                exc=e,
+            )
+            return False
+
+    def get_report(self) -> SourceReport:
+        return self.report
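The new source plugs into DataHub's standard ingestion pipeline but emits no work units; all of its effects happen through GraphQL calls against the server. Below is a minimal sketch of how it might be wired up, assuming the plugin is registered under the hypothetical source type "datahub-metadata-sharing" (the entry_points.txt change above suggests a new registration, but the actual name is not visible in this diff) and a locally reachable DataHub instance:

    # Hypothetical recipe wiring; the source type string and server address are
    # assumptions for illustration. The config keys mirror the pydantic fields
    # in DataHubMetadataSharingSourceConfig above.
    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "datahub-metadata-sharing",  # assumed plugin name
                "config": {
                    "batch_size": 100,      # entities per scroll page
                    "batch_delay_ms": 100,  # pause between pages
                },
            },
            "datahub_api": {"server": "http://localhost:8080"},  # graph the source scrolls against
            # The source returns no work units, so the sink sees nothing.
            "sink": {"type": "console"},
        }
    )
    pipeline.run()
    pipeline.raise_from_status()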
acryl_datahub_cloud/datahub_metadata_sharing/query.py (new file)
@@ -0,0 +1,7 @@
+import pathlib
+
+GRAPHQL_SCROLL_SHARED_ENTITIES = (
+    pathlib.Path(__file__).parent / "scroll_shared_entities.gql"
+).read_text()
+
+GRAPHQL_SHARE_ENTITY = (pathlib.Path(__file__).parent / "share_entity.gql").read_text()
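query.py resolves its GraphQL documents from .gql files that ship next to the module, so those files must be present in the installed wheel; the RECORD changes above are consistent with new package data being added. For illustration only, an importlib.resources equivalent that would also work from zipped installs; this is a sketch, not what the package does:

    # Illustrative alternative to the __file__-relative loading above.
    from importlib.resources import files

    _pkg = files("acryl_datahub_cloud.datahub_metadata_sharing")
    GRAPHQL_SCROLL_SHARED_ENTITIES = (_pkg / "scroll_shared_entities.gql").read_text()
    GRAPHQL_SHARE_ENTITY = (_pkg / "share_entity.gql").read_text()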
acryl_datahub_cloud/datahub_reporting/datahub_dataset.py
@@ -409,7 +409,6 @@ class DataHubBasedS3Dataset:
         physical_uri: str,
         local_file: str,
     ) -> Iterable[MetadataChangeProposalWrapper]:
-
         aspects: List = []
         mcps: List[MetadataChangeProposalWrapper] = self._update_presigned_url(
             dataset_urn, physical_uri
@@ -456,7 +455,6 @@ class DataHubBasedS3Dataset:
         physical_uri: str,
         dataset_properties: Optional[DatasetPropertiesClass] = None,
     ) -> List[MetadataChangeProposalWrapper]:
-
         if self.config.generate_presigned_url:
             external_url = self._generate_presigned_url(physical_uri)
         else:
acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py
@@ -180,7 +180,6 @@ class DataHubFormReportingData(FormData):
     def form_assigned_date(
         self, search_row: DataHubDatasetSearchRow
    ) -> Dict[str, date]:
-
         form_assigned_dates: Dict[str, date] = {}
         forms = self.graph.get_aspect(search_row.urn, FormsClass)
         if not forms:
acryl_datahub_cloud/datahub_reporting/extract_graph.py
@@ -118,7 +118,6 @@ class DataHubReportingExtractGraphSource(Source):
         return skip_extract

     def get_workunits(self):
-
         self.graph = (
             self.ctx.require_graph("Loading default graph coordinates.")
             if self.config.server is None
acryl_datahub_cloud/datahub_reporting/extract_sql.py
@@ -118,7 +118,6 @@ class DataHubReportingExtractSQLSource(Source):
         return skip_extract

     def get_workunits(self):
-
         self.graph = (
             self.ctx.require_graph("Loading default graph coordinates.")
             if self.config.server is None
acryl_datahub_cloud/datahub_usage_reporting/query_builder.py (new file)
@@ -0,0 +1,163 @@
+from typing import Dict
+
+
+class QueryBuilder:
+    @staticmethod
+    def get_soft_deleted_entities_query() -> Dict:
+        return {
+            "sort": [{"urn": {"order": "asc"}}],
+        }
+
+    @staticmethod
+    def get_query_entities_query() -> Dict:
+        return {
+            "sort": [{"urn": {"order": "asc"}}],
+            "query": {
+                "bool": {
+                    "filter": {
+                        "bool": {
+                            "must_not": [
+                                {"term": {"source": "MANUAL"}},
+                            ]
+                        }
+                    }
+                }
+            },
+        }
+
+    @staticmethod
+    def get_upstreams_query() -> Dict:
+        return {
+            "sort": [{"destination.urn": {"order": "asc"}}],
+            "query": {
+                "bool": {
+                    "must": [
+                        {"terms": {"destination.entityType": ["dataset"]}},
+                        {"terms": {"source.entityType": ["dataset"]}},
+                    ]
+                }
+            },
+        }
+
+    @staticmethod
+    def get_dashboard_usage_query(days: int) -> Dict:
+        return {
+            "sort": [{"urn": {"order": "asc"}}],
+            "query": {
+                "bool": {
+                    "filter": {
+                        "bool": {
+                            "must": [
+                                {
+                                    "range": {
+                                        "@timestamp": {
+                                            "gte": f"now-{days}d",
+                                            "lt": "now/d",
+                                        }
+                                    }
+                                },
+                                {"term": {"isExploded": False}},
+                            ]
+                        }
+                    }
+                }
+            },
+        }
+
+    @staticmethod
+    def get_dataset_usage_query(days: int) -> Dict:
+        return {
+            "sort": [{"urn": {"order": "asc"}}],
+            "query": {
+                "bool": {
+                    "filter": {
+                        "bool": {
+                            "must": [
+                                {
+                                    "range": {
+                                        "@timestamp": {
+                                            "gte": f"now-{days}d/d",
+                                            "lt": "now/d",
+                                        }
+                                    }
+                                },
+                                {"term": {"isExploded": False}},
+                                {"range": {"totalSqlQueries": {"gt": 0}}},
+                            ]
+                        }
+                    }
+                }
+            },
+        }
+
+    @staticmethod
+    def get_dataset_write_usage_raw_query(days: int) -> Dict:
+        return {
+            "sort": [{"urn": {"order": "asc"}}, {"@timestamp": {"order": "asc"}}],
+            "query": {
+                "bool": {
+                    "must": [
+                        {
+                            "range": {
+                                "@timestamp": {"gte": f"now-{days}d/d", "lte": "now/d"}
+                            }
+                        },
+                        {"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
+                    ]
+                }
+            },
+            "_source": {
+                "includes": ["urn", "@timestamp"],
+            },
+        }
+
+    @staticmethod
+    def get_dataset_write_usage_composite_query(days: int) -> Dict:
+        return {
+            "query": {
+                "bool": {
+                    "must": [
+                        {
+                            "range": {
+                                "@timestamp": {"gte": f"now-{days}d/d", "lte": "now/d"}
+                            }
+                        },
+                        {"terms": {"operationType": ["INSERT", "UPDATE", "CREATE"]}},
+                    ]
+                }
+            },
+            "aggs": {
+                "urn_count": {
+                    "composite": {
+                        "sources": [
+                            {"dataset_operationaspect_v1": {"terms": {"field": "urn"}}}
+                        ]
+                    }
+                }
+            },
+        }
+
+    @staticmethod
+    def get_query_usage_query(days: int) -> Dict:
+        return {
+            "sort": [{"urn": {"order": "asc"}}],
+            "query": {
+                "bool": {
+                    "filter": {
+                        "bool": {
+                            "must": [
+                                {
+                                    "range": {
+                                        "@timestamp": {
+                                            "gte": f"now-{days}d/d",
+                                            "lt": "now/d",
+                                        }
+                                    }
+                                },
+                                {"term": {"isExploded": False}},
+                            ]
+                        }
+                    }
+                }
+            },
+        }
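Each builder returns a plain Elasticsearch query body; the @timestamp bounds use Elasticsearch date math, where "now-{days}d/d" rounds the lower bound down to the start of that day and "lt": "now/d" excludes the current partial day. A sketch of how a caller might execute one of these bodies, assuming an elasticsearch-py client and an index named after the dataset_operationaspect_v1 key visible in the composite aggregation; both the endpoint and the index name are assumptions not shown in this diff:

    # Hypothetical execution of a QueryBuilder body against a DataHub index.
    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")  # assumed endpoint
    body = QueryBuilder.get_dataset_write_usage_raw_query(days=30)
    resp = es.search(index="dataset_operationaspect_v1", body=body, size=1000)
    for hit in resp["hits"]["hits"]:
        # _source is trimmed to urn and @timestamp by the query's "includes"
        print(hit["_source"]["urn"], hit["_source"]["@timestamp"])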