acryl-datahub 0.15.0.1rc12__py3-none-any.whl → 0.15.0.1rc14__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/METADATA +2369 -2369
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/RECORD +35 -31
- datahub/__init__.py +1 -1
- datahub/emitter/mce_builder.py +3 -3
- datahub/emitter/mcp_patch_builder.py +36 -12
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/gc/dataprocess_cleanup.py +4 -4
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +159 -71
- datahub/ingestion/source/tableau/tableau.py +3 -0
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/metadata/_schema_classes.py +61 -1
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/schema.avsc +64 -29
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
CHANGED

@@ -1,9 +1,10 @@
 import logging
 import time
-from concurrent.futures import ThreadPoolExecutor,
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from
+from threading import Lock
+from typing import Dict, Iterable, List, Optional

 from pydantic import Field

@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn

 logger = logging.getLogger(__name__)

+QUERY_QUERY_ENTITY = """
+query listQueries($input: ScrollAcrossEntitiesInput!) {
+  scrollAcrossEntities(input: $input) {
+    nextScrollId
+    count
+    searchResults {
+      entity {
+        ... on QueryEntity {
+          urn
+        }
+      }
+    }
+  }
+}
+"""
+

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
         default=True, description="Whether to do soft deletion cleanup."
     )
-    retention_days:
+    retention_days: int = Field(
         10,
         description="Number of days to retain metadata in DataHub",
     )
@@ -62,23 +79,30 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         default=None,
         description="Query to filter entities",
     )
+
     limit_entities_delete: Optional[int] = Field(
         25000, description="Max number of entities to delete."
     )

-
-
+    futures_max_at_time: int = Field(
+        1000, description="Max number of futures to have at a time."
+    )
+
+    runtime_limit_seconds: int = Field(
+        7200,  # 2 hours by default
         description="Runtime limit in seconds",
     )


 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-
-
-
-
-
+    num_queries_found: int = 0
+    num_soft_deleted_entity_processed: int = 0
+    num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_entity_removal_started: int = 0
+    num_hard_deleted: int = 0
+    num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+    sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )

@@ -103,48 +127,53 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
+        self.start_time = 0.0
+        self._report_lock: Lock = Lock()
+        self.last_print_time = 0.0
+
+    def _increment_retained_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age += 1
+
+    def _increment_removal_started_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_entity_removal_started += 1
+
+    def _update_report(self, urn: str, entity_type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_hard_deleted += 1
+
+            current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+            self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+            if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+                self.report.sample_hard_deleted_aspects_by_type[
+                    entity_type
+                ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph

         entity_urn = Urn.from_string(urn)
-        self.report.num_soft_deleted_entity_removed += 1
-        self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
-            self.report.num_soft_deleted_entity_removed_by_type.get(
-                entity_urn.entity_type, 0
-            )
-            + 1
-        )
-        if (
-            entity_urn.entity_type
-            not in self.report.sample_soft_deleted_removed_aspects_by_type
-        ):
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ] = LossyList()
-        self.report.sample_soft_deleted_removed_aspects_by_type[
-            entity_urn.entity_type
-        ].append(urn)
-
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
-
+        self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
             urn=urn,
             dry_run=False,
         )
+        self._update_report(urn, entity_urn.entity_type)

     def delete_soft_deleted_entity(self, urn: str) -> None:
         assert self.ctx.graph

-        if self.config.retention_days is None:
-            logger.info("Retention days is not set, skipping soft delete cleanup")
-            return
-
         retention_time = (
             int(datetime.now(timezone.utc).timestamp())
             - self.config.retention_days * 24 * 60 * 60
@@ -157,15 +186,85 @@ class SoftDeletedEntitiesCleanup:
         ]["created"]["time"] < (retention_time * 1000):
             logger.debug(f"Hard deleting {urn}")
             self.delete_entity(urn)
+        else:
+            self._increment_retained_count()
+
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
+    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+        futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+        for future in done:
+            self._print_report()
+            if future.exception():
+                logger.error(
+                    f"Failed to delete entity {futures[future]}: {future.exception()}"
+                )
+                self.report.failure(
+                    f"Failed to delete entity {futures[future]}",
+                    exc=future.exception(),
+                )
+            self.report.num_soft_deleted_entity_processed += 1
+            if (
+                self.report.num_soft_deleted_entity_processed % self.config.batch_size
+                == 0
+            ):
+                if self.config.delay:
+                    logger.debug(
+                        f"Sleeping for {self.config.delay} seconds before further processing batch"
+                    )
+                    time.sleep(self.config.delay)
+        return futures

-    def
-        if not self.config.enabled:
-            return
+    def _get_soft_deleted_queries(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
+        scroll_id: Optional[str] = None
+        while True:
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    QUERY_QUERY_ENTITY,
+                    {
+                        "input": {
+                            "types": ["QUERY"],
+                            "query": "*",
+                            "scrollId": scroll_id if scroll_id else None,
+                            "count": self.config.batch_size,
+                            "orFilters": [
+                                {
+                                    "and": [
+                                        {
+                                            "field": "removed",
+                                            "values": ["true"],
+                                            "condition": "EQUAL",
+                                        }
+                                    ]
+                                }
+                            ],
+                        }
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get queries with {scroll_id}", exc=e
+                )
+                break
+            scroll_across_entities = result.get("scrollAcrossEntities")
+            if not scroll_across_entities:
+                break
+            scroll_id = scroll_across_entities.get("nextScrollId")
+            self.report.num_queries_found += scroll_across_entities.get("count")
+            for query in scroll_across_entities.get("searchResults"):
+                yield query["entity"]["urn"]
+
+    def _get_urns(self) -> Iterable[str]:
+        assert self.ctx.graph
+        yield from self.ctx.graph.get_urns_by_filter(
             entity_types=self.config.entity_types,
             platform=self.config.platform,
             env=self.config.env,
@@ -173,52 +272,41 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
+        yield from self._get_soft_deleted_queries()
+
+    def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
+        self.start_time = time.time()

-        futures =
+        futures: Dict[Future, str] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
-
-
-
+            for urn in self._get_urns():
+                self._print_report()
+                while len(futures) >= self.config.futures_max_at_time:
+                    futures = self._process_futures(futures)
                 if (
                     self.config.limit_entities_delete
-                    and
+                    and self.report.num_hard_deleted > self.config.limit_entities_delete
                 ):
                     logger.info(
-                        f"Limit of {self.config.limit_entities_delete} entities reached.
+                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more."
                     )
                     break
                 if (
                     self.config.runtime_limit_seconds
-                    and time.time() -
+                    and time.time() - self.start_time
+                    > self.config.runtime_limit_seconds
                 ):
                     logger.info(
-                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached.
+                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures."
                     )
                     break

                 future = executor.submit(self.delete_soft_deleted_entity, urn)
                 futures[future] = urn

-
-
-
-
-                logger.error(
-                    f"Failed to delete entity {futures[future]}: {future.exception()}"
-                )
-                self.report.failure(
-                    f"Failed to delete entity {futures[future]}",
-                    exc=future.exception(),
-                )
-                deleted_count_retention += 1
-
-                if deleted_count_retention % self.config.batch_size == 0:
-                    logger.info(
-                        f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
-                    )
-
-                if self.config.delay:
-                    logger.debug(
-                        f"Sleeping for {self.config.delay} seconds before getting next batch"
-                    )
-                    time.sleep(self.config.delay)
+            logger.info(f"Waiting for {len(futures)} futures to complete")
+            while len(futures) > 0:
+                self._print_report()
+                futures = self._process_futures(futures)
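The rewritten cleanup above submits hard deletions through a ThreadPoolExecutor and throttles itself with the new futures_max_at_time cap, the limit_entities_delete ceiling, and the runtime_limit_seconds budget. A minimal sketch of setting these knobs on the config model (field names come from the diff above; the values are illustrative and the remaining fields are assumed to keep their defaults):

from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
    SoftDeletedEntitiesCleanupConfig,
)

config = SoftDeletedEntitiesCleanupConfig(
    enabled=True,
    retention_days=10,            # only hard-delete entities soft-deleted more than 10 days ago
    limit_entities_delete=25000,  # stop once this many entities have been hard deleted
    futures_max_at_time=1000,     # new in this release: cap on in-flight deletion futures
    runtime_limit_seconds=7200,   # stop submitting new deletions after 2 hours
)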
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -109,6 +109,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
     make_filter,
     make_fine_grained_lineage_class,
     make_upstream_class,
+    optimize_query_filter,
     published_datasource_graphql_query,
     query_metadata_cursor_based_pagination,
     sheet_graphql_query,
@@ -1363,6 +1364,8 @@ class TableauSiteSource:
         query_filter: dict = {},
         page_size_override: Optional[int] = None,
     ) -> Iterable[dict]:
+        query_filter = optimize_query_filter(query_filter)
+
         # Calls the get_connection_object_page function to get the objects,
         # and automatically handles pagination.
         page_size = page_size_override or self.config.page_size
datahub/ingestion/source/tableau/tableau_common.py
CHANGED

@@ -1,3 +1,4 @@
+import copy
 import html
 import json
 import logging
@@ -35,6 +36,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
+from datahub.utilities.ordered_set import OrderedSet

 logger = logging.getLogger(__name__)

@@ -1000,3 +1002,19 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
     ]

     return filter_pages
+
+
+def optimize_query_filter(query_filter: dict) -> dict:
+    """
+    Duplicates in the filter cause duplicates in the result,
+    leading to entities/aspects being emitted multiple times unnecessarily
+    """
+    optimized_query = copy.deepcopy(query_filter)
+
+    if query_filter.get(c.ID_WITH_IN):
+        optimized_query[c.ID_WITH_IN] = list(OrderedSet(query_filter[c.ID_WITH_IN]))
+    if query_filter.get(c.PROJECT_NAME_WITH_IN):
+        optimized_query[c.PROJECT_NAME_WITH_IN] = list(
+            OrderedSet(query_filter[c.PROJECT_NAME_WITH_IN])
+        )
+    return optimized_query
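The new optimize_query_filter helper de-duplicates the ID and project-name lists of a Tableau query filter before pagination, so repeated values no longer cause the same entities and aspects to be fetched and emitted more than once. A small sketch of the idea behind it, using the OrderedSet imported above (the values are illustrative):

from datahub.utilities.ordered_set import OrderedSet

# OrderedSet drops duplicates while preserving first-seen order.
luids = ["luid-1", "luid-2", "luid-1", "luid-3", "luid-2"]
assert list(OrderedSet(luids)) == ["luid-1", "luid-2", "luid-3"]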
datahub/metadata/_schema_classes.py
CHANGED

@@ -4053,6 +4053,60 @@ class DataPlatformInstanceClass(_Aspect):
         self._inner_dict['instance'] = value


+class DataTransformClass(DictWrapper):
+    """Information about a transformation. It may be a query,"""
+
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.common.DataTransform")
+    def __init__(self,
+        queryStatement: Union[None, "QueryStatementClass"]=None,
+    ):
+        super().__init__()
+
+        self.queryStatement = queryStatement
+
+    def _restore_defaults(self) -> None:
+        self.queryStatement = self.RECORD_SCHEMA.fields_dict["queryStatement"].default
+
+
+    @property
+    def queryStatement(self) -> Union[None, "QueryStatementClass"]:
+        """The data transform may be defined by a query statement"""
+        return self._inner_dict.get('queryStatement')  # type: ignore
+
+    @queryStatement.setter
+    def queryStatement(self, value: Union[None, "QueryStatementClass"]) -> None:
+        self._inner_dict['queryStatement'] = value
+
+
+class DataTransformLogicClass(_Aspect):
+    """Information about a Query against one or more data assets (e.g. Tables or Views)."""
+
+
+    ASPECT_NAME = 'dataTransformLogic'
+    ASPECT_INFO = {}
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.common.DataTransformLogic")
+
+    def __init__(self,
+        transforms: List["DataTransformClass"],
+    ):
+        super().__init__()
+
+        self.transforms = transforms
+
+    def _restore_defaults(self) -> None:
+        self.transforms = list()
+
+
+    @property
+    def transforms(self) -> List["DataTransformClass"]:
+        """List of transformations applied"""
+        return self._inner_dict.get('transforms')  # type: ignore
+
+    @transforms.setter
+    def transforms(self, value: List["DataTransformClass"]) -> None:
+        self._inner_dict['transforms'] = value
+
+
 class DeprecationClass(_Aspect):
     """Deprecation status of an entity"""

@@ -14624,7 +14678,7 @@ class DataJobKeyClass(_Aspect):


     ASPECT_NAME = 'dataJobKey'
-    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults']}
+    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")

     def __init__(self,
@@ -24715,6 +24769,8 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.common.CostCostDiscriminator': CostCostDiscriminatorClass,
     'com.linkedin.pegasus2avro.common.CostType': CostTypeClass,
     'com.linkedin.pegasus2avro.common.DataPlatformInstance': DataPlatformInstanceClass,
+    'com.linkedin.pegasus2avro.common.DataTransform': DataTransformClass,
+    'com.linkedin.pegasus2avro.common.DataTransformLogic': DataTransformLogicClass,
     'com.linkedin.pegasus2avro.common.Deprecation': DeprecationClass,
     'com.linkedin.pegasus2avro.common.Documentation': DocumentationClass,
     'com.linkedin.pegasus2avro.common.DocumentationAssociation': DocumentationAssociationClass,
@@ -25182,6 +25238,8 @@ __SCHEMA_TYPES = {
     'CostCostDiscriminator': CostCostDiscriminatorClass,
     'CostType': CostTypeClass,
     'DataPlatformInstance': DataPlatformInstanceClass,
+    'DataTransform': DataTransformClass,
+    'DataTransformLogic': DataTransformLogicClass,
     'Deprecation': DeprecationClass,
     'Documentation': DocumentationClass,
     'DocumentationAssociation': DocumentationAssociationClass,
@@ -25588,6 +25646,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
     CostClass,
     BrowsePathsClass,
     InstitutionalMemoryClass,
+    DataTransformLogicClass,
     SubTypesClass,
     FormsClass,
     DeprecationClass,
@@ -25802,6 +25861,7 @@ class AspectBag(TypedDict, total=False):
     cost: CostClass
     browsePaths: BrowsePathsClass
     institutionalMemory: InstitutionalMemoryClass
+    dataTransformLogic: DataTransformLogicClass
     subTypes: SubTypesClass
     forms: FormsClass
     deprecation: DeprecationClass
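The generated classes above introduce the dataTransformLogic aspect, which is now also listed among the dataJob entity aspects in DataJobKeyClass. A minimal sketch of building the aspect with these classes, assuming the usual import path datahub.metadata.schema_classes and an illustrative query text; QueryStatementClass is the existing generated class for the QueryStatement record, whose language defaults to "SQL":

from datahub.metadata.schema_classes import (
    DataTransformClass,
    DataTransformLogicClass,
    QueryStatementClass,
)

# Illustrative aspect instance: one transform defined by a SQL query statement.
aspect = DataTransformLogicClass(
    transforms=[
        DataTransformClass(
            queryStatement=QueryStatementClass(value="SELECT id, name FROM src_table")
        )
    ]
)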
datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py
CHANGED

@@ -19,6 +19,8 @@ from .....schema_classes import CostCostClass
 from .....schema_classes import CostCostDiscriminatorClass
 from .....schema_classes import CostTypeClass
 from .....schema_classes import DataPlatformInstanceClass
+from .....schema_classes import DataTransformClass
+from .....schema_classes import DataTransformLogicClass
 from .....schema_classes import DeprecationClass
 from .....schema_classes import DocumentationClass
 from .....schema_classes import DocumentationAssociationClass
@@ -79,6 +81,8 @@ CostCost = CostCostClass
 CostCostDiscriminator = CostCostDiscriminatorClass
 CostType = CostTypeClass
 DataPlatformInstance = DataPlatformInstanceClass
+DataTransform = DataTransformClass
+DataTransformLogic = DataTransformLogicClass
 Deprecation = DeprecationClass
 Documentation = DocumentationClass
 DocumentationAssociation = DocumentationAssociationClass
datahub/metadata/schema.avsc
CHANGED
@@ -400,6 +400,69 @@
     ],
     "doc": "Institutional memory of an entity. This is a way to link to relevant documentation and provide description of the documentation. Institutional or tribal knowledge is very important for users to leverage the entity."
   },
+  {
+    "type": "record",
+    "Aspect": {
+      "name": "dataTransformLogic"
+    },
+    "name": "DataTransformLogic",
+    "namespace": "com.linkedin.pegasus2avro.common",
+    "fields": [
+      {
+        "type": {
+          "type": "array",
+          "items": {
+            "type": "record",
+            "name": "DataTransform",
+            "namespace": "com.linkedin.pegasus2avro.common",
+            "fields": [
+              {
+                "type": [
+                  "null",
+                  {
+                    "type": "record",
+                    "name": "QueryStatement",
+                    "namespace": "com.linkedin.pegasus2avro.query",
+                    "fields": [
+                      {
+                        "type": "string",
+                        "name": "value",
+                        "doc": "The query text"
+                      },
+                      {
+                        "type": {
+                          "type": "enum",
+                          "symbolDocs": {
+                            "SQL": "A SQL Query"
+                          },
+                          "name": "QueryLanguage",
+                          "namespace": "com.linkedin.pegasus2avro.query",
+                          "symbols": [
+                            "SQL"
+                          ]
+                        },
+                        "name": "language",
+                        "default": "SQL",
+                        "doc": "The language of the Query, e.g. SQL."
+                      }
+                    ],
+                    "doc": "A query statement against one or more data assets."
+                  }
+                ],
+                "name": "queryStatement",
+                "default": null,
+                "doc": "The data transform may be defined by a query statement"
+              }
+            ],
+            "doc": "Information about a transformation. It may be a query,"
+          }
+        },
+        "name": "transforms",
+        "doc": "List of transformations applied"
+      }
+    ],
+    "doc": "Information about a Query against one or more data assets (e.g. Tables or Views)."
+  },
   {
     "type": "record",
     "Aspect": {
@@ -11947,35 +12010,7 @@
       "namespace": "com.linkedin.pegasus2avro.query",
       "fields": [
         {
-          "type":
-            "type": "record",
-            "name": "QueryStatement",
-            "namespace": "com.linkedin.pegasus2avro.query",
-            "fields": [
-              {
-                "type": "string",
-                "name": "value",
-                "doc": "The query text"
-              },
-              {
-                "type": {
-                  "type": "enum",
-                  "symbolDocs": {
-                    "SQL": "A SQL Query"
-                  },
-                  "name": "QueryLanguage",
-                  "namespace": "com.linkedin.pegasus2avro.query",
-                  "symbols": [
-                    "SQL"
-                  ]
-                },
-                "name": "language",
-                "default": "SQL",
-                "doc": "The language of the Query, e.g. SQL."
-              }
-            ],
-            "doc": "A query statement against one or more data assets."
-          },
+          "type": "com.linkedin.pegasus2avro.query.QueryStatement",
           "name": "statement",
           "doc": "The Query Statement."
         },
datahub/metadata/schemas/DataTransformLogic.avsc
ADDED

@@ -0,0 +1,63 @@
+{
+  "type": "record",
+  "Aspect": {
+    "name": "dataTransformLogic"
+  },
+  "name": "DataTransformLogic",
+  "namespace": "com.linkedin.pegasus2avro.common",
+  "fields": [
+    {
+      "type": {
+        "type": "array",
+        "items": {
+          "type": "record",
+          "name": "DataTransform",
+          "namespace": "com.linkedin.pegasus2avro.common",
+          "fields": [
+            {
+              "type": [
+                "null",
+                {
+                  "type": "record",
+                  "name": "QueryStatement",
+                  "namespace": "com.linkedin.pegasus2avro.query",
+                  "fields": [
+                    {
+                      "type": "string",
+                      "name": "value",
+                      "doc": "The query text"
+                    },
+                    {
+                      "type": {
+                        "type": "enum",
+                        "symbolDocs": {
+                          "SQL": "A SQL Query"
+                        },
+                        "name": "QueryLanguage",
+                        "namespace": "com.linkedin.pegasus2avro.query",
+                        "symbols": [
+                          "SQL"
+                        ]
+                      },
+                      "name": "language",
+                      "default": "SQL",
+                      "doc": "The language of the Query, e.g. SQL."
+                    }
+                  ],
+                  "doc": "A query statement against one or more data assets."
+                }
+              ],
+              "name": "queryStatement",
+              "default": null,
+              "doc": "The data transform may be defined by a query statement"
+            }
+          ],
+          "doc": "Information about a transformation. It may be a query,"
+        }
+      },
+      "name": "transforms",
+      "doc": "List of transformations applied"
+    }
+  ],
+  "doc": "Information about a Query against one or more data assets (e.g. Tables or Views)."
+}