acryl-datahub 0.15.0.1rc12__py3-none-any.whl → 0.15.0.1rc14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of acryl-datahub has been flagged by the registry as a potentially problematic release.

Files changed (38):
  1. {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/METADATA +2369 -2369
  2. {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/RECORD +35 -31
  3. datahub/__init__.py +1 -1
  4. datahub/emitter/mce_builder.py +3 -3
  5. datahub/emitter/mcp_patch_builder.py +36 -12
  6. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  7. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  8. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  9. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  10. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  11. datahub/ingestion/source/gc/dataprocess_cleanup.py +4 -4
  12. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +159 -71
  13. datahub/ingestion/source/tableau/tableau.py +3 -0
  14. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  15. datahub/metadata/_schema_classes.py +61 -1
  16. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  17. datahub/metadata/schema.avsc +64 -29
  18. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  19. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  20. datahub/specific/aspect_helpers/__init__.py +0 -0
  21. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  22. datahub/specific/aspect_helpers/ownership.py +67 -0
  23. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  24. datahub/specific/aspect_helpers/tags.py +42 -0
  25. datahub/specific/aspect_helpers/terms.py +43 -0
  26. datahub/specific/chart.py +28 -184
  27. datahub/specific/dashboard.py +31 -196
  28. datahub/specific/datajob.py +34 -189
  29. datahub/specific/dataproduct.py +24 -86
  30. datahub/specific/dataset.py +48 -133
  31. datahub/specific/form.py +12 -32
  32. datahub/specific/structured_property.py +9 -9
  33. datahub/specific/custom_properties.py +0 -37
  34. datahub/specific/ownership.py +0 -48
  35. datahub/specific/structured_properties.py +0 -53
  36. {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/WHEEL +0 -0
  37. {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/entry_points.txt +0 -0
  38. {acryl_datahub-0.15.0.1rc12.dist-info → acryl_datahub-0.15.0.1rc14.dist-info}/top_level.txt +0 -0
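
Most of the churn under datahub/specific/ comes from moving the shared patch-builder helpers (custom properties, ownership, structured properties, tags, terms) out of the per-entity modules and into the new datahub/specific/aspect_helpers/ package. A minimal sketch of the patch-builder usage this refactor is meant to preserve; the method names (add_custom_property, add_tag, add_owner) are assumed from the pre-existing DatasetPatchBuilder API and are not shown in this diff:

# Sketch only: assumes DatasetPatchBuilder keeps its fluent helper methods
# after the aspect_helpers refactor; adjust names if your version differs.
from datahub.emitter.mce_builder import make_dataset_urn, make_user_urn
from datahub.metadata.schema_classes import (
    OwnerClass,
    OwnershipTypeClass,
    TagAssociationClass,
)
from datahub.specific.dataset import DatasetPatchBuilder

patch = (
    DatasetPatchBuilder(make_dataset_urn("snowflake", "db.schema.table", "PROD"))
    .add_custom_property("team", "data-platform")
    .add_tag(TagAssociationClass(tag="urn:li:tag:pii"))
    .add_owner(OwnerClass(owner=make_user_urn("jdoe"), type=OwnershipTypeClass.DATAOWNER))
)
for mcp in patch.build():
    print(mcp.aspectName)  # one JSON-patch MCP per touched aspect
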
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -1,9 +1,10 @@
 import logging
 import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import List, Optional
+from threading import Lock
+from typing import Dict, Iterable, List, Optional
 
 from pydantic import Field
 
@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn
 
 logger = logging.getLogger(__name__)
 
+QUERY_QUERY_ENTITY = """
+query listQueries($input: ScrollAcrossEntitiesInput!) {
+  scrollAcrossEntities(input: $input) {
+    nextScrollId
+    count
+    searchResults {
+      entity {
+        ... on QueryEntity {
+          urn
+        }
+      }
+    }
+  }
+}
+"""
+
 
 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
         default=True, description="Whether to do soft deletion cleanup."
     )
-    retention_days: Optional[int] = Field(
+    retention_days: int = Field(
         10,
         description="Number of days to retain metadata in DataHub",
     )
@@ -62,23 +79,30 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         default=None,
         description="Query to filter entities",
     )
+
     limit_entities_delete: Optional[int] = Field(
         25000, description="Max number of entities to delete."
     )
 
-    runtime_limit_seconds: Optional[int] = Field(
-        None,
+    futures_max_at_time: int = Field(
+        1000, description="Max number of futures to have at a time."
+    )
+
+    runtime_limit_seconds: int = Field(
+        7200,  # 2 hours by default
         description="Runtime limit in seconds",
     )
 
 
 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-    num_soft_deleted_entity_removed: int = 0
-    num_soft_deleted_entity_removed_by_type: TopKDict[str, int] = field(
-        default_factory=TopKDict
-    )
-    sample_soft_deleted_removed_aspects_by_type: TopKDict[str, LossyList[str]] = field(
+    num_queries_found: int = 0
+    num_soft_deleted_entity_processed: int = 0
+    num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_entity_removal_started: int = 0
+    num_hard_deleted: int = 0
+    num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+    sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
 
@@ -103,48 +127,53 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
+        self.start_time = 0.0
+        self._report_lock: Lock = Lock()
+        self.last_print_time = 0.0
+
+    def _increment_retained_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age += 1
+
+    def _increment_removal_started_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_entity_removal_started += 1
+
+    def _update_report(self, urn: str, entity_type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_hard_deleted += 1
+
+            current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+            self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+            if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+                self.report.sample_hard_deleted_aspects_by_type[
+                    entity_type
+                ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
         entity_urn = Urn.from_string(urn)
-        self.report.num_soft_deleted_entity_removed += 1
-        self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
-            self.report.num_soft_deleted_entity_removed_by_type.get(
-                entity_urn.entity_type, 0
-            )
-            + 1
-        )
-        if (
-            entity_urn.entity_type
-            not in self.report.sample_soft_deleted_removed_aspects_by_type
-        ):
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ] = LossyList()
-        self.report.sample_soft_deleted_removed_aspects_by_type[
-            entity_urn.entity_type
-        ].append(urn)
-
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
-
+        self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
             urn=urn,
             dry_run=False,
         )
+        self._update_report(urn, entity_urn.entity_type)
 
     def delete_soft_deleted_entity(self, urn: str) -> None:
         assert self.ctx.graph
 
-        if self.config.retention_days is None:
-            logger.info("Retention days is not set, skipping soft delete cleanup")
-            return
-
         retention_time = (
             int(datetime.now(timezone.utc).timestamp())
             - self.config.retention_days * 24 * 60 * 60
@@ -157,15 +186,85 @@ class SoftDeletedEntitiesCleanup:
         ]["created"]["time"] < (retention_time * 1000):
             logger.debug(f"Hard deleting {urn}")
             self.delete_entity(urn)
+        else:
+            self._increment_retained_count()
+
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
+    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+        futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+        for future in done:
+            self._print_report()
+            if future.exception():
+                logger.error(
+                    f"Failed to delete entity {futures[future]}: {future.exception()}"
+                )
+                self.report.failure(
+                    f"Failed to delete entity {futures[future]}",
+                    exc=future.exception(),
+                )
+            self.report.num_soft_deleted_entity_processed += 1
+            if (
+                self.report.num_soft_deleted_entity_processed % self.config.batch_size
+                == 0
+            ):
+                if self.config.delay:
+                    logger.debug(
+                        f"Sleeping for {self.config.delay} seconds before further processing batch"
+                    )
+                    time.sleep(self.config.delay)
+        return futures
 
-    def cleanup_soft_deleted_entities(self) -> None:
-        if not self.config.enabled:
-            return
+    def _get_soft_deleted_queries(self) -> Iterable[str]:
         assert self.ctx.graph
-        start_time = time.time()
-
-        deleted_count_retention = 0
-        urns = self.ctx.graph.get_urns_by_filter(
+        scroll_id: Optional[str] = None
+        while True:
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    QUERY_QUERY_ENTITY,
+                    {
+                        "input": {
+                            "types": ["QUERY"],
+                            "query": "*",
+                            "scrollId": scroll_id if scroll_id else None,
+                            "count": self.config.batch_size,
+                            "orFilters": [
+                                {
+                                    "and": [
+                                        {
+                                            "field": "removed",
+                                            "values": ["true"],
+                                            "condition": "EQUAL",
+                                        }
+                                    ]
+                                }
+                            ],
+                        }
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get queries with {scroll_id}", exc=e
+                )
+                break
+            scroll_across_entities = result.get("scrollAcrossEntities")
+            if not scroll_across_entities:
+                break
+            scroll_id = scroll_across_entities.get("nextScrollId")
+            self.report.num_queries_found += scroll_across_entities.get("count")
+            for query in scroll_across_entities.get("searchResults"):
+                yield query["entity"]["urn"]
+
+    def _get_urns(self) -> Iterable[str]:
+        assert self.ctx.graph
+        yield from self.ctx.graph.get_urns_by_filter(
             entity_types=self.config.entity_types,
             platform=self.config.platform,
             env=self.config.env,
@@ -173,52 +272,41 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
+        yield from self._get_soft_deleted_queries()
+
+    def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
+        self.start_time = time.time()
 
-        futures = {}
+        futures: Dict[Future, str] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
-            num_urns_submitted = 0
-            for urn in urns:
-                num_urns_submitted += 1
+            for urn in self._get_urns():
+                self._print_report()
+                while len(futures) >= self.config.futures_max_at_time:
+                    futures = self._process_futures(futures)
                 if (
                     self.config.limit_entities_delete
-                    and num_urns_submitted > self.config.limit_entities_delete
+                    and self.report.num_hard_deleted > self.config.limit_entities_delete
                 ):
                     logger.info(
-                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopping"
+                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopped adding more."
                     )
                     break
                 if (
                     self.config.runtime_limit_seconds
-                    and time.time() - start_time > self.config.runtime_limit_seconds
+                    and time.time() - self.start_time
+                    > self.config.runtime_limit_seconds
                 ):
                     logger.info(
-                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Stopping"
+                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Not submitting more futures."
                     )
                     break
 
                 future = executor.submit(self.delete_soft_deleted_entity, urn)
                 futures[future] = urn
 
-        if not futures:
-            return
-        for future in as_completed(futures):
-            if future.exception():
-                logger.error(
-                    f"Failed to delete entity {futures[future]}: {future.exception()}"
-                )
-                self.report.failure(
-                    f"Failed to delete entity {futures[future]}",
-                    exc=future.exception(),
-                )
-            deleted_count_retention += 1
-
-            if deleted_count_retention % self.config.batch_size == 0:
-                logger.info(
-                    f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
-                )
-
-                if self.config.delay:
-                    logger.debug(
-                        f"Sleeping for {self.config.delay} seconds before getting next batch"
-                    )
-                    time.sleep(self.config.delay)
+            logger.info(f"Waiting for {len(futures)} futures to complete")
+            while len(futures) > 0:
+                self._print_report()
+                futures = self._process_futures(futures)
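
Net effect of the soft_deleted_entity_cleanup.py changes: soft-deleted QUERY entities are now scrolled via GraphQL in addition to the regular URN filter, hard deletes run through a bounded pool of futures, and the report is logged every two minutes. A minimal sketch of constructing the updated config with the new knobs, using the defaults visible in the diff (other fields such as batch_size, delay, and max_workers keep their existing defaults):

# Sketch only: field names and defaults are taken from the diff above.
from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
    SoftDeletedEntitiesCleanupConfig,
)

config = SoftDeletedEntitiesCleanupConfig(
    retention_days=10,            # now a plain int; None no longer skips the cleanup
    limit_entities_delete=25000,  # stop submitting once this many hard deletes have run
    futures_max_at_time=1000,     # new: cap on delete futures pending at any time
    runtime_limit_seconds=7200,   # new default: 2 hours instead of unlimited
)
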
datahub/ingestion/source/tableau/tableau.py

@@ -109,6 +109,7 @@ from datahub.ingestion.source.tableau.tableau_common import (
     make_filter,
     make_fine_grained_lineage_class,
     make_upstream_class,
+    optimize_query_filter,
     published_datasource_graphql_query,
     query_metadata_cursor_based_pagination,
     sheet_graphql_query,
@@ -1363,6 +1364,8 @@ class TableauSiteSource:
         query_filter: dict = {},
         page_size_override: Optional[int] = None,
     ) -> Iterable[dict]:
+        query_filter = optimize_query_filter(query_filter)
+
         # Calls the get_connection_object_page function to get the objects,
         # and automatically handles pagination.
         page_size = page_size_override or self.config.page_size
datahub/ingestion/source/tableau/tableau_common.py

@@ -1,3 +1,4 @@
+import copy
 import html
 import json
 import logging
@@ -35,6 +36,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
+from datahub.utilities.ordered_set import OrderedSet
 
 logger = logging.getLogger(__name__)
 
@@ -1000,3 +1002,19 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
     ]
 
     return filter_pages
+
+
+def optimize_query_filter(query_filter: dict) -> dict:
+    """
+    Duplicates in the filter cause duplicates in the result,
+    leading to entities/aspects being emitted multiple times unnecessarily
+    """
+    optimized_query = copy.deepcopy(query_filter)
+
+    if query_filter.get(c.ID_WITH_IN):
+        optimized_query[c.ID_WITH_IN] = list(OrderedSet(query_filter[c.ID_WITH_IN]))
+    if query_filter.get(c.PROJECT_NAME_WITH_IN):
+        optimized_query[c.PROJECT_NAME_WITH_IN] = list(
+            OrderedSet(query_filter[c.PROJECT_NAME_WITH_IN])
+        )
+    return optimized_query
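
The new optimize_query_filter helper deduplicates the ID and project-name filters while preserving order, which keeps duplicated filter entries from emitting the same entities twice. A small sketch of the behaviour; it assumes tableau_constant is the module tableau_common already imports as c, and the luid values are made up:

# Sketch only: c.ID_WITH_IN is the same constant tableau_common uses internally.
from datahub.ingestion.source.tableau import tableau_constant as c
from datahub.ingestion.source.tableau.tableau_common import optimize_query_filter

query_filter = {c.ID_WITH_IN: ["luid-1", "luid-2", "luid-1"]}  # duplicate luid-1
deduped = optimize_query_filter(query_filter)

assert deduped[c.ID_WITH_IN] == ["luid-1", "luid-2"]                  # order kept, duplicate dropped
assert query_filter[c.ID_WITH_IN] == ["luid-1", "luid-2", "luid-1"]   # input left untouched (deepcopy)
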
datahub/metadata/_schema_classes.py

@@ -4053,6 +4053,60 @@ class DataPlatformInstanceClass(_Aspect):
         self._inner_dict['instance'] = value
 
 
+class DataTransformClass(DictWrapper):
+    """Information about a transformation. It may be a query,"""
+
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.common.DataTransform")
+    def __init__(self,
+        queryStatement: Union[None, "QueryStatementClass"]=None,
+    ):
+        super().__init__()
+
+        self.queryStatement = queryStatement
+
+    def _restore_defaults(self) -> None:
+        self.queryStatement = self.RECORD_SCHEMA.fields_dict["queryStatement"].default
+
+
+    @property
+    def queryStatement(self) -> Union[None, "QueryStatementClass"]:
+        """The data transform may be defined by a query statement"""
+        return self._inner_dict.get('queryStatement')  # type: ignore
+
+    @queryStatement.setter
+    def queryStatement(self, value: Union[None, "QueryStatementClass"]) -> None:
+        self._inner_dict['queryStatement'] = value
+
+
+class DataTransformLogicClass(_Aspect):
+    """Information about a Query against one or more data assets (e.g. Tables or Views)."""
+
+
+    ASPECT_NAME = 'dataTransformLogic'
+    ASPECT_INFO = {}
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.common.DataTransformLogic")
+
+    def __init__(self,
+        transforms: List["DataTransformClass"],
+    ):
+        super().__init__()
+
+        self.transforms = transforms
+
+    def _restore_defaults(self) -> None:
+        self.transforms = list()
+
+
+    @property
+    def transforms(self) -> List["DataTransformClass"]:
+        """List of transformations applied"""
+        return self._inner_dict.get('transforms')  # type: ignore
+
+    @transforms.setter
+    def transforms(self, value: List["DataTransformClass"]) -> None:
+        self._inner_dict['transforms'] = value
+
+
 class DeprecationClass(_Aspect):
     """Deprecation status of an entity"""
 
@@ -14624,7 +14678,7 @@ class DataJobKeyClass(_Aspect):
 
 
     ASPECT_NAME = 'dataJobKey'
-    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults']}
+    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")
 
     def __init__(self,
@@ -24715,6 +24769,8 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.common.CostCostDiscriminator': CostCostDiscriminatorClass,
     'com.linkedin.pegasus2avro.common.CostType': CostTypeClass,
     'com.linkedin.pegasus2avro.common.DataPlatformInstance': DataPlatformInstanceClass,
+    'com.linkedin.pegasus2avro.common.DataTransform': DataTransformClass,
+    'com.linkedin.pegasus2avro.common.DataTransformLogic': DataTransformLogicClass,
     'com.linkedin.pegasus2avro.common.Deprecation': DeprecationClass,
     'com.linkedin.pegasus2avro.common.Documentation': DocumentationClass,
     'com.linkedin.pegasus2avro.common.DocumentationAssociation': DocumentationAssociationClass,
@@ -25182,6 +25238,8 @@ __SCHEMA_TYPES = {
     'CostCostDiscriminator': CostCostDiscriminatorClass,
     'CostType': CostTypeClass,
     'DataPlatformInstance': DataPlatformInstanceClass,
+    'DataTransform': DataTransformClass,
+    'DataTransformLogic': DataTransformLogicClass,
     'Deprecation': DeprecationClass,
     'Documentation': DocumentationClass,
     'DocumentationAssociation': DocumentationAssociationClass,
@@ -25588,6 +25646,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
     CostClass,
     BrowsePathsClass,
     InstitutionalMemoryClass,
+    DataTransformLogicClass,
     SubTypesClass,
     FormsClass,
     DeprecationClass,
@@ -25802,6 +25861,7 @@ class AspectBag(TypedDict, total=False):
     cost: CostClass
     browsePaths: BrowsePathsClass
     institutionalMemory: InstitutionalMemoryClass
+    dataTransformLogic: DataTransformLogicClass
     subTypes: SubTypesClass
     forms: FormsClass
     deprecation: DeprecationClass
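
Since dataTransformLogic is now registered as a dataJob aspect, it can be emitted like any other aspect. A minimal sketch using MetadataChangeProposalWrapper; the dataJob URN and SQL text below are illustrative only:

# Sketch only: builds the new dataTransformLogic aspect for an example dataJob URN.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    DataTransformClass,
    DataTransformLogicClass,
    QueryStatementClass,
)

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataJob:(urn:li:dataFlow:(airflow,demo_dag,PROD),transform_task)",
    aspect=DataTransformLogicClass(
        transforms=[
            DataTransformClass(
                queryStatement=QueryStatementClass(
                    value="SELECT id, amount FROM raw.orders",
                    language="SQL",
                )
            )
        ]
    ),
)
# mcp can then be sent with any emitter, e.g. DatahubRestEmitter(...).emit(mcp).
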
datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py

@@ -19,6 +19,8 @@ from .....schema_classes import CostCostClass
 from .....schema_classes import CostCostDiscriminatorClass
 from .....schema_classes import CostTypeClass
 from .....schema_classes import DataPlatformInstanceClass
+from .....schema_classes import DataTransformClass
+from .....schema_classes import DataTransformLogicClass
 from .....schema_classes import DeprecationClass
 from .....schema_classes import DocumentationClass
 from .....schema_classes import DocumentationAssociationClass
@@ -79,6 +81,8 @@ CostCost = CostCostClass
 CostCostDiscriminator = CostCostDiscriminatorClass
 CostType = CostTypeClass
 DataPlatformInstance = DataPlatformInstanceClass
+DataTransform = DataTransformClass
+DataTransformLogic = DataTransformLogicClass
 Deprecation = DeprecationClass
 Documentation = DocumentationClass
 DocumentationAssociation = DocumentationAssociationClass
datahub/metadata/schema.avsc

@@ -400,6 +400,69 @@
     ],
     "doc": "Institutional memory of an entity. This is a way to link to relevant documentation and provide description of the documentation. Institutional or tribal knowledge is very important for users to leverage the entity."
   },
+  {
+    "type": "record",
+    "Aspect": {
+      "name": "dataTransformLogic"
+    },
+    "name": "DataTransformLogic",
+    "namespace": "com.linkedin.pegasus2avro.common",
+    "fields": [
+      {
+        "type": {
+          "type": "array",
+          "items": {
+            "type": "record",
+            "name": "DataTransform",
+            "namespace": "com.linkedin.pegasus2avro.common",
+            "fields": [
+              {
+                "type": [
+                  "null",
+                  {
+                    "type": "record",
+                    "name": "QueryStatement",
+                    "namespace": "com.linkedin.pegasus2avro.query",
+                    "fields": [
+                      {
+                        "type": "string",
+                        "name": "value",
+                        "doc": "The query text"
+                      },
+                      {
+                        "type": {
+                          "type": "enum",
+                          "symbolDocs": {
+                            "SQL": "A SQL Query"
+                          },
+                          "name": "QueryLanguage",
+                          "namespace": "com.linkedin.pegasus2avro.query",
+                          "symbols": [
+                            "SQL"
+                          ]
+                        },
+                        "name": "language",
+                        "default": "SQL",
+                        "doc": "The language of the Query, e.g. SQL."
+                      }
+                    ],
+                    "doc": "A query statement against one or more data assets."
+                  }
+                ],
+                "name": "queryStatement",
+                "default": null,
+                "doc": "The data transform may be defined by a query statement"
+              }
+            ],
+            "doc": "Information about a transformation. It may be a query,"
+          }
+        },
+        "name": "transforms",
+        "doc": "List of transformations applied"
+      }
+    ],
+    "doc": "Information about a Query against one or more data assets (e.g. Tables or Views)."
+  },
   {
     "type": "record",
     "Aspect": {
@@ -11947,35 +12010,7 @@
       "namespace": "com.linkedin.pegasus2avro.query",
       "fields": [
         {
-          "type": {
-            "type": "record",
-            "name": "QueryStatement",
-            "namespace": "com.linkedin.pegasus2avro.query",
-            "fields": [
-              {
-                "type": "string",
-                "name": "value",
-                "doc": "The query text"
-              },
-              {
-                "type": {
-                  "type": "enum",
-                  "symbolDocs": {
-                    "SQL": "A SQL Query"
-                  },
-                  "name": "QueryLanguage",
-                  "namespace": "com.linkedin.pegasus2avro.query",
-                  "symbols": [
-                    "SQL"
-                  ]
-                },
-                "name": "language",
-                "default": "SQL",
-                "doc": "The language of the Query, e.g. SQL."
-              }
-            ],
-            "doc": "A query statement against one or more data assets."
-          },
+          "type": "com.linkedin.pegasus2avro.query.QueryStatement",
           "name": "statement",
           "doc": "The Query Statement."
         },
datahub/metadata/schemas/DataJobKey.avsc

@@ -25,7 +25,8 @@
       "forms",
       "subTypes",
       "incidentsSummary",
-      "testResults"
+      "testResults",
+      "dataTransformLogic"
     ]
   },
   "name": "DataJobKey",
datahub/metadata/schemas/DataTransformLogic.avsc

@@ -0,0 +1,63 @@
+{
+  "type": "record",
+  "Aspect": {
+    "name": "dataTransformLogic"
+  },
+  "name": "DataTransformLogic",
+  "namespace": "com.linkedin.pegasus2avro.common",
+  "fields": [
+    {
+      "type": {
+        "type": "array",
+        "items": {
+          "type": "record",
+          "name": "DataTransform",
+          "namespace": "com.linkedin.pegasus2avro.common",
+          "fields": [
+            {
+              "type": [
+                "null",
+                {
+                  "type": "record",
+                  "name": "QueryStatement",
+                  "namespace": "com.linkedin.pegasus2avro.query",
+                  "fields": [
+                    {
+                      "type": "string",
+                      "name": "value",
+                      "doc": "The query text"
+                    },
+                    {
+                      "type": {
+                        "type": "enum",
+                        "symbolDocs": {
+                          "SQL": "A SQL Query"
+                        },
+                        "name": "QueryLanguage",
+                        "namespace": "com.linkedin.pegasus2avro.query",
+                        "symbols": [
+                          "SQL"
+                        ]
+                      },
+                      "name": "language",
+                      "default": "SQL",
+                      "doc": "The language of the Query, e.g. SQL."
+                    }
+                  ],
+                  "doc": "A query statement against one or more data assets."
+                }
+              ],
+              "name": "queryStatement",
+              "default": null,
+              "doc": "The data transform may be defined by a query statement"
+            }
+          ],
+          "doc": "Information about a transformation. It may be a query,"
+        }
+      },
+      "name": "transforms",
+      "doc": "List of transformations applied"
+    }
+  ],
+  "doc": "Information about a Query against one or more data assets (e.g. Tables or Views)."
+}
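
The standalone schema file mirrors the record embedded in schema.avsc, so a quick sanity check only needs the JSON itself. A small sketch; the path is the one listed above, relative to an installed or checked-out package root:

# Sketch only: verifies the new aspect schema file parses and has the expected shape.
import json

with open("datahub/metadata/schemas/DataTransformLogic.avsc") as f:
    schema = json.load(f)

assert schema["Aspect"]["name"] == "dataTransformLogic"
assert [fld["name"] for fld in schema["fields"]] == ["transforms"]
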
The remaining files (WHEEL, entry_points.txt, top_level.txt) have no content changes between the two versions.