acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2415 -2415
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +47 -46
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/ingest_cli.py +3 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/run/pipeline.py +109 -143
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -2
- datahub/ingestion/source/mlflow.py +30 -7
- datahub/ingestion/source/mode.py +7 -2
- datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
- datahub/ingestion/source/nifi.py +29 -6
- datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
- datahub/ingestion/source/pulsar.py +1 -0
- datahub/ingestion/source/redash.py +29 -6
- datahub/ingestion/source/s3/config.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -6
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/sql/oracle.py +34 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/metadata/_schema_classes.py +517 -410
- datahub/metadata/_urns/urn_defs.py +1670 -1670
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +17362 -17638
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +29 -12
- datahub/sdk/_entity.py +18 -1
- datahub/sdk/container.py +3 -1
- datahub/sdk/dataset.py +5 -3
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/cassandra/cassandra.py
@@ -1,19 +1,14 @@
 import dataclasses
 import json
 import logging
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union

 from datahub.emitter.mce_builder import (
-    make_data_platform_urn,
-    make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
 )
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
-    add_dataset_to_container,
-    gen_containers,
 )
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,6 +26,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
     CassandraColumn,
     CassandraEntities,
     CassandraKeyspace,
+    CassandraSharedDatasetFields,
     CassandraTable,
     CassandraView,
 )
@@ -51,24 +47,21 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
-    SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
-    DataPlatformInstanceClass,
     DatasetLineageTypeClass,
-    DatasetPropertiesClass,
     FineGrainedLineageClass,
     FineGrainedLineageDownstreamTypeClass,
     FineGrainedLineageUpstreamTypeClass,
-    OtherSchemaClass,
-    SubTypesClass,
     UpstreamClass,
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
+from datahub.sdk._entity import Entity
+from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset

 logger = logging.getLogger(__name__)

@@ -133,6 +126,13 @@ class CassandraSource(StatefulIngestionSourceBase):
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
+        for metadata in self._get_metadata():
+            if isinstance(metadata, MetadataWorkUnit):
+                yield metadata
+            else:
+                yield from metadata.as_workunits()
+
+    def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         if not self.cassandra_api.authenticate():
             return
         keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
@@ -145,7 +145,7 @@ class CassandraSource(StatefulIngestionSourceBase):
                 self.report.report_dropped(keyspace_name)
                 continue

-            yield […]
+            yield self._generate_keyspace_container(keyspace)

             try:
                 yield from self._extract_tables_from_keyspace(keyspace_name)
@@ -170,21 +170,20 @@ class CassandraSource(StatefulIngestionSourceBase):
         if self.config.is_profiling_enabled():
             yield from self.profiler.get_workunits(self.cassandra_data)

-    def _generate_keyspace_container(
-        self, keyspace: CassandraKeyspace
-    ) -> Iterable[MetadataWorkUnit]:
+    def _generate_keyspace_container(self, keyspace: CassandraKeyspace) -> Container:
         keyspace_container_key = self._generate_keyspace_container_key(
             keyspace.keyspace_name
         )
-[… 3 removed lines not captured …]
+
+        return Container(
+            keyspace_container_key,
+            display_name=keyspace.keyspace_name,
             qualified_name=keyspace.keyspace_name,
+            subtype=DatasetContainerSubTypes.KEYSPACE,
             extra_properties={
                 "durable_writes": str(keyspace.durable_writes),
                 "replication": json.dumps(keyspace.replication),
             },
-            sub_types=[DatasetContainerSubTypes.KEYSPACE],
         )

     def _generate_keyspace_container_key(self, keyspace_name: str) -> ContainerKey:
@@ -196,105 +195,55 @@ class CassandraSource(StatefulIngestionSourceBase):
         )

     # get all tables for a given keyspace, iterate over them to extract column metadata
-    def _extract_tables_from_keyspace(
-        self, keyspace_name: str
-    ) -> Iterable[MetadataWorkUnit]:
+    def _extract_tables_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]:
         self.cassandra_data.keyspaces.append(keyspace_name)
         tables: List[CassandraTable] = self.cassandra_api.get_tables(keyspace_name)
         for table in tables:
-[… 16 removed lines not captured …]
+            dataset = self._generate_table(keyspace_name, table)
+            if dataset:
+                yield dataset
+
+    def _generate_table(
+        self, keyspace_name: str, table: CassandraTable
+    ) -> Optional[Dataset]:
+        table_name: str = table.table_name
+        dataset_name: str = f"{keyspace_name}.{table_name}"
+
+        self.report.report_entity_scanned(dataset_name, ent_type="Table")
+        if not self.config.table_pattern.allowed(dataset_name):
+            self.report.report_dropped(dataset_name)
+            return None
+
+        self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name)
+
+        schema_fields = None
+        try:
+            schema_fields = self._extract_columns_from_table(keyspace_name, table_name)
+        except Exception as e:
+            self.report.failure(
+                message="Failed to extract columns from table",
+                context=dataset_name,
+                exc=e,
             )

-[… 13 removed lines not captured …]
-                entityUrn=dataset_urn,
-                aspect=StatusClass(removed=False),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=SubTypesClass(
-                    typeNames=[
-                        DatasetSubTypes.TABLE,
-                    ]
-                ),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DatasetPropertiesClass(
-                    name=table_name,
-                    qualifiedName=f"{keyspace_name}.{table_name}",
-                    description=table.comment,
-                    customProperties={
-                        "bloom_filter_fp_chance": str(table.bloom_filter_fp_chance),
-                        "caching": json.dumps(table.caching),
-                        "compaction": json.dumps(table.compaction),
-                        "compression": json.dumps(table.compression),
-                        "crc_check_chance": str(table.crc_check_chance),
-                        "dclocal_read_repair_chance": str(
-                            table.dclocal_read_repair_chance
-                        ),
-                        "default_time_to_live": str(table.default_time_to_live),
-                        "extensions": json.dumps(table.extensions),
-                        "gc_grace_seconds": str(table.gc_grace_seconds),
-                        "max_index_interval": str(table.max_index_interval),
-                        "min_index_interval": str(table.min_index_interval),
-                        "memtable_flush_period_in_ms": str(
-                            table.memtable_flush_period_in_ms
-                        ),
-                        "read_repair_chance": str(table.read_repair_chance),
-                        "speculative_retry": str(table.speculative_retry),
-                    },
-                ),
-            ).as_workunit()
-
-            yield from add_dataset_to_container(
-                container_key=self._generate_keyspace_container_key(keyspace_name),
-                dataset_urn=dataset_urn,
-            )
-
-            if self.config.platform_instance:
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn,
-                    aspect=DataPlatformInstanceClass(
-                        platform=make_data_platform_urn(self.platform),
-                        instance=make_dataplatform_instance_urn(
-                            self.platform, self.config.platform_instance
-                        ),
-                    ),
-                ).as_workunit()
+        return Dataset(
+            platform=self.platform,
+            name=dataset_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+            subtype=DatasetSubTypes.TABLE,
+            parent_container=self._generate_keyspace_container_key(keyspace_name),
+            schema=schema_fields,
+            display_name=table_name,
+            qualified_name=dataset_name,
+            description=table.comment,
+            custom_properties=self._get_dataset_custom_props(table),
+        )

     # get all columns for a given table, iterate over them to extract column metadata
     def _extract_columns_from_table(
-        self, keyspace_name: str, table_name: str
-    ) -> […]
+        self, keyspace_name: str, table_name: str
+    ) -> Optional[List[SchemaField]]:
         column_infos: List[CassandraColumn] = self.cassandra_api.get_columns(
             keyspace_name, table_name
         )
@@ -305,147 +254,117 @@ class CassandraSource(StatefulIngestionSourceBase):
             self.report.report_warning(
                 message="Table has no columns, skipping", context=table_name
             )
-            return
+            return None

+        # Tricky: we also save the column info to a global store.
         jsonable_column_infos: List[Dict[str, Any]] = []
         for column in column_infos:
             self.cassandra_data.columns.setdefault(table_name, []).append(column)
             jsonable_column_infos.append(dataclasses.asdict(column))

-[… 1 removed line not captured …]
-            schemaName=table_name,
-            platform=make_data_platform_urn(self.platform),
-            version=0,
-            hash="",
-            platformSchema=OtherSchemaClass(
-                rawSchema=json.dumps(jsonable_column_infos)
-            ),
-            fields=schema_fields,
-        )
-
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=schema_metadata,
-        ).as_workunit()
+        return schema_fields

-    def _extract_views_from_keyspace(
-        self, keyspace_name: str
-    ) -> Iterable[MetadataWorkUnit]:
+    def _extract_views_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]:
         views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name)
         for view in views:
-[… 8 removed lines not captured …]
+            dataset = self._generate_view(keyspace_name, view)
+            if dataset:
+                yield dataset
+
+    def _generate_view(
+        self, keyspace_name: str, view: CassandraView
+    ) -> Optional[Dataset]:
+        view_name: str = view.view_name
+        dataset_name: str = f"{keyspace_name}.{view_name}"
+
+        self.report.report_entity_scanned(dataset_name, ent_type="View")
+        if not self.config.table_pattern.allowed(dataset_name):
+            # TODO: Maybe add a view_pattern instead of reusing table_pattern?
+            self.report.report_dropped(dataset_name)
+            return None
+
+        schema_fields = None
+        try:
+            schema_fields = self._extract_columns_from_table(keyspace_name, view_name)
+        except Exception as e:
+            self.report.failure(
+                message="Failed to extract columns from views",
+                context=view_name,
+                exc=e,
            )

-[… 14 removed lines not captured …]
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=ViewPropertiesClass(
+        dataset = Dataset(
+            platform=self.platform,
+            name=dataset_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+            subtype=DatasetSubTypes.VIEW,
+            parent_container=self._generate_keyspace_container_key(keyspace_name),
+            schema=schema_fields,
+            display_name=view_name,
+            qualified_name=dataset_name,
+            description=view.comment,
+            custom_properties=self._get_dataset_custom_props(view),
+            extra_aspects=[
+                ViewPropertiesClass(
                    materialized=True,
                    viewLogic=view.where_clause,  # Use the WHERE clause as view logic
                    viewLanguage="CQL",  # Use "CQL" as the language
                ),
-[… 2 removed lines not captured …]
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DatasetPropertiesClass(
-                    name=view_name,
-                    qualifiedName=f"{keyspace_name}.{view_name}",
-                    description=view.comment,
-                    customProperties={
-                        "bloom_filter_fp_chance": str(view.bloom_filter_fp_chance),
-                        "caching": json.dumps(view.caching),
-                        "compaction": json.dumps(view.compaction),
-                        "compression": json.dumps(view.compression),
-                        "crc_check_chance": str(view.crc_check_chance),
-                        "include_all_columns": str(view.include_all_columns),
-                        "dclocal_read_repair_chance": str(
-                            view.dclocal_read_repair_chance
-                        ),
-                        "default_time_to_live": str(view.default_time_to_live),
-                        "extensions": json.dumps(view.extensions),
-                        "gc_grace_seconds": str(view.gc_grace_seconds),
-                        "max_index_interval": str(view.max_index_interval),
-                        "min_index_interval": str(view.min_index_interval),
-                        "memtable_flush_period_in_ms": str(
-                            view.memtable_flush_period_in_ms
-                        ),
-                        "read_repair_chance": str(view.read_repair_chance),
-                        "speculative_retry": str(view.speculative_retry),
-                    },
-                ),
-            ).as_workunit()
+            ],
+        )

-[… 9 removed lines not captured …]
+        # Construct and emit lineage off of 'base_table_name'
+        # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name
+        upstream_urn: str = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=f"{keyspace_name}.{view.base_table_name}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
+        fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource(
+            view_name, str(dataset.urn), upstream_urn
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    dataset=upstream_urn,
+                    type=DatasetLineageTypeClass.VIEW,
                )
+            ],
+            fineGrainedLineages=fineGrainedLineages,
+        )

-[… 10 removed lines not captured …]
-            )
-[… 16 removed lines not captured …]
+        dataset.set_upstreams(upstream_lineage)
+
+        return dataset
+
+    def _get_dataset_custom_props(
+        self, dataset: CassandraSharedDatasetFields
+    ) -> Dict[str, str]:
+        props = {
+            "bloom_filter_fp_chance": str(dataset.bloom_filter_fp_chance),
+            "caching": json.dumps(dataset.caching),
+            "compaction": json.dumps(dataset.compaction),
+            "compression": json.dumps(dataset.compression),
+            "crc_check_chance": str(dataset.crc_check_chance),
+            "dclocal_read_repair_chance": str(dataset.dclocal_read_repair_chance),
+            "default_time_to_live": str(dataset.default_time_to_live),
+            "extensions": json.dumps(dataset.extensions),
+            "gc_grace_seconds": str(dataset.gc_grace_seconds),
+            "max_index_interval": str(dataset.max_index_interval),
+            "min_index_interval": str(dataset.min_index_interval),
+            "memtable_flush_period_in_ms": str(dataset.memtable_flush_period_in_ms),
+            "read_repair_chance": str(dataset.read_repair_chance),
+            "speculative_retry": str(dataset.speculative_retry),
+        }
+        if isinstance(dataset, CassandraView):
+            props.update(
+                {
+                    "include_all_columns": str(dataset.include_all_columns),
                }
            )
-
-        if self.config.platform_instance:
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DataPlatformInstanceClass(
-                    platform=make_data_platform_urn(self.platform),
-                    instance=make_dataplatform_instance_urn(
-                        self.platform, self.config.platform_instance
-                    ),
-                ),
-            ).as_workunit()
+        return props

     def get_upstream_fields_of_field_in_datasource(
         self, table_name: str, dataset_urn: str, upstream_urn: str
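
The cassandra.py change above replaces hand-assembled MetadataChangeProposalWrapper aspects with the higher-level datahub.sdk entity objects (Container for keyspaces, Dataset for tables and views); get_workunits_internal now normalizes a mixed stream of work units and SDK entities via as_workunits(). A minimal illustrative sketch of that pattern, using only constructor arguments and calls visible in the diff (the platform, keyspace, and table names below are placeholder values, not taken from the package):

from typing import Iterable, Union

from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.sdk._entity import Entity
from datahub.sdk.dataset import Dataset


def to_workunits(
    stream: Iterable[Union[MetadataWorkUnit, Entity]],
) -> Iterable[MetadataWorkUnit]:
    # Mirrors the rewritten get_workunits_internal(): legacy work units pass
    # through unchanged, SDK entities expand into one work unit per aspect.
    for item in stream:
        if isinstance(item, MetadataWorkUnit):
            yield item
        else:
            yield from item.as_workunits()


# Placeholder values; the real source derives these from Cassandra metadata.
example = Dataset(
    platform="cassandra",
    name="my_keyspace.my_table",
    env="PROD",
    display_name="my_table",
    qualified_name="my_keyspace.my_table",
    description="Example table",
    custom_properties={"gc_grace_seconds": "864000"},
)

for wu in to_workunits([example]):
    print(wu.id)

Because each SDK entity is lowered to ordinary MetadataWorkUnit objects, downstream sinks see the same work-unit stream as before the refactor.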
datahub/ingestion/source/cassandra/cassandra_api.py
@@ -23,9 +23,9 @@ class CassandraKeyspace:


 @dataclass
-class […]
+class CassandraSharedDatasetFields:
     keyspace_name: str
-[… 1 removed line not captured …]
+[… not captured …]
     bloom_filter_fp_chance: Optional[float]
     caching: Optional[Dict[str, str]]
     comment: Optional[str]
@@ -43,6 +43,11 @@ class CassandraTable:
     speculative_retry: Optional[str]


+@dataclass
+class CassandraTable(CassandraSharedDatasetFields):
+    table_name: str
+
+
 @dataclass
 class CassandraColumn:
     keyspace_name: str
@@ -55,8 +60,10 @@ class CassandraColumn:


 @dataclass
-class CassandraView([…]
+class CassandraView(CassandraSharedDatasetFields):
     view_name: str
+[… not captured …]
+    base_table_name: str
     include_all_columns: Optional[bool]
     where_clause: str = ""

@@ -261,7 +268,7 @@ class CassandraAPI:
         views = self.get(CassandraQueries.GET_VIEWS_QUERY, [keyspace_name])
         view_list = [
             CassandraView(
-[… 1 removed line not captured …]
+                base_table_name=row.base_table_name,
                 keyspace_name=row.keyspace_name,
                 view_name=row.view_name,
                 bloom_filter_fp_chance=row.bloom_filter_fp_chance,
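
cassandra_api.py now factors the storage settings shared by tables and materialized views into a CassandraSharedDatasetFields base dataclass, which is what lets cassandra.py build custom properties for both with a single helper. An illustrative sketch of that shape (hypothetical class names and a field subset, not the exact definitions in the module):

from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class SharedFields:
    # Storage settings common to tables and materialized views
    # (a subset of the fields listed in the diff above).
    keyspace_name: str
    bloom_filter_fp_chance: Optional[float]
    caching: Optional[Dict[str, str]]
    comment: Optional[str]
    gc_grace_seconds: Optional[int]


@dataclass
class Table(SharedFields):
    table_name: str


@dataclass
class View(SharedFields):
    view_name: str
    base_table_name: str
    include_all_columns: Optional[bool]
    where_clause: str = ""


def shared_props(entity: SharedFields) -> Dict[str, str]:
    # Works for both Table and View, the same way _get_dataset_custom_props
    # above accepts any CassandraSharedDatasetFields instance.
    return {
        "bloom_filter_fp_chance": str(entity.bloom_filter_fp_chance),
        "gc_grace_seconds": str(entity.gc_grace_seconds),
    }


print(
    shared_props(
        Table(
            keyspace_name="ks",
            bloom_filter_fp_chance=0.01,
            caching=None,
            comment=None,
            gc_grace_seconds=864000,
            table_name="t1",
        )
    )
)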
datahub/ingestion/source/delta_lake/config.py
@@ -13,6 +13,9 @@ from datahub.configuration.source_common import (
 )
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)

 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -35,7 +38,11 @@ class S3(ConfigModel):
     )


-class DeltaLakeSourceConfig([…]
+class DeltaLakeSourceConfig(
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+    StatefulIngestionConfigBase,
+):
     base_path: str = Field(
         description="Path to table (s3 or local file system). If path is not a delta table path "
         "then all subfolders will be scanned to detect and ingest delta tables."
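
config.py mixes StatefulIngestionConfigBase into DeltaLakeSourceConfig, so a Delta Lake recipe can now carry a stateful_ingestion section alongside env and platform_instance. A rough sketch of how a source config composes these mixins and parses a recipe fragment; MySourceConfig, the bucket path, and the option values are illustrative, and exact option names may differ:

from pydantic import Field

from datahub.configuration.source_common import (
    EnvConfigMixin,
    PlatformInstanceConfigMixin,
)
from datahub.ingestion.source.state.stateful_ingestion_base import (
    StatefulIngestionConfigBase,
)


class MySourceConfig(
    PlatformInstanceConfigMixin,
    EnvConfigMixin,
    StatefulIngestionConfigBase,
):
    # A single source-specific field, mirroring base_path in the diff above.
    base_path: str = Field(description="Path to scan for tables.")


# Parse a recipe-style dict; the stateful_ingestion block is what the new
# mixin makes available to this source.
config = MySourceConfig.parse_obj(
    {
        "base_path": "s3://my-bucket/my-tables",
        "env": "PROD",
        "stateful_ingestion": {"enabled": True},
    }
)
print(config.stateful_ingestion)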
datahub/ingestion/source/delta_lake/report.py
@@ -1,12 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field

-from datahub.ingestion.[…]
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
 from datahub.utilities.lossy_collections import LossyList


 @dataclasses.dataclass
-class DeltaLakeSourceReport([…]
+class DeltaLakeSourceReport(StaleEntityRemovalSourceReport):
     files_scanned = 0
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

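
report.py rebases DeltaLakeSourceReport onto StaleEntityRemovalSourceReport, so the stale-entity counters travel alongside the source's own counters. A small illustrative sketch under that assumption (MyReport and report_file_scanned are hypothetical names):

import dataclasses
from dataclasses import field as dataclass_field

from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalSourceReport,
)
from datahub.utilities.lossy_collections import LossyList


@dataclasses.dataclass
class MyReport(StaleEntityRemovalSourceReport):
    # Source-specific counters live next to the inherited stale-entity fields.
    files_scanned: int = 0
    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

    def report_file_scanned(self) -> None:
        self.files_scanned += 1


report = MyReport()
report.report_file_scanned()
print(report.files_scanned)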
datahub/ingestion/source/delta_lake/source.py
@@ -2,7 +2,7 @@ import json
 import logging
 import os
 import time
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, Optional
 from urllib.parse import urlparse

 from deltalake import DeltaTable
@@ -21,7 +21,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import […]
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags
 from datahub.ingestion.source.aws.s3_util import (
@@ -36,6 +36,12 @@ from datahub.ingestion.source.delta_lake.delta_lake_utils import (
     read_delta_table,
 )
 from datahub.ingestion.source.delta_lake.report import DeltaLakeSourceReport
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
@@ -79,7 +85,7 @@ OPERATION_STATEMENT_TYPES = {
 @config_class(DeltaLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
-class DeltaLakeSource([…]
+class DeltaLakeSource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
     - Column types and schema associated with each delta table
@@ -100,9 +106,10 @@ class DeltaLakeSource(Source):
     storage_options: Dict[str, str]

     def __init__(self, config: DeltaLakeSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.source_config = config
-        self.report = DeltaLakeSourceReport()
+        self.report: DeltaLakeSourceReport = DeltaLakeSourceReport()
         if self.source_config.is_s3:
             if (
                 self.source_config.s3 is None
@@ -331,6 +338,14 @@ class DeltaLakeSource(Source):
         for folder in os.listdir(path):
             yield os.path.join(path, folder)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.container_WU_creator = ContainerWUCreator(
             self.source_config.platform,
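
source.py switches DeltaLakeSource to StatefulIngestionSourceBase and registers the stale-entity-removal handler as a work-unit processor, which lets the framework soft-delete entities that were emitted in the previous run but are missing from the current one. A sketch of that wiring under the same assumptions (MySource is a hypothetical skeleton; the calls mirror those visible in the diff above):

from typing import Iterable, List, Optional

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalHandler,
)
from datahub.ingestion.source.state.stateful_ingestion_base import (
    StatefulIngestionSourceBase,
)


class MySource(StatefulIngestionSourceBase):
    # Skeleton only: a real source also implements get_report() and the
    # metadata-extraction logic.
    def __init__(self, config, ctx: PipelineContext):
        super().__init__(config, ctx)
        self.ctx = ctx
        self.source_config = config

    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
        # The handler compares this run's emitted URNs against the previous
        # checkpoint and soft-deletes anything that disappeared.
        return [
            *super().get_workunit_processors(),
            StaleEntityRemovalHandler.create(
                self, self.source_config, self.ctx
            ).workunit_processor,
        ]

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        return []  # the real source emits table metadata here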