acryl-datahub 1.0.0rc4__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (62)
  1. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2502 -2502
  2. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +62 -59
  3. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/ingest_cli.py +3 -1
  6. datahub/emitter/mcp_builder.py +4 -1
  7. datahub/ingestion/api/source_helpers.py +4 -0
  8. datahub/ingestion/run/pipeline.py +109 -143
  9. datahub/ingestion/run/sink_callback.py +77 -0
  10. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -0
  11. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  12. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  13. datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
  14. datahub/ingestion/source/delta_lake/config.py +8 -1
  15. datahub/ingestion/source/delta_lake/report.py +4 -2
  16. datahub/ingestion/source/delta_lake/source.py +20 -5
  17. datahub/ingestion/source/elastic_search.py +26 -6
  18. datahub/ingestion/source/feast.py +27 -8
  19. datahub/ingestion/source/file.py +1 -1
  20. datahub/ingestion/source/identity/okta.py +1 -2
  21. datahub/ingestion/source/mlflow.py +30 -7
  22. datahub/ingestion/source/mode.py +7 -2
  23. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  24. datahub/ingestion/source/nifi.py +29 -6
  25. datahub/ingestion/source/openapi_parser.py +46 -14
  26. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  27. datahub/ingestion/source/pulsar.py +1 -0
  28. datahub/ingestion/source/redash.py +29 -6
  29. datahub/ingestion/source/s3/config.py +3 -1
  30. datahub/ingestion/source/salesforce.py +28 -6
  31. datahub/ingestion/source/slack/slack.py +31 -10
  32. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  33. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  34. datahub/ingestion/source/sql/oracle.py +34 -0
  35. datahub/ingestion/source_config/pulsar.py +3 -1
  36. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  37. datahub/metadata/_schema_classes.py +534 -410
  38. datahub/metadata/_urns/urn_defs.py +1670 -1670
  39. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  40. datahub/metadata/schema.avsc +17379 -17637
  41. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  42. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  43. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  44. datahub/metadata/schemas/MetadataChangeEvent.avsc +13 -0
  45. datahub/metadata/schemas/__init__.py +3 -3
  46. datahub/sdk/__init__.py +29 -12
  47. datahub/sdk/_attribution.py +4 -0
  48. datahub/sdk/_entity.py +20 -1
  49. datahub/sdk/_shared.py +163 -13
  50. datahub/sdk/_utils.py +35 -0
  51. datahub/sdk/container.py +23 -5
  52. datahub/sdk/dataset.py +109 -17
  53. datahub/sdk/main_client.py +17 -0
  54. datahub/specific/dataset.py +3 -4
  55. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  56. datahub/sql_parsing/split_statements.py +20 -13
  57. datahub/utilities/file_backed_collections.py +3 -14
  58. datahub/utilities/sentinels.py +22 -0
  59. datahub/utilities/unified_diff.py +5 -1
  60. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
  61. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
  62. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/cassandra/cassandra.py

@@ -1,19 +1,14 @@
 import dataclasses
 import json
 import logging
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union

 from datahub.emitter.mce_builder import (
-    make_data_platform_urn,
-    make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
 )
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
-    add_dataset_to_container,
-    gen_containers,
 )
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,6 +26,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
     CassandraColumn,
     CassandraEntities,
     CassandraKeyspace,
+    CassandraSharedDatasetFields,
     CassandraTable,
     CassandraView,
 )
@@ -51,24 +47,21 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
-    SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
-    DataPlatformInstanceClass,
     DatasetLineageTypeClass,
-    DatasetPropertiesClass,
     FineGrainedLineageClass,
     FineGrainedLineageDownstreamTypeClass,
     FineGrainedLineageUpstreamTypeClass,
-    OtherSchemaClass,
-    SubTypesClass,
     UpstreamClass,
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
+from datahub.sdk._entity import Entity
+from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset

 logger = logging.getLogger(__name__)

@@ -133,6 +126,13 @@ class CassandraSource(StatefulIngestionSourceBase):
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
+        for metadata in self._get_metadata():
+            if isinstance(metadata, MetadataWorkUnit):
+                yield metadata
+            else:
+                yield from metadata.as_workunits()
+
+    def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         if not self.cassandra_api.authenticate():
             return
         keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
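
The hunk above is the heart of the refactor: get_workunits_internal() now delegates to _get_metadata(), which may yield either plain MetadataWorkUnit objects or the new SDK entities (Container, Dataset), and the wrapper flattens the latter with as_workunits(). A minimal sketch of the same dispatch pattern outside the source class, assuming only the imports visible in this diff:

    from typing import Iterable, Union

    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.sdk._entity import Entity


    def flatten(items: Iterable[Union[MetadataWorkUnit, Entity]]) -> Iterable[MetadataWorkUnit]:
        # Work units pass through untouched; SDK entities are expanded into
        # their per-aspect work units, mirroring get_workunits_internal().
        for item in items:
            if isinstance(item, MetadataWorkUnit):
                yield item
            else:
                yield from item.as_workunits()
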
@@ -145,7 +145,7 @@ class CassandraSource(StatefulIngestionSourceBase):
                 self.report.report_dropped(keyspace_name)
                 continue

-            yield from self._generate_keyspace_container(keyspace)
+            yield self._generate_keyspace_container(keyspace)

             try:
                 yield from self._extract_tables_from_keyspace(keyspace_name)
@@ -170,21 +170,20 @@ class CassandraSource(StatefulIngestionSourceBase):
         if self.config.is_profiling_enabled():
             yield from self.profiler.get_workunits(self.cassandra_data)

-    def _generate_keyspace_container(
-        self, keyspace: CassandraKeyspace
-    ) -> Iterable[MetadataWorkUnit]:
+    def _generate_keyspace_container(self, keyspace: CassandraKeyspace) -> Container:
         keyspace_container_key = self._generate_keyspace_container_key(
             keyspace.keyspace_name
         )
-        yield from gen_containers(
-            container_key=keyspace_container_key,
-            name=keyspace.keyspace_name,
+
+        return Container(
+            keyspace_container_key,
+            display_name=keyspace.keyspace_name,
             qualified_name=keyspace.keyspace_name,
+            subtype=DatasetContainerSubTypes.KEYSPACE,
             extra_properties={
                 "durable_writes": str(keyspace.durable_writes),
                 "replication": json.dumps(keyspace.replication),
             },
-            sub_types=[DatasetContainerSubTypes.KEYSPACE],
         )

     def _generate_keyspace_container_key(self, keyspace_name: str) -> ContainerKey:
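
Containers now come from the high-level SDK as well: instead of yielding gen_containers() work units, the source returns a datahub.sdk.container.Container built from its ContainerKey, and the subtype moves from a sub_types list to a single subtype argument. A rough standalone sketch using the keyword arguments visible in this hunk; DatabaseKey is only a stand-in here, since the actual key class built by _generate_keyspace_container_key() is not part of this diff:

    import json

    from datahub.emitter.mcp_builder import DatabaseKey
    from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
    from datahub.sdk.container import Container

    # Illustrative key only; the Cassandra source builds its own ContainerKey.
    keyspace_key = DatabaseKey(platform="cassandra", env="PROD", database="my_keyspace")

    container = Container(
        keyspace_key,
        display_name="my_keyspace",
        qualified_name="my_keyspace",
        subtype=DatasetContainerSubTypes.KEYSPACE,
        extra_properties={
            "durable_writes": "True",
            "replication": json.dumps({"class": "SimpleStrategy", "replication_factor": "3"}),
        },
    )
    # Like any SDK entity, the container is expanded into work units by the caller
    # (see the as_workunits() dispatch in get_workunits_internal above).
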
@@ -196,105 +195,55 @@ class CassandraSource(StatefulIngestionSourceBase):
         )

     # get all tables for a given keyspace, iterate over them to extract column metadata
-    def _extract_tables_from_keyspace(
-        self, keyspace_name: str
-    ) -> Iterable[MetadataWorkUnit]:
+    def _extract_tables_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]:
         self.cassandra_data.keyspaces.append(keyspace_name)
         tables: List[CassandraTable] = self.cassandra_api.get_tables(keyspace_name)
         for table in tables:
-            # define the dataset urn for this table to be used downstream
-            table_name: str = table.table_name
-            dataset_name: str = f"{keyspace_name}.{table_name}"
-
-            if not self.config.table_pattern.allowed(dataset_name):
-                self.report.report_dropped(dataset_name)
-                continue
-
-            self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name)
-            self.report.report_entity_scanned(dataset_name, ent_type="Table")
-
-            dataset_urn = make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=dataset_name,
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
+            dataset = self._generate_table(keyspace_name, table)
+            if dataset:
+                yield dataset
+
+    def _generate_table(
+        self, keyspace_name: str, table: CassandraTable
+    ) -> Optional[Dataset]:
+        table_name: str = table.table_name
+        dataset_name: str = f"{keyspace_name}.{table_name}"
+
+        self.report.report_entity_scanned(dataset_name, ent_type="Table")
+        if not self.config.table_pattern.allowed(dataset_name):
+            self.report.report_dropped(dataset_name)
+            return None
+
+        self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name)
+
+        schema_fields = None
+        try:
+            schema_fields = self._extract_columns_from_table(keyspace_name, table_name)
+        except Exception as e:
+            self.report.failure(
+                message="Failed to extract columns from table",
+                context=dataset_name,
+                exc=e,
             )

-            # 1. Extract columns from table, then construct and emit the schemaMetadata aspect.
-            try:
-                yield from self._extract_columns_from_table(
-                    keyspace_name, table_name, dataset_urn
-                )
-            except Exception as e:
-                self.report.failure(
-                    message="Failed to extract columns from table",
-                    context=table_name,
-                    exc=e,
-                )
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=StatusClass(removed=False),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=SubTypesClass(
-                    typeNames=[
-                        DatasetSubTypes.TABLE,
-                    ]
-                ),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DatasetPropertiesClass(
-                    name=table_name,
-                    qualifiedName=f"{keyspace_name}.{table_name}",
-                    description=table.comment,
-                    customProperties={
-                        "bloom_filter_fp_chance": str(table.bloom_filter_fp_chance),
-                        "caching": json.dumps(table.caching),
-                        "compaction": json.dumps(table.compaction),
-                        "compression": json.dumps(table.compression),
-                        "crc_check_chance": str(table.crc_check_chance),
-                        "dclocal_read_repair_chance": str(
-                            table.dclocal_read_repair_chance
-                        ),
-                        "default_time_to_live": str(table.default_time_to_live),
-                        "extensions": json.dumps(table.extensions),
-                        "gc_grace_seconds": str(table.gc_grace_seconds),
-                        "max_index_interval": str(table.max_index_interval),
-                        "min_index_interval": str(table.min_index_interval),
-                        "memtable_flush_period_in_ms": str(
-                            table.memtable_flush_period_in_ms
-                        ),
-                        "read_repair_chance": str(table.read_repair_chance),
-                        "speculative_retry": str(table.speculative_retry),
-                    },
-                ),
-            ).as_workunit()
-
-            yield from add_dataset_to_container(
-                container_key=self._generate_keyspace_container_key(keyspace_name),
-                dataset_urn=dataset_urn,
-            )
-
-            if self.config.platform_instance:
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn,
-                    aspect=DataPlatformInstanceClass(
-                        platform=make_data_platform_urn(self.platform),
-                        instance=make_dataplatform_instance_urn(
-                            self.platform, self.config.platform_instance
-                        ),
-                    ),
-                ).as_workunit()
+        return Dataset(
+            platform=self.platform,
+            name=dataset_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+            subtype=DatasetSubTypes.TABLE,
+            parent_container=self._generate_keyspace_container_key(keyspace_name),
+            schema=schema_fields,
+            display_name=table_name,
+            qualified_name=dataset_name,
+            description=table.comment,
+            custom_properties=self._get_dataset_custom_props(table),
+        )

     # get all columns for a given table, iterate over them to extract column metadata
     def _extract_columns_from_table(
-        self, keyspace_name: str, table_name: str, dataset_urn: str
-    ) -> Iterable[MetadataWorkUnit]:
+        self, keyspace_name: str, table_name: str
+    ) -> Optional[List[SchemaField]]:
         column_infos: List[CassandraColumn] = self.cassandra_api.get_columns(
             keyspace_name, table_name
         )
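
On the table side, one Dataset object now stands in for the separate status, subtype, dataset-properties, container-membership, and platform-instance MCPs that the old loop emitted by hand. A standalone sketch restricted to constructor arguments that appear in this hunk; the literal values are invented for illustration:

    from datahub.ingestion.source.common.subtypes import DatasetSubTypes
    from datahub.sdk.dataset import Dataset

    dataset = Dataset(
        platform="cassandra",
        name="my_keyspace.my_table",  # <keyspace>.<table>, as in _generate_table()
        env="PROD",
        subtype=DatasetSubTypes.TABLE,
        display_name="my_table",
        qualified_name="my_keyspace.my_table",
        description="Example table",
        custom_properties={"gc_grace_seconds": "864000"},
    )
    # The source yields this object; get_workunits_internal() expands it via
    # as_workunits() into the individual aspects that were previously hand-built.
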
@@ -305,147 +254,117 @@ class CassandraSource(StatefulIngestionSourceBase):
             self.report.report_warning(
                 message="Table has no columns, skipping", context=table_name
             )
-            return
+            return None

+        # Tricky: we also save the column info to a global store.
         jsonable_column_infos: List[Dict[str, Any]] = []
         for column in column_infos:
             self.cassandra_data.columns.setdefault(table_name, []).append(column)
             jsonable_column_infos.append(dataclasses.asdict(column))

-        schema_metadata: SchemaMetadata = SchemaMetadata(
-            schemaName=table_name,
-            platform=make_data_platform_urn(self.platform),
-            version=0,
-            hash="",
-            platformSchema=OtherSchemaClass(
-                rawSchema=json.dumps(jsonable_column_infos)
-            ),
-            fields=schema_fields,
-        )
-
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=schema_metadata,
-        ).as_workunit()
+        return schema_fields

-    def _extract_views_from_keyspace(
-        self, keyspace_name: str
-    ) -> Iterable[MetadataWorkUnit]:
+    def _extract_views_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]:
         views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name)
         for view in views:
-            view_name: str = view.view_name
-            dataset_name: str = f"{keyspace_name}.{view_name}"
-            self.report.report_entity_scanned(dataset_name)
-            dataset_urn: str = make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=dataset_name,
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
+            dataset = self._generate_view(keyspace_name, view)
+            if dataset:
+                yield dataset
+
+    def _generate_view(
+        self, keyspace_name: str, view: CassandraView
+    ) -> Optional[Dataset]:
+        view_name: str = view.view_name
+        dataset_name: str = f"{keyspace_name}.{view_name}"
+
+        self.report.report_entity_scanned(dataset_name, ent_type="View")
+        if not self.config.table_pattern.allowed(dataset_name):
+            # TODO: Maybe add a view_pattern instead of reusing table_pattern?
+            self.report.report_dropped(dataset_name)
+            return None
+
+        schema_fields = None
+        try:
+            schema_fields = self._extract_columns_from_table(keyspace_name, view_name)
+        except Exception as e:
+            self.report.failure(
+                message="Failed to extract columns from views",
+                context=view_name,
+                exc=e,
             )

-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=StatusClass(removed=False),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=SubTypesClass(
-                    typeNames=[
-                        DatasetSubTypes.VIEW,
-                    ]
-                ),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=ViewPropertiesClass(
+        dataset = Dataset(
+            platform=self.platform,
+            name=dataset_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+            subtype=DatasetSubTypes.VIEW,
+            parent_container=self._generate_keyspace_container_key(keyspace_name),
+            schema=schema_fields,
+            display_name=view_name,
+            qualified_name=dataset_name,
+            description=view.comment,
+            custom_properties=self._get_dataset_custom_props(view),
+            extra_aspects=[
+                ViewPropertiesClass(
                     materialized=True,
                     viewLogic=view.where_clause,  # Use the WHERE clause as view logic
                     viewLanguage="CQL",  # Use "CQL" as the language
                 ),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DatasetPropertiesClass(
-                    name=view_name,
-                    qualifiedName=f"{keyspace_name}.{view_name}",
-                    description=view.comment,
-                    customProperties={
-                        "bloom_filter_fp_chance": str(view.bloom_filter_fp_chance),
-                        "caching": json.dumps(view.caching),
-                        "compaction": json.dumps(view.compaction),
-                        "compression": json.dumps(view.compression),
-                        "crc_check_chance": str(view.crc_check_chance),
-                        "include_all_columns": str(view.include_all_columns),
-                        "dclocal_read_repair_chance": str(
-                            view.dclocal_read_repair_chance
-                        ),
-                        "default_time_to_live": str(view.default_time_to_live),
-                        "extensions": json.dumps(view.extensions),
-                        "gc_grace_seconds": str(view.gc_grace_seconds),
-                        "max_index_interval": str(view.max_index_interval),
-                        "min_index_interval": str(view.min_index_interval),
-                        "memtable_flush_period_in_ms": str(
-                            view.memtable_flush_period_in_ms
-                        ),
-                        "read_repair_chance": str(view.read_repair_chance),
-                        "speculative_retry": str(view.speculative_retry),
-                    },
-                ),
-            ).as_workunit()
+            ],
+        )

-            try:
-                yield from self._extract_columns_from_table(
-                    keyspace_name, view_name, dataset_urn
-                )
-            except Exception as e:
-                self.report.failure(
-                    message="Failed to extract columns from views",
-                    context=view_name,
-                    exc=e,
+        # Construct and emit lineage off of 'base_table_name'
+        # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name
+        upstream_urn: str = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=f"{keyspace_name}.{view.base_table_name}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
+        fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource(
+            view_name, str(dataset.urn), upstream_urn
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    dataset=upstream_urn,
+                    type=DatasetLineageTypeClass.VIEW,
                 )
+            ],
+            fineGrainedLineages=fineGrainedLineages,
+        )

-            # Construct and emit lineage off of 'base_table_name'
-            # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name
-            upstream_urn: str = make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=f"{keyspace_name}.{view.table_name}",
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
-            )
-            fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource(
-                view_name, dataset_urn, upstream_urn
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=UpstreamLineageClass(
-                    upstreams=[
-                        UpstreamClass(
-                            dataset=upstream_urn,
-                            type=DatasetLineageTypeClass.VIEW,
-                        )
-                    ],
-                    fineGrainedLineages=fineGrainedLineages,
-                ),
-            ).as_workunit()
-
-            yield from add_dataset_to_container(
-                container_key=self._generate_keyspace_container_key(keyspace_name),
-                dataset_urn=dataset_urn,
+        dataset.set_upstreams(upstream_lineage)
+
+        return dataset
+
+    def _get_dataset_custom_props(
+        self, dataset: CassandraSharedDatasetFields
+    ) -> Dict[str, str]:
+        props = {
+            "bloom_filter_fp_chance": str(dataset.bloom_filter_fp_chance),
+            "caching": json.dumps(dataset.caching),
+            "compaction": json.dumps(dataset.compaction),
+            "compression": json.dumps(dataset.compression),
+            "crc_check_chance": str(dataset.crc_check_chance),
+            "dclocal_read_repair_chance": str(dataset.dclocal_read_repair_chance),
+            "default_time_to_live": str(dataset.default_time_to_live),
+            "extensions": json.dumps(dataset.extensions),
+            "gc_grace_seconds": str(dataset.gc_grace_seconds),
+            "max_index_interval": str(dataset.max_index_interval),
+            "min_index_interval": str(dataset.min_index_interval),
+            "memtable_flush_period_in_ms": str(dataset.memtable_flush_period_in_ms),
+            "read_repair_chance": str(dataset.read_repair_chance),
+            "speculative_retry": str(dataset.speculative_retry),
+        }
+        if isinstance(dataset, CassandraView):
+            props.update(
+                {
+                    "include_all_columns": str(dataset.include_all_columns),
+                }
             )
-
-            if self.config.platform_instance:
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn,
-                    aspect=DataPlatformInstanceClass(
-                        platform=make_data_platform_urn(self.platform),
-                        instance=make_dataplatform_instance_urn(
-                            self.platform, self.config.platform_instance
-                        ),
-                    ),
-                ).as_workunit()
+        return props

     def get_upstream_fields_of_field_in_datasource(
         self, table_name: str, dataset_urn: str, upstream_urn: str
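
Views get the same treatment, with two extras: the ViewPropertiesClass aspect rides along in extra_aspects, and lineage to the base table is attached with set_upstreams() rather than emitted as its own work unit. A trimmed sketch of just the lineage step, assuming a Dataset built as in _generate_view():

    from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
    from datahub.metadata.schema_classes import (
        DatasetLineageTypeClass,
        UpstreamClass,
        UpstreamLineageClass,
    )
    from datahub.sdk.dataset import Dataset

    view = Dataset(platform="cassandra", name="my_keyspace.my_view", env="PROD")
    base_table_urn = make_dataset_urn_with_platform_instance(
        platform="cassandra",
        name="my_keyspace.my_base_table",  # the base table always lives in the same keyspace
        env="PROD",
        platform_instance=None,
    )
    view.set_upstreams(
        UpstreamLineageClass(
            upstreams=[
                UpstreamClass(dataset=base_table_urn, type=DatasetLineageTypeClass.VIEW)
            ]
        )
    )
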
datahub/ingestion/source/cassandra/cassandra_api.py

@@ -23,9 +23,9 @@ class CassandraKeyspace:


 @dataclass
-class CassandraTable:
+class CassandraSharedDatasetFields:
     keyspace_name: str
-    table_name: str
+
     bloom_filter_fp_chance: Optional[float]
     caching: Optional[Dict[str, str]]
     comment: Optional[str]
@@ -43,6 +43,11 @@ class CassandraTable:
     speculative_retry: Optional[str]


+@dataclass
+class CassandraTable(CassandraSharedDatasetFields):
+    table_name: str
+
+
 @dataclass
 class CassandraColumn:
     keyspace_name: str
@@ -55,8 +60,10 @@ class CassandraColumn:


 @dataclass
-class CassandraView(CassandraTable):
+class CassandraView(CassandraSharedDatasetFields):
     view_name: str
+
+    base_table_name: str
     include_all_columns: Optional[bool]
     where_clause: str = ""

@@ -261,7 +268,7 @@ class CassandraAPI:
         views = self.get(CassandraQueries.GET_VIEWS_QUERY, [keyspace_name])
         view_list = [
             CassandraView(
-                table_name=row.base_table_name,
+                base_table_name=row.base_table_name,
                 keyspace_name=row.keyspace_name,
                 view_name=row.view_name,
                 bloom_filter_fp_chance=row.bloom_filter_fp_chance,
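
The companion change in cassandra_api.py explains the new CassandraSharedDatasetFields import: the long list of table options is hoisted into a shared base dataclass, CassandraTable keeps only table_name, and CassandraView gains an explicit base_table_name instead of overloading table_name. That is what lets _get_dataset_custom_props() accept either type. A condensed sketch of the resulting hierarchy (shared fields abbreviated):

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class CassandraSharedDatasetFields:
        keyspace_name: str
        bloom_filter_fp_chance: Optional[float]
        # ... remaining shared table/view options (caching, compaction, gc_grace_seconds, ...)


    @dataclass
    class CassandraTable(CassandraSharedDatasetFields):
        table_name: str


    @dataclass
    class CassandraView(CassandraSharedDatasetFields):
        view_name: str
        base_table_name: str
        include_all_columns: Optional[bool]
        where_clause: str = ""


    def describe(dataset: CassandraSharedDatasetFields) -> str:
        # Accepts tables and views alike, the same way _get_dataset_custom_props() does.
        return f"{dataset.keyspace_name}: bloom_filter_fp_chance={dataset.bloom_filter_fp_chance}"
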
datahub/ingestion/source/delta_lake/config.py

@@ -13,6 +13,9 @@ from datahub.configuration.source_common import (
 )
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)

 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -35,7 +38,11 @@ class S3(ConfigModel):
     )


-class DeltaLakeSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class DeltaLakeSourceConfig(
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+    StatefulIngestionConfigBase,
+):
     base_path: str = Field(
         description="Path to table (s3 or local file system). If path is not a delta table path "
         "then all subfolders will be scanned to detect and ingest delta tables."
datahub/ingestion/source/delta_lake/report.py

@@ -1,12 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field

-from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
 from datahub.utilities.lossy_collections import LossyList


 @dataclasses.dataclass
-class DeltaLakeSourceReport(SourceReport):
+class DeltaLakeSourceReport(StaleEntityRemovalSourceReport):
     files_scanned = 0
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

datahub/ingestion/source/delta_lake/source.py

@@ -2,7 +2,7 @@ import json
 import logging
 import os
 import time
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, Optional
 from urllib.parse import urlparse

 from deltalake import DeltaTable
@@ -21,7 +21,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags
 from datahub.ingestion.source.aws.s3_util import (
@@ -36,6 +36,12 @@ from datahub.ingestion.source.delta_lake.delta_lake_utils import (
     read_delta_table,
 )
 from datahub.ingestion.source.delta_lake.report import DeltaLakeSourceReport
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
@@ -79,7 +85,7 @@ OPERATION_STATEMENT_TYPES = {
 @config_class(DeltaLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
-class DeltaLakeSource(Source):
+class DeltaLakeSource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
     - Column types and schema associated with each delta table
@@ -100,9 +106,10 @@ class DeltaLakeSource(Source):
     storage_options: Dict[str, str]

     def __init__(self, config: DeltaLakeSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.source_config = config
-        self.report = DeltaLakeSourceReport()
+        self.report: DeltaLakeSourceReport = DeltaLakeSourceReport()
         if self.source_config.is_s3:
             if (
                 self.source_config.s3 is None
@@ -331,6 +338,14 @@ class DeltaLakeSource(Source):
         for folder in os.listdir(path):
             yield os.path.join(path, folder)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.container_WU_creator = ContainerWUCreator(
             self.source_config.platform,
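
Taken together, the delta-lake hunks above wire the source into DataHub's stateful ingestion framework: the config mixes in StatefulIngestionConfigBase, the report extends StaleEntityRemovalSourceReport, the source now derives from StatefulIngestionSourceBase, and get_workunit_processors() registers a StaleEntityRemovalHandler processor. In recipe terms this should make the usual stateful_ingestion block available on the delta-lake source; the option names below follow the common DataHub stateful ingestion config and are an assumption, not something shown in this diff:

    # Hypothetical recipe fragment, expressed as the Python dict a YAML recipe would load into.
    delta_lake_source = {
        "type": "delta-lake",
        "config": {
            "base_path": "s3://my-bucket/delta-tables/",
            "stateful_ingestion": {
                "enabled": True,                # assumed standard flag
                "remove_stale_metadata": True,  # assumed standard flag
            },
        },
    }
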