acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (47):
  1. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2415 -2415
  2. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +47 -46
  3. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/ingest_cli.py +3 -1
  6. datahub/ingestion/api/source_helpers.py +4 -0
  7. datahub/ingestion/run/pipeline.py +109 -143
  8. datahub/ingestion/run/sink_callback.py +77 -0
  9. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  10. datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
  11. datahub/ingestion/source/delta_lake/config.py +8 -1
  12. datahub/ingestion/source/delta_lake/report.py +4 -2
  13. datahub/ingestion/source/delta_lake/source.py +20 -5
  14. datahub/ingestion/source/elastic_search.py +26 -6
  15. datahub/ingestion/source/feast.py +27 -8
  16. datahub/ingestion/source/file.py +1 -1
  17. datahub/ingestion/source/identity/okta.py +1 -2
  18. datahub/ingestion/source/mlflow.py +30 -7
  19. datahub/ingestion/source/mode.py +7 -2
  20. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  21. datahub/ingestion/source/nifi.py +29 -6
  22. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  23. datahub/ingestion/source/pulsar.py +1 -0
  24. datahub/ingestion/source/redash.py +29 -6
  25. datahub/ingestion/source/s3/config.py +3 -1
  26. datahub/ingestion/source/salesforce.py +28 -6
  27. datahub/ingestion/source/slack/slack.py +31 -10
  28. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  29. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  30. datahub/ingestion/source/sql/oracle.py +34 -0
  31. datahub/ingestion/source_config/pulsar.py +3 -1
  32. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  33. datahub/metadata/_schema_classes.py +517 -410
  34. datahub/metadata/_urns/urn_defs.py +1670 -1670
  35. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  36. datahub/metadata/schema.avsc +17362 -17638
  37. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  38. datahub/metadata/schemas/__init__.py +3 -3
  39. datahub/sdk/__init__.py +29 -12
  40. datahub/sdk/_entity.py +18 -1
  41. datahub/sdk/container.py +3 -1
  42. datahub/sdk/dataset.py +5 -3
  43. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  44. datahub/utilities/unified_diff.py +5 -1
  45. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
  46. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
  47. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/elastic_search.py

@@ -32,9 +32,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -188,7 +196,7 @@ class ElasticToSchemaFieldConverter:
 
 
 @dataclass
-class ElasticsearchSourceReport(SourceReport):
+class ElasticsearchSourceReport(StaleEntityRemovalSourceReport):
     index_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
 
@@ -240,7 +248,11 @@ def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     )
 
 
-class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class ElasticsearchSourceConfig(
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+):
     host: str = Field(
         default="localhost:9200", description="The elastic search host URI."
     )
@@ -337,7 +349,7 @@ class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
 @config_class(ElasticsearchSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-class ElasticsearchSource(Source):
+class ElasticsearchSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 
@@ -346,7 +358,7 @@ class ElasticsearchSource(Source):
     """
 
     def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
         self.client = Elasticsearch(
             self.source_config.host,
@@ -361,7 +373,7 @@ class ElasticsearchSource(Source):
             ssl_assert_fingerprint=self.source_config.ssl_assert_fingerprint,
             url_prefix=self.source_config.url_prefix,
         )
-        self.report = ElasticsearchSourceReport()
+        self.report: ElasticsearchSourceReport = ElasticsearchSourceReport()
         self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
         self.platform: str = "elasticsearch"
         self.cat_response: Optional[List[Dict[str, Any]]] = None
@@ -373,6 +385,14 @@ class ElasticsearchSource(Source):
         config = ElasticsearchSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         indices = self.client.indices.get_alias()
         for index in indices:
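
The Elasticsearch source above is one of several in this release (Feast, MLflow, Neo4j, NiFi, PowerBI Report Server, Redash) that move from the plain Source base class to StatefulIngestionSourceBase and register a StaleEntityRemovalHandler. A minimal sketch, assuming the standard DataHub recipe layout, of how stateful ingestion could then be switched on for this source; the pipeline name, host, and sink are illustrative values, not from the diff:

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: "pipeline_name" is what lets stateful ingestion persist
# checkpoints between runs; "remove_stale_metadata" opts into the stale
# entity removal handler the source now registers.
pipeline = Pipeline.create(
    {
        "pipeline_name": "elasticsearch-demo",  # illustrative name
        "source": {
            "type": "elasticsearch",
            "config": {
                "host": "localhost:9200",
                "stateful_ingestion": {
                    "enabled": True,
                    "remove_stale_metadata": True,
                },
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()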
datahub/ingestion/source/feast.py

@@ -20,7 +20,6 @@ from feast.data_source import DataSource
 from pydantic import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,8 +30,16 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     MLFeatureSnapshot,
@@ -86,7 +93,9 @@ _field_type_mapping: Dict[Union[ValueType, feast.types.FeastType], str] = {
 }
 
 
-class FeastRepositorySourceConfig(ConfigModel):
+class FeastRepositorySourceConfig(
+    StatefulIngestionConfigBase,
+):
     path: str = Field(description="Path to Feast repository")
     fs_yaml_file: Optional[str] = Field(
         default=None,
@@ -122,7 +131,7 @@ class FeastRepositorySourceConfig(ConfigModel):
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @dataclass
-class FeastRepositorySource(Source):
+class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
 
@@ -135,13 +144,14 @@ class FeastRepositorySource(Source):
 
     platform = "feast"
     source_config: FeastRepositorySourceConfig
-    report: SourceReport
+    report: StaleEntityRemovalSourceReport
     feature_store: FeatureStore
 
     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
-        self.report = SourceReport()
+        self.ctx = ctx
+        self.report = StaleEntityRemovalSourceReport()
         self.feature_store = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
@@ -158,7 +168,8 @@ class FeastRepositorySource(Source):
 
         if ml_feature_data_type is None:
             self.report.report_warning(
-                parent_name, f"unable to map type {field_type} to metadata schema"
+                "unable to map type",
+                f"unable to map type {field_type} to metadata schema to parent: {parent_name}",
             )
 
             ml_feature_data_type = MLFeatureDataType.UNKNOWN
@@ -456,6 +467,14 @@ class FeastRepositorySource(Source):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         for feature_view in self.feature_store.list_feature_views():
             for entity_name in feature_view.entities:

datahub/ingestion/source/file.py

@@ -351,7 +351,7 @@ class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.add_deserialize_time(deserialize_duration)
                 yield i, item
         except Exception as e:
-            self.report.report_failure(f"path-{i}", str(e))
+            self.report.report_failure(f"{file_status.path}-{i}", str(e))
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:

datahub/ingestion/source/identity/okta.py

@@ -14,7 +14,6 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field
 
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -56,7 +55,7 @@ logger = logging.getLogger(__name__)
 nest_asyncio.apply()
 
 
-class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
+class OktaConfig(StatefulIngestionConfigBase):
     # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
     okta_domain: str = Field(
         description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
datahub/ingestion/source/mlflow.py

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
 
 from mlflow import MlflowClient
 from mlflow.entities import Run
@@ -8,7 +8,9 @@ from mlflow.store.entities import PagedList
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +20,20 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     GlobalTagsClass,
     MLHyperParamClass,
@@ -35,7 +49,7 @@ from datahub.metadata.schema_classes import (
 T = TypeVar("T")
 
 
-class MLflowConfig(EnvConfigMixin):
+class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
         description=(
@@ -79,7 +93,7 @@ class MLflowRegisteredModelStageInfo:
     "Extract descriptions for MLflow Registered Models and Model Versions",
 )
 @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
-class MLflowSource(Source):
+class MLflowSource(StatefulIngestionSourceBase):
     platform = "mlflow"
     registered_model_stages_info = (
         MLflowRegisteredModelStageInfo(
@@ -105,9 +119,10 @@ class MLflowSource(Source):
     )
 
     def __init__(self, ctx: PipelineContext, config: MLflowConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
-        self.report = SourceReport()
+        self.report = StaleEntityRemovalSourceReport()
         self.client = MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
@@ -116,6 +131,14 @@ class MLflowSource(Source):
     def get_report(self) -> SourceReport:
         return self.report
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         yield from self._get_tags_workunits()
         yield from self._get_ml_model_workunits()
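
Each of the new get_workunit_processors hooks returns a list of MetadataWorkUnitProcessor values. As far as the framework is concerned, a processor is essentially a callable that consumes and re-yields the workunit stream, which is why StaleEntityRemovalHandler can expose one as .workunit_processor. A hedged sketch of a custom pass-through processor with that shape (the counting logic is illustrative only):

from typing import Iterable

from datahub.ingestion.api.workunit import MetadataWorkUnit

def counting_processor(
    stream: Iterable[MetadataWorkUnit],
) -> Iterable[MetadataWorkUnit]:
    # Pass every workunit through unchanged while keeping a running count.
    count = 0
    for wu in stream:
        count += 1
        yield wu
    print(f"processed {count} workunits")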
datahub/ingestion/source/mode.py

@@ -23,7 +23,9 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+)
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
@@ -137,7 +139,10 @@ class ModeAPIConfig(ConfigModel):
     )
 
 
-class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
+class ModeConfig(
+    StatefulIngestionConfigBase,
+    DatasetLineageProviderConfigBase,
+):
     # See https://mode.com/developer/api-reference/authentication/
     # for authentication
     connect_uri: str = Field(

datahub/ingestion/source/neo4j/neo4j_source.py

@@ -7,7 +7,9 @@ import pandas as pd
 from neo4j import GraphDatabase
 from pydantic.fields import Field
 
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -17,9 +19,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionReport,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -52,7 +64,7 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }
 
 
-class Neo4jConfig(EnvConfigMixin):
+class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
@@ -60,7 +72,7 @@ class Neo4jConfig(EnvConfigMixin):
 
 
 @dataclass
-class Neo4jSourceReport(SourceReport):
+class Neo4jSourceReport(StatefulIngestionReport):
     obj_failures: int = 0
     obj_created: int = 0
 
@@ -68,7 +80,7 @@ class Neo4jSourceReport(SourceReport):
 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
 @support_status(SupportStatus.CERTIFIED)
-class Neo4jSource(Source):
+class Neo4jSource(StatefulIngestionSourceBase):
     NODE = "node"
     RELATIONSHIP = "relationship"
     PLATFORM = "neo4j"
@@ -76,7 +88,7 @@ class Neo4jSource(Source):
     def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
         self.ctx = ctx
         self.config = config
-        self.report = Neo4jSourceReport()
+        self.report: Neo4jSourceReport = Neo4jSourceReport()
 
     @classmethod
     def create(cls, config_dict, ctx):
@@ -282,6 +294,14 @@ class Neo4jSource(Source):
     def get_relationships(self, record: dict) -> dict:
         return record.get("relationships", None)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"

datahub/ingestion/source/nifi.py

@@ -22,7 +22,9 @@ from requests_gssapi import HTTPSPNEGOAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers
 from datahub.ingestion.api.common import PipelineContext
@@ -33,9 +35,21 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import JobContainerSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
     BrowsePathsV2Class,
@@ -81,7 +95,7 @@ class ProcessGroupKey(ContainerKey):
     process_group_id: str
 
 
-class NifiSourceConfig(EnvConfigMixin):
+class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     site_url: str = Field(
         description="URL for Nifi, ending with /nifi/. e.g. https://mynifi.domain/nifi/"
     )
@@ -452,7 +466,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 
 @dataclass
-class NifiSourceReport(SourceReport):
+class NifiSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
@@ -464,13 +478,14 @@ class NifiSourceReport(SourceReport):
 @config_class(NifiSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.LINEAGE_COARSE, "Supported. See docs for limitations")
-class NifiSource(Source):
+class NifiSource(StatefulIngestionSourceBase):
     config: NifiSourceConfig
     report: NifiSourceReport
 
     def __init__(self, config: NifiSourceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config = config
+        self.ctx = ctx
         self.report = NifiSourceReport()
         self.session = requests.Session()
 
@@ -1151,6 +1166,14 @@ class NifiSource(Source):
         token_response.raise_for_status()
         self.session.headers.update({"Authorization": "Bearer " + token_response.text})
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             self.authenticate()

datahub/ingestion/source/powerbi_report_server/report_server.py

@@ -14,7 +14,9 @@ from requests_ntlm import HttpNtlmAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -25,7 +27,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.powerbi_report_server.constants import (
     API_ENDPOINTS,
@@ -39,6 +41,14 @@ from datahub.ingestion.source.powerbi_report_server.report_server_domain import
     PowerBiReport,
     Report,
 )
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
@@ -58,7 +68,7 @@ from datahub.utilities.lossy_collections import LossyList
 LOGGER = logging.getLogger(__name__)
 
 
-class PowerBiReportServerAPIConfig(EnvConfigMixin):
+class PowerBiReportServerAPIConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     username: str = pydantic.Field(description="Windows account username")
     password: str = pydantic.Field(description="Windows account password")
     workstation_name: str = pydantic.Field(
@@ -475,7 +485,7 @@ class Mapper:
 
 
 @dataclass
-class PowerBiReportServerDashboardSourceReport(SourceReport):
+class PowerBiReportServerDashboardSourceReport(StaleEntityRemovalSourceReport):
     scanned_report: int = 0
     filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
@@ -490,7 +500,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
-class PowerBiReportServerDashboardSource(Source):
+class PowerBiReportServerDashboardSource(StatefulIngestionSourceBase):
     """
     Use this plugin to connect to [PowerBI Report Server](https://powerbi.microsoft.com/en-us/report-server/).
     It extracts the following:
@@ -520,8 +530,9 @@ class PowerBiReportServerDashboardSource(Source):
     def __init__(
         self, config: PowerBiReportServerDashboardSourceConfig, ctx: PipelineContext
     ):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
+        self.ctx = ctx
         self.report = PowerBiReportServerDashboardSourceReport()
         self.auth = PowerBiReportServerAPI(self.source_config).get_auth_credentials
         self.powerbi_client = PowerBiReportServerAPI(self.source_config)
@@ -532,6 +543,14 @@ class PowerBiReportServerDashboardSource(Source):
         config = PowerBiReportServerDashboardSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method

datahub/ingestion/source/pulsar.py

@@ -116,6 +116,7 @@ class PulsarSource(StatefulIngestionSourceBase):
     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.platform: str = "pulsar"
+        self.ctx = ctx
         self.config: PulsarSourceConfig = config
         self.report: PulsarSourceReport = PulsarSourceReport()
 

datahub/ingestion/source/redash.py

@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
@@ -22,8 +22,20 @@ from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -235,7 +247,9 @@ def get_full_qualified_name(platform: str, database_name: str, table_name: str)
     return f"{database_name}.{table_name}"
 
 
-class RedashConfig(ConfigModel):
+class RedashConfig(
+    StatefulIngestionConfigBase,
+):
     # See the Redash API for details
     # https://redash.io/help/user-guide/integrations-and-api/api
     connect_uri: str = Field(
@@ -277,7 +291,7 @@ class RedashConfig(ConfigModel):
 
 
 @dataclass
-class RedashSourceReport(SourceReport):
+class RedashSourceReport(StaleEntityRemovalSourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
     queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
@@ -305,7 +319,7 @@ class RedashSourceReport(SourceReport):
 @config_class(RedashConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class RedashSource(Source):
+class RedashSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 
@@ -316,8 +330,9 @@ class RedashSource(Source):
     platform = "redash"
 
     def __init__(self, ctx: PipelineContext, config: RedashConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config: RedashConfig = config
+        self.ctx = ctx
         self.report: RedashSourceReport = RedashSourceReport()
 
         # Handle trailing slash removal
@@ -724,6 +739,14 @@ class RedashSource(Source):
     def add_config_to_report(self) -> None:
         self.report.api_page_limit = self.config.api_page_limit
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.validate_connection()
         self.add_config_to_report()
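
Conceptually, the StaleEntityRemovalHandler wired into all of these sources compares the entities the previous checkpointed run emitted against the current run's output and soft-deletes the difference. A rough illustration of that idea, with hypothetical URNs and a stand-in helper (this is not DataHub's internal implementation):

# Hypothetical state from the last checkpoint vs. this run's output.
previous_run_urns = {
    "urn:li:dataset:(urn:li:dataPlatform:redash,old_chart,PROD)",
    "urn:li:dataset:(urn:li:dataPlatform:redash,kept_chart,PROD)",
}
current_run_urns = {
    "urn:li:dataset:(urn:li:dataPlatform:redash,kept_chart,PROD)",
}

def soft_delete(urn: str) -> None:
    # Stand-in for emitting a Status(removed=True) aspect for the entity.
    print(f"soft-deleting {urn}")

for stale_urn in previous_run_urns - current_run_urns:
    soft_delete(stale_urn)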
datahub/ingestion/source/s3/config.py

@@ -5,7 +5,9 @@ import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig