acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Note: this release of acryl-datahub has been flagged as potentially problematic.

Files changed (72)
  1. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/METADATA +2449 -2449
  2. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/RECORD +72 -71
  3. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/docker_cli.py +1 -1
  6. datahub/cli/iceberg_cli.py +1 -1
  7. datahub/cli/ingest_cli.py +3 -1
  8. datahub/cli/lite_cli.py +4 -2
  9. datahub/cli/specific/dataproduct_cli.py +1 -1
  10. datahub/configuration/kafka.py +1 -1
  11. datahub/ingestion/api/source_helpers.py +4 -0
  12. datahub/ingestion/fs/s3_fs.py +2 -2
  13. datahub/ingestion/graph/client.py +15 -6
  14. datahub/ingestion/graph/entity_versioning.py +3 -3
  15. datahub/ingestion/run/pipeline.py +109 -143
  16. datahub/ingestion/run/sink_callback.py +77 -0
  17. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  18. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  19. datahub/ingestion/source/csv_enricher.py +2 -2
  20. datahub/ingestion/source/delta_lake/config.py +8 -1
  21. datahub/ingestion/source/delta_lake/report.py +4 -2
  22. datahub/ingestion/source/delta_lake/source.py +20 -5
  23. datahub/ingestion/source/dremio/dremio_api.py +3 -3
  24. datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
  25. datahub/ingestion/source/elastic_search.py +26 -6
  26. datahub/ingestion/source/feast.py +27 -8
  27. datahub/ingestion/source/file.py +1 -1
  28. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  29. datahub/ingestion/source/identity/okta.py +1 -2
  30. datahub/ingestion/source/kafka/kafka.py +1 -1
  31. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  32. datahub/ingestion/source/looker/lookml_source.py +1 -1
  33. datahub/ingestion/source/metabase.py +54 -32
  34. datahub/ingestion/source/mlflow.py +30 -7
  35. datahub/ingestion/source/mode.py +8 -3
  36. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  37. datahub/ingestion/source/nifi.py +29 -6
  38. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  39. datahub/ingestion/source/pulsar.py +3 -2
  40. datahub/ingestion/source/redash.py +29 -6
  41. datahub/ingestion/source/s3/config.py +3 -1
  42. datahub/ingestion/source/salesforce.py +28 -6
  43. datahub/ingestion/source/sigma/sigma.py +1 -1
  44. datahub/ingestion/source/slack/slack.py +31 -10
  45. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  46. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  47. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  48. datahub/ingestion/source/sql/druid.py +1 -5
  49. datahub/ingestion/source/sql/oracle.py +34 -0
  50. datahub/ingestion/source/tableau/tableau.py +2 -1
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -1
  52. datahub/ingestion/source_config/pulsar.py +3 -1
  53. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  54. datahub/lite/duckdb_lite.py +2 -1
  55. datahub/lite/lite_local.py +1 -1
  56. datahub/lite/lite_util.py +4 -3
  57. datahub/metadata/_schema_classes.py +517 -410
  58. datahub/metadata/_urns/urn_defs.py +1670 -1670
  59. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  60. datahub/metadata/schema.avsc +17362 -17638
  61. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  62. datahub/metadata/schemas/__init__.py +3 -3
  63. datahub/sdk/__init__.py +29 -12
  64. datahub/sdk/_entity.py +18 -1
  65. datahub/sdk/container.py +3 -1
  66. datahub/sdk/dataset.py +5 -3
  67. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  68. datahub/utilities/memory_footprint.py +3 -2
  69. datahub/utilities/unified_diff.py +5 -1
  70. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/LICENSE +0 -0
  71. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/entry_points.txt +0 -0
  72. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/top_level.txt +0 -0
@@ -271,12 +271,12 @@ class DremioAPIOperations:
                 self.cancel_query(job_id)
                 raise DremioAPIException(
                     f"Query execution timed out after {timeout} seconds"
-                )
+                ) from None
             except RuntimeError as e:
-                raise DremioAPIException(f"{str(e)}")
+                raise DremioAPIException() from e

         except requests.RequestException as e:
-            raise DremioAPIException(f"Error executing query: {str(e)}")
+            raise DremioAPIException("Error executing query") from e

     def fetch_results(self, job_id: str) -> List[Dict]:
         """Fetch job results with status checking"""
@@ -168,8 +168,9 @@ class DremioAspects:
         )

     def get_container_urn(
-        self, name: Optional[str] = None, path: Optional[List[str]] = []
+        self, name: Optional[str] = None, path: Optional[List[str]] = None
     ) -> str:
+        path = path or []
         container_key = self.get_container_key(name, path)
         return container_key.as_urn()

@@ -32,9 +32,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -188,7 +196,7 @@ class ElasticToSchemaFieldConverter:


 @dataclass
-class ElasticsearchSourceReport(SourceReport):
+class ElasticsearchSourceReport(StaleEntityRemovalSourceReport):
     index_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)

@@ -240,7 +248,11 @@ def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     )


-class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class ElasticsearchSourceConfig(
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+):
     host: str = Field(
         default="localhost:9200", description="The elastic search host URI."
     )
@@ -337,7 +349,7 @@ class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
 @config_class(ElasticsearchSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-class ElasticsearchSource(Source):
+class ElasticsearchSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:

@@ -346,7 +358,7 @@ class ElasticsearchSource(Source):
     """

     def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
         self.client = Elasticsearch(
             self.source_config.host,
@@ -361,7 +373,7 @@ class ElasticsearchSource(Source):
             ssl_assert_fingerprint=self.source_config.ssl_assert_fingerprint,
             url_prefix=self.source_config.url_prefix,
         )
-        self.report = ElasticsearchSourceReport()
+        self.report: ElasticsearchSourceReport = ElasticsearchSourceReport()
         self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
         self.platform: str = "elasticsearch"
         self.cat_response: Optional[List[Dict[str, Any]]] = None
@@ -373,6 +385,14 @@ class ElasticsearchSource(Source):
         config = ElasticsearchSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         indices = self.client.indices.get_alias()
         for index in indices:
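
Several sources in this release (Elasticsearch above, and Feast, MLflow, Neo4j, and NiFi below) move from Source to StatefulIngestionSourceBase and register a StaleEntityRemovalHandler workunit processor. In practice this is what exposes the standard stateful-ingestion block for these sources. A minimal sketch, assuming ElasticsearchSourceConfig from the module above; the field names ("stateful_ingestion", "enabled", "remove_stale_metadata") follow the common DataHub stateful-ingestion config and are an assumption, not something introduced by this diff:

    # Sketch: the recipe-level knobs this refactor exposes for the Elasticsearch source.
    config = ElasticsearchSourceConfig.parse_obj(
        {
            "host": "localhost:9200",
            "stateful_ingestion": {
                "enabled": True,
                "remove_stale_metadata": True,
            },
        }
    )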
@@ -20,7 +20,6 @@ from feast.data_source import DataSource
 from pydantic import Field

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,8 +30,16 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     MLFeatureSnapshot,
@@ -86,7 +93,9 @@ _field_type_mapping: Dict[Union[ValueType, feast.types.FeastType], str] = {
 }


-class FeastRepositorySourceConfig(ConfigModel):
+class FeastRepositorySourceConfig(
+    StatefulIngestionConfigBase,
+):
     path: str = Field(description="Path to Feast repository")
     fs_yaml_file: Optional[str] = Field(
         default=None,
@@ -122,7 +131,7 @@ class FeastRepositorySourceConfig(ConfigModel):
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @dataclass
-class FeastRepositorySource(Source):
+class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

@@ -135,13 +144,14 @@ class FeastRepositorySource(Source):

     platform = "feast"
     source_config: FeastRepositorySourceConfig
-    report: SourceReport
+    report: StaleEntityRemovalSourceReport
     feature_store: FeatureStore

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
-        self.report = SourceReport()
+        self.ctx = ctx
+        self.report = StaleEntityRemovalSourceReport()
         self.feature_store = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
@@ -158,7 +168,8 @@ class FeastRepositorySource(Source):

         if ml_feature_data_type is None:
             self.report.report_warning(
-                parent_name, f"unable to map type {field_type} to metadata schema"
+                "unable to map type",
+                f"unable to map type {field_type} to metadata schema to parent: {parent_name}",
             )

             ml_feature_data_type = MLFeatureDataType.UNKNOWN
@@ -456,6 +467,14 @@ class FeastRepositorySource(Source):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         for feature_view in self.feature_store.list_feature_views():
             for entity_name in feature_view.entities:
@@ -351,7 +351,7 @@ class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.add_deserialize_time(deserialize_duration)
                 yield i, item
         except Exception as e:
-            self.report.report_failure(f"path-{i}", str(e))
+            self.report.report_failure(f"{file_status.path}-{i}", str(e))

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -130,8 +130,9 @@ class DatahubExecutionRequestCleanup:
         )

     def _scroll_execution_requests(
-        self, overrides: Dict[str, Any] = {}
+        self, overrides: Optional[Dict[str, Any]] = None
     ) -> Iterator[CleanupRecord]:
+        overrides = overrides or {}
         headers: Dict[str, Any] = {
             "Accept": "application/json",
             "Content-Type": "application/json",
@@ -14,7 +14,6 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field

-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -56,7 +55,7 @@ logger = logging.getLogger(__name__)
 nest_asyncio.apply()


-class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
+class OktaConfig(StatefulIngestionConfigBase):
     # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
     okta_domain: str = Field(
         description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
@@ -272,7 +272,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             return schema_registry_class.create(config, report)
         except Exception as e:
             logger.debug(e, exc_info=e)
-            raise ImportError(config.schema_registry_class)
+            raise ImportError(config.schema_registry_class) from e

     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
@@ -33,14 +33,14 @@ class LookerViewFileLoader:
         base_projects_folder: Dict[str, pathlib.Path],
         reporter: LookMLSourceReport,
         source_config: LookMLSourceConfig,
-        manifest_constants: Dict[str, LookerConstant] = {},
+        manifest_constants: Optional[Dict[str, LookerConstant]] = None,
     ) -> None:
         self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
         self._root_project_name = root_project_name
         self._base_projects_folder = base_projects_folder
         self.reporter = reporter
         self.source_config = source_config
-        self.manifest_constants = manifest_constants
+        self.manifest_constants = manifest_constants or {}

     def _load_viewfile(
         self, project_name: str, path: str, reporter: LookMLSourceReport
@@ -501,7 +501,7 @@ class LookMLSource(StatefulIngestionSourceBase):
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
-            )
+            ) from None

     def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
         manifest_file = folder / "manifest.lkml"
@@ -69,9 +69,19 @@ class MetabaseConfig(DatasetLineageProviderConfigBase, StatefulIngestionConfigBase):
         default=None,
         description="optional URL to use in links (if `connect_uri` is only for ingestion)",
     )
-    username: Optional[str] = Field(default=None, description="Metabase username.")
+    username: Optional[str] = Field(
+        default=None,
+        description="Metabase username, used when an API key is not provided.",
+    )
     password: Optional[pydantic.SecretStr] = Field(
-        default=None, description="Metabase password."
+        default=None,
+        description="Metabase password, used when an API key is not provided.",
+    )
+
+    # https://www.metabase.com/learn/metabase-basics/administration/administration-and-operation/metabase-api#example-get-request
+    api_key: Optional[pydantic.SecretStr] = Field(
+        default=None,
+        description="Metabase API key. If provided, the username and password will be ignored. Recommended method.",
     )
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
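
With the new api_key field, a Metabase configuration no longer needs a username and password. A minimal sketch of the two authentication modes, assuming MetabaseConfig from the module above; the values are placeholders:

    # Sketch: the two authentication modes MetabaseConfig now supports.
    api_key_config = MetabaseConfig.parse_obj(
        {
            "connect_uri": "https://metabase.example.com",
            "api_key": "example-api-key",  # preferred; username/password are ignored when set
        }
    )

    password_config = MetabaseConfig.parse_obj(
        {
            "connect_uri": "https://metabase.example.com",
            "username": "ingest-bot@example.com",  # used only when api_key is not provided
            "password": "example-password",
        }
    )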
@@ -178,30 +188,40 @@ class MetabaseSource(StatefulIngestionSourceBase):
         self.source_config: MetabaseConfig = config

     def setup_session(self) -> None:
-        login_response = requests.post(
-            f"{self.config.connect_uri}/api/session",
-            None,
-            {
-                "username": self.config.username,
-                "password": (
-                    self.config.password.get_secret_value()
-                    if self.config.password
-                    else None
-                ),
-            },
-        )
+        self.session = requests.session()
+        if self.config.api_key:
+            self.session.headers.update(
+                {
+                    "x-api-key": self.config.api_key.get_secret_value(),
+                    "Content-Type": "application/json",
+                    "Accept": "*/*",
+                }
+            )
+        else:
+            # If no API key is provided, generate a session token using username and password.
+            login_response = requests.post(
+                f"{self.config.connect_uri}/api/session",
+                None,
+                {
+                    "username": self.config.username,
+                    "password": (
+                        self.config.password.get_secret_value()
+                        if self.config.password
+                        else None
+                    ),
+                },
+            )

-        login_response.raise_for_status()
-        self.access_token = login_response.json().get("id", "")
+            login_response.raise_for_status()
+            self.access_token = login_response.json().get("id", "")

-        self.session = requests.session()
-        self.session.headers.update(
-            {
-                "X-Metabase-Session": f"{self.access_token}",
-                "Content-Type": "application/json",
-                "Accept": "*/*",
-            }
-        )
+            self.session.headers.update(
+                {
+                    "X-Metabase-Session": f"{self.access_token}",
+                    "Content-Type": "application/json",
+                    "Accept": "*/*",
+                }
+            )

         # Test the connection
         try:
@@ -217,15 +237,17 @@ class MetabaseSource(StatefulIngestionSourceBase):
         )

     def close(self) -> None:
-        response = requests.delete(
-            f"{self.config.connect_uri}/api/session",
-            headers={"X-Metabase-Session": self.access_token},
-        )
-        if response.status_code not in (200, 204):
-            self.report.report_failure(
-                title="Unable to Log User Out",
-                message=f"Unable to logout for user {self.config.username}",
+        # API key authentication does not require session closure.
+        if not self.config.api_key:
+            response = requests.delete(
+                f"{self.config.connect_uri}/api/session",
+                headers={"X-Metabase-Session": self.access_token},
             )
+            if response.status_code not in (200, 204):
+                self.report.report_failure(
+                    title="Unable to Log User Out",
+                    message=f"Unable to logout for user {self.config.username}",
+                )
         super().close()

     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union

 from mlflow import MlflowClient
 from mlflow.entities import Run
@@ -8,7 +8,9 @@ from mlflow.store.entities import PagedList
 from pydantic.fields import Field

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +20,20 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     GlobalTagsClass,
     MLHyperParamClass,
@@ -35,7 +49,7 @@ from datahub.metadata.schema_classes import (
 T = TypeVar("T")


-class MLflowConfig(EnvConfigMixin):
+class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
         description=(
@@ -79,7 +93,7 @@ class MLflowRegisteredModelStageInfo:
     "Extract descriptions for MLflow Registered Models and Model Versions",
 )
 @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
-class MLflowSource(Source):
+class MLflowSource(StatefulIngestionSourceBase):
     platform = "mlflow"
     registered_model_stages_info = (
         MLflowRegisteredModelStageInfo(
@@ -105,9 +119,10 @@ class MLflowSource(Source):
     )

     def __init__(self, ctx: PipelineContext, config: MLflowConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
-        self.report = SourceReport()
+        self.report = StaleEntityRemovalSourceReport()
         self.client = MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
@@ -116,6 +131,14 @@ class MLflowSource(Source):
     def get_report(self) -> SourceReport:
         return self.report

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         yield from self._get_tags_workunits()
         yield from self._get_ml_model_workunits()
@@ -23,7 +23,9 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+)
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
@@ -137,7 +139,10 @@ class ModeAPIConfig(ConfigModel):
     )


-class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase):
+class ModeConfig(
+    StatefulIngestionConfigBase,
+    DatasetLineageProviderConfigBase,
+):
     # See https://mode.com/developer/api-reference/authentication/
     # for authentication
     connect_uri: str = Field(
@@ -1489,7 +1494,7 @@ class ModeSource(StatefulIngestionSourceBase):
             sleep_time = error_response.headers.get("retry-after")
             if sleep_time is not None:
                 time.sleep(float(sleep_time))
-            raise HTTPError429
+            raise HTTPError429 from None

         raise http_error

@@ -7,7 +7,9 @@ import pandas as pd
 from neo4j import GraphDatabase
 from pydantic.fields import Field

-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -17,9 +19,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionReport,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -52,7 +64,7 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }


-class Neo4jConfig(EnvConfigMixin):
+class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
@@ -60,7 +72,7 @@ class Neo4jConfig(EnvConfigMixin):


 @dataclass
-class Neo4jSourceReport(SourceReport):
+class Neo4jSourceReport(StatefulIngestionReport):
     obj_failures: int = 0
     obj_created: int = 0

@@ -68,7 +80,7 @@ class Neo4jSourceReport(SourceReport):
 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
 @support_status(SupportStatus.CERTIFIED)
-class Neo4jSource(Source):
+class Neo4jSource(StatefulIngestionSourceBase):
     NODE = "node"
     RELATIONSHIP = "relationship"
     PLATFORM = "neo4j"
@@ -76,7 +88,7 @@ class Neo4jSource(Source):
     def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
         self.ctx = ctx
         self.config = config
-        self.report = Neo4jSourceReport()
+        self.report: Neo4jSourceReport = Neo4jSourceReport()

     @classmethod
     def create(cls, config_dict, ctx):
@@ -282,6 +294,14 @@ class Neo4jSource(Source):
     def get_relationships(self, record: dict) -> dict:
         return record.get("relationships", None)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
@@ -22,7 +22,9 @@ from requests_gssapi import HTTPSPNEGOAuth

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers
 from datahub.ingestion.api.common import PipelineContext
@@ -33,9 +35,21 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import JobContainerSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
     BrowsePathsV2Class,
@@ -81,7 +95,7 @@ class ProcessGroupKey(ContainerKey):
     process_group_id: str


-class NifiSourceConfig(EnvConfigMixin):
+class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     site_url: str = Field(
         description="URL for Nifi, ending with /nifi/. e.g. https://mynifi.domain/nifi/"
     )
@@ -452,7 +466,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:


 @dataclass
-class NifiSourceReport(SourceReport):
+class NifiSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = field(default_factory=LossyList)

     def report_dropped(self, ent_name: str) -> None:
@@ -464,13 +478,14 @@ class NifiSourceReport(SourceReport):
 @config_class(NifiSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.LINEAGE_COARSE, "Supported. See docs for limitations")
-class NifiSource(Source):
+class NifiSource(StatefulIngestionSourceBase):
     config: NifiSourceConfig
     report: NifiSourceReport

     def __init__(self, config: NifiSourceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config = config
+        self.ctx = ctx
         self.report = NifiSourceReport()
         self.session = requests.Session()

@@ -1151,6 +1166,14 @@ class NifiSource(Source):
         token_response.raise_for_status()
         self.session.headers.update({"Authorization": "Bearer " + token_response.text})

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             self.authenticate()