acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2415 -2415
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +47 -46
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/ingest_cli.py +3 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/run/pipeline.py +109 -143
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -2
- datahub/ingestion/source/mlflow.py +30 -7
- datahub/ingestion/source/mode.py +7 -2
- datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
- datahub/ingestion/source/nifi.py +29 -6
- datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
- datahub/ingestion/source/pulsar.py +1 -0
- datahub/ingestion/source/redash.py +29 -6
- datahub/ingestion/source/s3/config.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -6
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/sql/oracle.py +34 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/metadata/_schema_classes.py +517 -410
- datahub/metadata/_urns/urn_defs.py +1670 -1670
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +17362 -17638
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +29 -12
- datahub/sdk/_entity.py +18 -1
- datahub/sdk/container.py +3 -1
- datahub/sdk/dataset.py +5 -3
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/elastic_search.py
CHANGED
@@ -32,9 +32,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -188,7 +196,7 @@ class ElasticToSchemaFieldConverter:
 
 
 @dataclass
-class ElasticsearchSourceReport(
+class ElasticsearchSourceReport(StaleEntityRemovalSourceReport):
     index_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
 
@@ -240,7 +248,11 @@ def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     )
 
 
-class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class ElasticsearchSourceConfig(
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+):
     host: str = Field(
         default="localhost:9200", description="The elastic search host URI."
     )
@@ -337,7 +349,7 @@ class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
 @config_class(ElasticsearchSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-class ElasticsearchSource(Source):
+class ElasticsearchSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 
@@ -346,7 +358,7 @@ class ElasticsearchSource(Source):
     """
 
     def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
         self.client = Elasticsearch(
             self.source_config.host,
@@ -361,7 +373,7 @@ class ElasticsearchSource(Source):
             ssl_assert_fingerprint=self.source_config.ssl_assert_fingerprint,
             url_prefix=self.source_config.url_prefix,
         )
-        self.report = ElasticsearchSourceReport()
+        self.report: ElasticsearchSourceReport = ElasticsearchSourceReport()
         self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
         self.platform: str = "elasticsearch"
         self.cat_response: Optional[List[Dict[str, Any]]] = None
@@ -373,6 +385,14 @@ class ElasticsearchSource(Source):
         config = ElasticsearchSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         indices = self.client.indices.get_alias()
         for index in indices:
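The same conversion pattern repeats across the elastic_search, feast, mlflow, neo4j, nifi, powerbi_report_server, and redash hunks that follow: the config class gains StatefulIngestionConfigBase, the source extends StatefulIngestionSourceBase, __init__ passes the config to the base class, the report becomes a stale-entity-removal-capable report, and get_workunit_processors registers StaleEntityRemovalHandler. Below is a minimal consolidated sketch of that pattern, not code shipped in the package; ExampleSource and ExampleSourceConfig are placeholder names, and only the imports and calls that appear verbatim in these hunks are used.

from typing import Iterable, List, Optional

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalHandler,
    StaleEntityRemovalSourceReport,
)
from datahub.ingestion.source.state.stateful_ingestion_base import (
    StatefulIngestionConfigBase,
    StatefulIngestionSourceBase,
)


class ExampleSourceConfig(StatefulIngestionConfigBase):
    # Placeholder config: real sources keep their existing fields and mixins
    # and simply add StatefulIngestionConfigBase to the base-class list.
    pass


class ExampleSource(StatefulIngestionSourceBase):
    def __init__(self, config: ExampleSourceConfig, ctx: PipelineContext):
        # The stateful base class takes the config as well as the context.
        super().__init__(config, ctx)
        self.config = config
        self.ctx = ctx
        self.report = StaleEntityRemovalSourceReport()

    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
        # Keep the default processors and add stale-entity removal on top.
        return [
            *super().get_workunit_processors(),
            StaleEntityRemovalHandler.create(
                self, self.config, self.ctx
            ).workunit_processor,
        ]

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        # Emit metadata work units exactly as before; the processors above
        # take care of soft-deleting entities that are no longer seen.
        return []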
datahub/ingestion/source/feast.py
CHANGED
@@ -20,7 +20,6 @@ from feast.data_source import DataSource
 from pydantic import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,8 +30,16 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     MLFeatureSnapshot,
@@ -86,7 +93,9 @@ _field_type_mapping: Dict[Union[ValueType, feast.types.FeastType], str] = {
 }
 
 
-class FeastRepositorySourceConfig(ConfigModel):
+class FeastRepositorySourceConfig(
+    StatefulIngestionConfigBase,
+):
     path: str = Field(description="Path to Feast repository")
     fs_yaml_file: Optional[str] = Field(
         default=None,
@@ -122,7 +131,7 @@ class FeastRepositorySourceConfig(ConfigModel):
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @dataclass
-class FeastRepositorySource(Source):
+class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
 
@@ -135,13 +144,14 @@ class FeastRepositorySource(Source):
 
     platform = "feast"
     source_config: FeastRepositorySourceConfig
-    report:
+    report: StaleEntityRemovalSourceReport
     feature_store: FeatureStore
 
     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
-        self.
+        self.ctx = ctx
+        self.report = StaleEntityRemovalSourceReport()
         self.feature_store = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
@@ -158,7 +168,8 @@ class FeastRepositorySource(Source):
 
         if ml_feature_data_type is None:
             self.report.report_warning(
-
+                "unable to map type",
+                f"unable to map type {field_type} to metadata schema to parent: {parent_name}",
             )
 
             ml_feature_data_type = MLFeatureDataType.UNKNOWN
@@ -456,6 +467,14 @@ class FeastRepositorySource(Source):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         for feature_view in self.feature_store.list_feature_views():
             for entity_name in feature_view.entities:
datahub/ingestion/source/file.py
CHANGED
@@ -351,7 +351,7 @@ class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.add_deserialize_time(deserialize_duration)
                 yield i, item
             except Exception as e:
-                self.report.report_failure(f"path-{i}", str(e))
+                self.report.report_failure(f"{file_status.path}-{i}", str(e))
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/identity/okta.py
CHANGED
@@ -14,7 +14,6 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field
 
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -56,7 +55,7 @@ logger = logging.getLogger(__name__)
 nest_asyncio.apply()
 
 
-class OktaConfig(StatefulIngestionConfigBase
+class OktaConfig(StatefulIngestionConfigBase):
     # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
     okta_domain: str = Field(
         description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
datahub/ingestion/source/mlflow.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
 
 from mlflow import MlflowClient
 from mlflow.entities import Run
@@ -8,7 +8,9 @@ from mlflow.store.entities import PagedList
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.source_common import
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +20,20 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     GlobalTagsClass,
     MLHyperParamClass,
@@ -35,7 +49,7 @@ from datahub.metadata.schema_classes import (
 T = TypeVar("T")
 
 
-class MLflowConfig(EnvConfigMixin):
+class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
         description=(
@@ -79,7 +93,7 @@ class MLflowRegisteredModelStageInfo:
     "Extract descriptions for MLflow Registered Models and Model Versions",
 )
 @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
-class MLflowSource(Source):
+class MLflowSource(StatefulIngestionSourceBase):
     platform = "mlflow"
     registered_model_stages_info = (
         MLflowRegisteredModelStageInfo(
@@ -105,9 +119,10 @@ class MLflowSource(Source):
     )
 
     def __init__(self, ctx: PipelineContext, config: MLflowConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
-        self.report =
+        self.report = StaleEntityRemovalSourceReport()
         self.client = MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
@@ -116,6 +131,14 @@ class MLflowSource(Source):
     def get_report(self) -> SourceReport:
         return self.report
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         yield from self._get_tags_workunits()
         yield from self._get_ml_model_workunits()
datahub/ingestion/source/mode.py
CHANGED
@@ -23,7 +23,9 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+)
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
@@ -137,7 +139,10 @@ class ModeAPIConfig(ConfigModel):
     )
 
 
-class ModeConfig(
+class ModeConfig(
+    StatefulIngestionConfigBase,
+    DatasetLineageProviderConfigBase,
+):
     # See https://mode.com/developer/api-reference/authentication/
     # for authentication
     connect_uri: str = Field(
datahub/ingestion/source/neo4j/neo4j_source.py
CHANGED
@@ -7,7 +7,9 @@ import pandas as pd
 from neo4j import GraphDatabase
 from pydantic.fields import Field
 
-from datahub.configuration.source_common import
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -17,9 +19,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionReport,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -52,7 +64,7 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }
 
 
-class Neo4jConfig(EnvConfigMixin):
+class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
@@ -60,7 +72,7 @@ class Neo4jConfig(EnvConfigMixin):
 
 
 @dataclass
-class Neo4jSourceReport(SourceReport):
+class Neo4jSourceReport(StatefulIngestionReport):
     obj_failures: int = 0
     obj_created: int = 0
 
@@ -68,7 +80,7 @@ class Neo4jSourceReport(SourceReport):
 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
 @support_status(SupportStatus.CERTIFIED)
-class Neo4jSource(Source):
+class Neo4jSource(StatefulIngestionSourceBase):
     NODE = "node"
     RELATIONSHIP = "relationship"
     PLATFORM = "neo4j"
@@ -76,7 +88,7 @@ class Neo4jSource(Source):
     def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
         self.ctx = ctx
         self.config = config
-        self.report = Neo4jSourceReport()
+        self.report: Neo4jSourceReport = Neo4jSourceReport()
 
     @classmethod
     def create(cls, config_dict, ctx):
@@ -282,6 +294,14 @@ class Neo4jSource(Source):
     def get_relationships(self, record: dict) -> dict:
         return record.get("relationships", None)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
datahub/ingestion/source/nifi.py
CHANGED
@@ -22,7 +22,9 @@ from requests_gssapi import HTTPSPNEGOAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers
 from datahub.ingestion.api.common import PipelineContext
@@ -33,9 +35,21 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import JobContainerSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
     BrowsePathsV2Class,
@@ -81,7 +95,7 @@ class ProcessGroupKey(ContainerKey):
     process_group_id: str
 
 
-class NifiSourceConfig(EnvConfigMixin):
+class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     site_url: str = Field(
         description="URL for Nifi, ending with /nifi/. e.g. https://mynifi.domain/nifi/"
     )
@@ -452,7 +466,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 
 @dataclass
-class NifiSourceReport(SourceReport):
+class NifiSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
@@ -464,13 +478,14 @@ class NifiSourceReport(SourceReport):
 @config_class(NifiSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.LINEAGE_COARSE, "Supported. See docs for limitations")
-class NifiSource(Source):
+class NifiSource(StatefulIngestionSourceBase):
     config: NifiSourceConfig
     report: NifiSourceReport
 
     def __init__(self, config: NifiSourceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config = config
+        self.ctx = ctx
         self.report = NifiSourceReport()
         self.session = requests.Session()
 
@@ -1151,6 +1166,14 @@ class NifiSource(Source):
         token_response.raise_for_status()
         self.session.headers.update({"Authorization": "Bearer " + token_response.text})
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             self.authenticate()
datahub/ingestion/source/powerbi_report_server/report_server.py
CHANGED
@@ -14,7 +14,9 @@ from requests_ntlm import HttpNtlmAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -25,7 +27,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.powerbi_report_server.constants import (
     API_ENDPOINTS,
@@ -39,6 +41,14 @@ from datahub.ingestion.source.powerbi_report_server.report_server_domain import
     PowerBiReport,
     Report,
 )
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
@@ -58,7 +68,7 @@ from datahub.utilities.lossy_collections import LossyList
 LOGGER = logging.getLogger(__name__)
 
 
-class PowerBiReportServerAPIConfig(EnvConfigMixin):
+class PowerBiReportServerAPIConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     username: str = pydantic.Field(description="Windows account username")
     password: str = pydantic.Field(description="Windows account password")
     workstation_name: str = pydantic.Field(
@@ -475,7 +485,7 @@ class Mapper:
 
 
 @dataclass
-class PowerBiReportServerDashboardSourceReport(SourceReport):
+class PowerBiReportServerDashboardSourceReport(StaleEntityRemovalSourceReport):
     scanned_report: int = 0
     filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
@@ -490,7 +500,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
-class PowerBiReportServerDashboardSource(Source):
+class PowerBiReportServerDashboardSource(StatefulIngestionSourceBase):
     """
     Use this plugin to connect to [PowerBI Report Server](https://powerbi.microsoft.com/en-us/report-server/).
     It extracts the following:
@@ -520,8 +530,9 @@ class PowerBiReportServerDashboardSource(Source):
     def __init__(
         self, config: PowerBiReportServerDashboardSourceConfig, ctx: PipelineContext
     ):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
+        self.ctx = ctx
         self.report = PowerBiReportServerDashboardSourceReport()
         self.auth = PowerBiReportServerAPI(self.source_config).get_auth_credentials
         self.powerbi_client = PowerBiReportServerAPI(self.source_config)
@@ -532,6 +543,14 @@ class PowerBiReportServerDashboardSource(Source):
         config = PowerBiReportServerDashboardSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method
datahub/ingestion/source/pulsar.py
CHANGED
@@ -116,6 +116,7 @@ class PulsarSource(StatefulIngestionSourceBase):
     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.platform: str = "pulsar"
+        self.ctx = ctx
         self.config: PulsarSourceConfig = config
         self.report: PulsarSourceReport = PulsarSourceReport()
 
datahub/ingestion/source/redash.py
CHANGED
@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
@@ -22,8 +22,20 @@ from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -235,7 +247,9 @@ def get_full_qualified_name(platform: str, database_name: str, table_name: str)
     return f"{database_name}.{table_name}"
 
 
-class RedashConfig(ConfigModel):
+class RedashConfig(
+    StatefulIngestionConfigBase,
+):
     # See the Redash API for details
     # https://redash.io/help/user-guide/integrations-and-api/api
     connect_uri: str = Field(
@@ -277,7 +291,7 @@ class RedashConfig(ConfigModel):
 
 
 @dataclass
-class RedashSourceReport(SourceReport):
+class RedashSourceReport(StaleEntityRemovalSourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
     queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
@@ -305,7 +319,7 @@ class RedashSourceReport(SourceReport):
 @config_class(RedashConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class RedashSource(Source):
+class RedashSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 
@@ -316,8 +330,9 @@ class RedashSource(Source):
     platform = "redash"
 
     def __init__(self, ctx: PipelineContext, config: RedashConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config: RedashConfig = config
+        self.ctx = ctx
         self.report: RedashSourceReport = RedashSourceReport()
 
         # Handle trailing slash removal
@@ -724,6 +739,14 @@ class RedashSource(Source):
     def add_config_to_report(self) -> None:
         self.report.api_page_limit = self.config.api_page_limit
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.validate_connection()
         self.add_config_to_report()
datahub/ingestion/source/s3/config.py
CHANGED
@@ -5,7 +5,9 @@ import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig