acryl-datahub 1.0.0rc4__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2502 -2502
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +62 -59
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/ingest_cli.py +3 -1
- datahub/emitter/mcp_builder.py +4 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/run/pipeline.py +109 -143
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -2
- datahub/ingestion/source/mlflow.py +30 -7
- datahub/ingestion/source/mode.py +7 -2
- datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
- datahub/ingestion/source/nifi.py +29 -6
- datahub/ingestion/source/openapi_parser.py +46 -14
- datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
- datahub/ingestion/source/pulsar.py +1 -0
- datahub/ingestion/source/redash.py +29 -6
- datahub/ingestion/source/s3/config.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -6
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/sql/oracle.py +34 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/metadata/_schema_classes.py +534 -410
- datahub/metadata/_urns/urn_defs.py +1670 -1670
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +17379 -17637
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/MetadataChangeEvent.avsc +13 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +29 -12
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_entity.py +20 -1
- datahub/sdk/_shared.py +163 -13
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +23 -5
- datahub/sdk/dataset.py +109 -17
- datahub/sdk/main_client.py +17 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/split_statements.py +20 -13
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
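The dominant theme in the source diffs below is the rollout of stateful ingestion across many connectors (elastic_search, feast, mlflow, mode, neo4j, nifi, okta, powerbi_report_server, pulsar): each config class mixes in StatefulIngestionConfigBase and each source registers a StaleEntityRemovalHandler. As a hedged sketch of what this enables (placeholder host and server values, nifi chosen arbitrarily; exact config fields are in each source's docs), the recipe below turns on stale-entity removal; a stable pipeline_name is what lets state persist between runs:

from datahub.ingestion.run.pipeline import Pipeline

# A minimal sketch, not a tested recipe: field values are placeholders.
pipeline = Pipeline.create(
    {
        # A stable pipeline_name keys the checkpoint state between runs.
        "pipeline_name": "nifi-prod",
        "source": {
            "type": "nifi",
            "config": {
                "site_url": "https://mynifi.domain/nifi/",  # placeholder
                "stateful_ingestion": {
                    # Entities seen in the last run but missing from this one
                    # are soft-deleted by the StaleEntityRemovalHandler.
                    "enabled": True,
                    "remove_stale_metadata": True,
                },
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder
        },
    }
)
pipeline.run()
pipeline.raise_from_status()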
datahub/ingestion/source/elastic_search.py
CHANGED

@@ -32,9 +32,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,

@@ -188,7 +196,7 @@ class ElasticToSchemaFieldConverter:
 
 
 @dataclass
-class ElasticsearchSourceReport(SourceReport):
+class ElasticsearchSourceReport(StaleEntityRemovalSourceReport):
     index_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
 

@@ -240,7 +248,11 @@ def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     )
 
 
-class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class ElasticsearchSourceConfig(
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+):
     host: str = Field(
         default="localhost:9200", description="The elastic search host URI."
     )

@@ -337,7 +349,7 @@ class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
 @config_class(ElasticsearchSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-class ElasticsearchSource(Source):
+class ElasticsearchSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 

@@ -346,7 +358,7 @@ class ElasticsearchSource(Source):
     """
 
     def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
         self.client = Elasticsearch(
             self.source_config.host,

@@ -361,7 +373,7 @@ class ElasticsearchSource(Source):
             ssl_assert_fingerprint=self.source_config.ssl_assert_fingerprint,
             url_prefix=self.source_config.url_prefix,
         )
-        self.report = ElasticsearchSourceReport()
+        self.report: ElasticsearchSourceReport = ElasticsearchSourceReport()
         self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
         self.platform: str = "elasticsearch"
         self.cat_response: Optional[List[Dict[str, Any]]] = None

@@ -373,6 +385,14 @@ class ElasticsearchSource(Source):
         config = ElasticsearchSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         indices = self.client.indices.get_alias()
         for index in indices:
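The same three-part pattern repeats in feast.py, mlflow.py, neo4j_source.py, nifi.py, and report_server.py below: the config class mixes in StatefulIngestionConfigBase, the source extends StatefulIngestionSourceBase, and get_workunit_processors appends the stale-entity handler's processor. Conceptually, a workunit processor is a callable that rewrites the workunit stream; the sketch below (illustrative stand-in types, not the framework's actual code) shows how a list like the one returned above would be applied, and why None entries are tolerated:

from typing import Callable, Iterable, List, Optional

WorkUnit = str  # stand-in for MetadataWorkUnit in this sketch
Processor = Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]]

def apply_processors(
    stream: Iterable[WorkUnit], processors: List[Optional[Processor]]
) -> Iterable[WorkUnit]:
    # Each processor wraps the stream; None entries are skipped, which is
    # why get_workunit_processors returns List[Optional[...]].
    for processor in processors:
        if processor is not None:
            stream = processor(stream)
    return stream

def tag_processor(stream: Iterable[WorkUnit]) -> Iterable[WorkUnit]:
    # Example processor: observe and pass through every workunit.
    for wu in stream:
        yield f"{wu} (seen)"

print(list(apply_processors(["wu1", "wu2"], [tag_processor, None])))
# ['wu1 (seen)', 'wu2 (seen)']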
datahub/ingestion/source/feast.py
CHANGED

@@ -20,7 +20,6 @@ from feast.data_source import DataSource
 from pydantic import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (

@@ -31,8 +30,16 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     MLFeatureSnapshot,

@@ -86,7 +93,9 @@ _field_type_mapping: Dict[Union[ValueType, feast.types.FeastType], str] = {
 }
 
 
-class FeastRepositorySourceConfig(ConfigModel):
+class FeastRepositorySourceConfig(
+    StatefulIngestionConfigBase,
+):
     path: str = Field(description="Path to Feast repository")
     fs_yaml_file: Optional[str] = Field(
         default=None,

@@ -122,7 +131,7 @@ class FeastRepositorySourceConfig(ConfigModel):
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @dataclass
-class FeastRepositorySource(Source):
+class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
 

@@ -135,13 +144,14 @@ class FeastRepositorySource(Source):
 
     platform = "feast"
     source_config: FeastRepositorySourceConfig
-    report: SourceReport
+    report: StaleEntityRemovalSourceReport
     feature_store: FeatureStore
 
     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
-        self.report = SourceReport()
+        self.ctx = ctx
+        self.report = StaleEntityRemovalSourceReport()
         self.feature_store = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,

@@ -158,7 +168,8 @@ class FeastRepositorySource(Source):
 
         if ml_feature_data_type is None:
             self.report.report_warning(
-
+                "unable to map type",
+                f"unable to map type {field_type} to metadata schema to parent: {parent_name}",
             )
 
             ml_feature_data_type = MLFeatureDataType.UNKNOWN

@@ -456,6 +467,14 @@ class FeastRepositorySource(Source):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         for feature_view in self.feature_store.list_feature_views():
             for entity_name in feature_view.entities:
datahub/ingestion/source/file.py
CHANGED
@@ -351,7 +351,7 @@ class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.add_deserialize_time(deserialize_duration)
                 yield i, item
         except Exception as e:
-            self.report.report_failure(f"path-{i}", str(e))
+            self.report.report_failure(f"{file_status.path}-{i}", str(e))
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/identity/okta.py
CHANGED

@@ -14,7 +14,6 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field
 
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (

@@ -56,7 +55,7 @@ logger = logging.getLogger(__name__)
 nest_asyncio.apply()
 
 
-class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
+class OktaConfig(StatefulIngestionConfigBase):
     # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
     okta_domain: str = Field(
         description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
datahub/ingestion/source/mlflow.py
CHANGED

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
 
 from mlflow import MlflowClient
 from mlflow.entities import Run

@@ -8,7 +8,9 @@ from mlflow.store.entities import PagedList
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (

@@ -18,8 +20,20 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     GlobalTagsClass,
     MLHyperParamClass,

@@ -35,7 +49,7 @@ from datahub.metadata.schema_classes import (
 T = TypeVar("T")
 
 
-class MLflowConfig(EnvConfigMixin):
+class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
         description=(

@@ -79,7 +93,7 @@ class MLflowRegisteredModelStageInfo:
     "Extract descriptions for MLflow Registered Models and Model Versions",
 )
 @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
-class MLflowSource(Source):
+class MLflowSource(StatefulIngestionSourceBase):
     platform = "mlflow"
     registered_model_stages_info = (
         MLflowRegisteredModelStageInfo(

@@ -105,9 +119,10 @@ class MLflowSource(Source):
     )
 
     def __init__(self, ctx: PipelineContext, config: MLflowConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
-        self.report = SourceReport()
+        self.report = StaleEntityRemovalSourceReport()
         self.client = MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,

@@ -116,6 +131,14 @@ class MLflowSource(Source):
     def get_report(self) -> SourceReport:
         return self.report
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         yield from self._get_tags_workunits()
         yield from self._get_ml_model_workunits()
datahub/ingestion/source/mode.py
CHANGED
@@ -23,7 +23,9 @@ from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+)
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (

@@ -137,7 +139,10 @@ class ModeAPIConfig(ConfigModel):
     )
 
 
-class ModeConfig(
+class ModeConfig(
+    StatefulIngestionConfigBase,
+    DatasetLineageProviderConfigBase,
+):
     # See https://mode.com/developer/api-reference/authentication/
     # for authentication
     connect_uri: str = Field(
datahub/ingestion/source/neo4j/neo4j_source.py
CHANGED

@@ -7,7 +7,9 @@ import pandas as pd
 from neo4j import GraphDatabase
 from pydantic.fields import Field
 
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext

@@ -17,9 +19,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionReport,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
 from datahub.metadata.schema_classes import (
     AuditStampClass,

@@ -52,7 +64,7 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }
 
 
-class Neo4jConfig(EnvConfigMixin):
+class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")

@@ -60,7 +72,7 @@ class Neo4jConfig(EnvConfigMixin):
 
 
 @dataclass
-class Neo4jSourceReport(SourceReport):
+class Neo4jSourceReport(StatefulIngestionReport):
     obj_failures: int = 0
     obj_created: int = 0
 

@@ -68,7 +80,7 @@ class Neo4jSourceReport(SourceReport):
 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
 @support_status(SupportStatus.CERTIFIED)
-class Neo4jSource(Source):
+class Neo4jSource(StatefulIngestionSourceBase):
     NODE = "node"
     RELATIONSHIP = "relationship"
     PLATFORM = "neo4j"

@@ -76,7 +88,7 @@ class Neo4jSource(Source):
     def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
         self.ctx = ctx
         self.config = config
-        self.report = Neo4jSourceReport()
+        self.report: Neo4jSourceReport = Neo4jSourceReport()
 
     @classmethod
     def create(cls, config_dict, ctx):

@@ -282,6 +294,14 @@ class Neo4jSource(Source):
     def get_relationships(self, record: dict) -> dict:
         return record.get("relationships", None)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
datahub/ingestion/source/nifi.py
CHANGED
@@ -22,7 +22,9 @@ from requests_gssapi import HTTPSPNEGOAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers
 from datahub.ingestion.api.common import PipelineContext

@@ -33,9 +35,21 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import JobContainerSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
     BrowsePathsV2Class,

@@ -81,7 +95,7 @@ class ProcessGroupKey(ContainerKey):
     process_group_id: str
 
 
-class NifiSourceConfig(EnvConfigMixin):
+class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     site_url: str = Field(
         description="URL for Nifi, ending with /nifi/. e.g. https://mynifi.domain/nifi/"
     )

@@ -452,7 +466,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 
 @dataclass
-class NifiSourceReport(SourceReport):
+class NifiSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:

@@ -464,13 +478,14 @@ class NifiSourceReport(SourceReport):
 @config_class(NifiSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.LINEAGE_COARSE, "Supported. See docs for limitations")
-class NifiSource(Source):
+class NifiSource(StatefulIngestionSourceBase):
     config: NifiSourceConfig
     report: NifiSourceReport
 
     def __init__(self, config: NifiSourceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config = config
+        self.ctx = ctx
         self.report = NifiSourceReport()
         self.session = requests.Session()
 

@@ -1151,6 +1166,14 @@ class NifiSource(Source):
         token_response.raise_for_status()
         self.session.headers.update({"Authorization": "Bearer " + token_response.text})
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             self.authenticate()
datahub/ingestion/source/openapi_parser.py
CHANGED

@@ -12,7 +12,11 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
     SchemaMetadata,
 )
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import (
+    RecordTypeClass,
+    SchemaFieldDataTypeClass,
+    StringTypeClass,
+)
 
 logger = logging.getLogger(__name__)
 

@@ -20,9 +24,12 @@ logger = logging.getLogger(__name__)
 def flatten(d: dict, prefix: str = "") -> Generator:
     for k, v in d.items():
         if isinstance(v, dict):
+            # First yield the parent field
+            yield f"{prefix}.{k}".strip(".")
+            # Then yield all nested fields
             yield from flatten(v, f"{prefix}.{k}")
         else:
-            yield f"{prefix}
+            yield f"{prefix}.{k}".strip(".")  # Use dot instead of hyphen
 
 
 def flatten2list(d: dict) -> list:

@@ -34,7 +41,7 @@ def flatten2list(d: dict) -> list:
         "anotherone": {"third_a": {"last": 3}}
     }
 
-
+    yields:
 
     ["first.second_a",
      "first.second_b",

@@ -43,7 +50,7 @@ def flatten2list(d: dict) -> list:
     ]
     """
     fl_l = list(flatten(d))
-    return
+    return fl_l
 
 
 def request_call(
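To make the behavior change concrete, here is the patched flatten logic run standalone on the docstring's example shape (a re-execution of the code shown above, not an addition to the module): parent keys such as "first" and "anotherone.third_a" now appear alongside the leaf paths.

from typing import Generator

def flatten(d: dict, prefix: str = "") -> Generator:
    # Mirrors the patched flatten above: parents are yielded before leaves.
    for k, v in d.items():
        if isinstance(v, dict):
            yield f"{prefix}.{k}".strip(".")
            yield from flatten(v, f"{prefix}.{k}")
        else:
            yield f"{prefix}.{k}".strip(".")

d = {
    "first": {"second_a": 3, "second_b": 4},
    "anotherone": {"third_a": {"last": 3}},
}
print(list(flatten(d)))
# ['first', 'first.second_a', 'first.second_b',
#  'anotherone', 'anotherone.third_a', 'anotherone.third_a.last']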
@@ -322,6 +329,8 @@ def extract_fields(
             return ["contains_a_string"], {"contains_a_string": dict_data[0]}
         else:
             raise ValueError("unknown format")
+    elif not dict_data:  # Handle empty dict case
+        return [], {}
     if len(dict_data) > 1:
         # the elements are directly inside the dict
         return flatten2list(dict_data), dict_data

@@ -384,16 +393,39 @@ def set_metadata(
     dataset_name: str, fields: List, platform: str = "api"
 ) -> SchemaMetadata:
     canonical_schema: List[SchemaField] = []
-
-
-
-
-
-
-
-
-
-
+    seen_paths = set()
+
+    # Process all flattened fields
+    for field_path in fields:
+        parts = field_path.split(".")
+
+        # Add struct/object fields for each ancestor path
+        current_path: List[str] = []
+        for part in parts[:-1]:
+            ancestor_path = ".".join(current_path + [part])
+            if ancestor_path not in seen_paths:
+                struct_field = SchemaField(
+                    fieldPath=ancestor_path,
+                    nativeDataType="object",  # OpenAPI term for struct/record
+                    type=SchemaFieldDataTypeClass(type=RecordTypeClass()),
+                    description="",
+                    recursive=False,
+                )
+                canonical_schema.append(struct_field)
+                seen_paths.add(ancestor_path)
+            current_path.append(part)
+
+        # Add the leaf field if not already seen
+        if field_path not in seen_paths:
+            leaf_field = SchemaField(
+                fieldPath=field_path,
+                nativeDataType="str",  # Keeping `str` for backwards compatability, ideally this is the correct type
+                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
+                description="",
+                recursive=False,
+            )
+            canonical_schema.append(leaf_field)
+            seen_paths.add(field_path)
 
     schema_metadata = SchemaMetadata(
         schemaName=dataset_name,
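A worked example of the new set_metadata expansion (a standalone sketch with a hypothetical field list, using plain tuples in place of SchemaField objects): each ancestor path is emitted exactly once as an "object" field before its "str" leaf, so a consumer sees the full struct hierarchy.

fields = ["user.address.city", "user.name"]  # hypothetical flattened paths

seen_paths: set = set()
schema = []  # (fieldPath, nativeDataType) pairs stand in for SchemaField
for field_path in fields:
    parts = field_path.split(".")
    current_path = []
    for part in parts[:-1]:
        ancestor_path = ".".join(current_path + [part])
        if ancestor_path not in seen_paths:
            schema.append((ancestor_path, "object"))
            seen_paths.add(ancestor_path)
        current_path.append(part)
    if field_path not in seen_paths:
        schema.append((field_path, "str"))
        seen_paths.add(field_path)

print(schema)
# [('user', 'object'), ('user.address', 'object'),
#  ('user.address.city', 'str'), ('user.name', 'str')]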
datahub/ingestion/source/powerbi_report_server/report_server.py
CHANGED

@@ -14,7 +14,9 @@ from requests_ntlm import HttpNtlmAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (

@@ -25,7 +27,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.powerbi_report_server.constants import (
     API_ENDPOINTS,

@@ -39,6 +41,14 @@ from datahub.ingestion.source.powerbi_report_server.report_server_domain import (
     PowerBiReport,
     Report,
 )
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,

@@ -58,7 +68,7 @@ from datahub.utilities.lossy_collections import LossyList
 LOGGER = logging.getLogger(__name__)
 
 
-class PowerBiReportServerAPIConfig(EnvConfigMixin):
+class PowerBiReportServerAPIConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     username: str = pydantic.Field(description="Windows account username")
     password: str = pydantic.Field(description="Windows account password")
     workstation_name: str = pydantic.Field(

@@ -475,7 +485,7 @@ class Mapper:
 
 
 @dataclass
-class PowerBiReportServerDashboardSourceReport(SourceReport):
+class PowerBiReportServerDashboardSourceReport(StaleEntityRemovalSourceReport):
     scanned_report: int = 0
     filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 

@@ -490,7 +500,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
-class PowerBiReportServerDashboardSource(Source):
+class PowerBiReportServerDashboardSource(StatefulIngestionSourceBase):
     """
     Use this plugin to connect to [PowerBI Report Server](https://powerbi.microsoft.com/en-us/report-server/).
     It extracts the following:

@@ -520,8 +530,9 @@ class PowerBiReportServerDashboardSource(Source):
     def __init__(
         self, config: PowerBiReportServerDashboardSourceConfig, ctx: PipelineContext
     ):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
+        self.ctx = ctx
         self.report = PowerBiReportServerDashboardSourceReport()
         self.auth = PowerBiReportServerAPI(self.source_config).get_auth_credentials
         self.powerbi_client = PowerBiReportServerAPI(self.source_config)

@@ -532,6 +543,14 @@ class PowerBiReportServerDashboardSource(Source):
         config = PowerBiReportServerDashboardSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method
datahub/ingestion/source/pulsar.py
CHANGED

@@ -116,6 +116,7 @@ class PulsarSource(StatefulIngestionSourceBase):
     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.platform: str = "pulsar"
+        self.ctx = ctx
         self.config: PulsarSourceConfig = config
         self.report: PulsarSourceReport = PulsarSourceReport()
 