acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
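The headline change in this release is the Kafka Connect source refactor: the monolithic datahub/ingestion/source/kafka/kafka_connect.py (removed, 1468 lines) is replaced by a dedicated datahub/ingestion/source/kafka_connect package split into common.py, kafka_connect.py, sink_connectors.py, and source_connectors.py; the new entry-point module is reproduced in full below. As a rough sketch of what this move implies for downstream code, import paths would presumably shift as follows (the names exported by the old module are assumed here, inferred from the new file's contents rather than verified against 0.15.0rc25):

# Illustrative sketch only: import-path change implied by the file moves listed above.
# Old-module exports are assumed, not verified against 0.15.0rc25.

# Before (0.15.0rc25): everything lived in one module under the kafka package
# from datahub.ingestion.source.kafka.kafka_connect import KafkaConnectSource

# After (0.15.0.1): a dedicated kafka_connect package, split by concern
from datahub.ingestion.source.kafka_connect.kafka_connect import KafkaConnectSource
from datahub.ingestion.source.kafka_connect.common import (
    KafkaConnectSourceConfig,
    KafkaConnectSourceReport,
)
from datahub.ingestion.source.kafka_connect.sink_connectors import SnowflakeSinkConnector
from datahub.ingestion.source.kafka_connect.source_connectors import (
    ConfluentJDBCSourceConnector,
)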
datahub/ingestion/source/kafka_connect/kafka_connect.py (new file)
@@ -0,0 +1,367 @@
import logging
from typing import Iterable, List, Optional, Type

import jpype
import jpype.imports
import requests

import datahub.emitter.mce_builder as builder
import datahub.metadata.schema_classes as models
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
    SourceCapability,
    SupportStatus,
    capability,
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.kafka_connect.common import (
    CONNECTOR_CLASS,
    SINK,
    SOURCE,
    BaseConnector,
    ConnectorManifest,
    KafkaConnectLineage,
    KafkaConnectSourceConfig,
    KafkaConnectSourceReport,
    get_platform_instance,
    transform_connector_config,
)
from datahub.ingestion.source.kafka_connect.sink_connectors import (
    BIGQUERY_SINK_CONNECTOR_CLASS,
    S3_SINK_CONNECTOR_CLASS,
    SNOWFLAKE_SINK_CONNECTOR_CLASS,
    BigQuerySinkConnector,
    ConfluentS3SinkConnector,
    SnowflakeSinkConnector,
)
from datahub.ingestion.source.kafka_connect.source_connectors import (
    DEBEZIUM_SOURCE_CONNECTOR_PREFIX,
    JDBC_SOURCE_CONNECTOR_CLASS,
    MONGO_SOURCE_CONNECTOR_CLASS,
    ConfigDrivenSourceConnector,
    ConfluentJDBCSourceConnector,
    DebeziumSourceConnector,
    MongoSourceConnector,
)
from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalHandler,
)
from datahub.ingestion.source.state.stateful_ingestion_base import (
    StatefulIngestionSourceBase,
)

logger = logging.getLogger(__name__)


@platform_name("Kafka Connect")
@config_class(KafkaConnectSourceConfig)
@support_status(SupportStatus.CERTIFIED)
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
class KafkaConnectSource(StatefulIngestionSourceBase):
    config: KafkaConnectSourceConfig
    report: KafkaConnectSourceReport
    platform: str = "kafka-connect"

    def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext):
        super().__init__(config, ctx)
        self.config = config
        self.report = KafkaConnectSourceReport()
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "Content-Type": "application/json",
            }
        )

        # Test the connection
        if self.config.username is not None and self.config.password is not None:
            logger.info(
                f"Connecting to {self.config.connect_uri} with Authentication..."
            )
            self.session.auth = (self.config.username, self.config.password)

        test_response = self.session.get(f"{self.config.connect_uri}/connectors")
        test_response.raise_for_status()
        logger.info(f"Connection to {self.config.connect_uri} is ok")
        if not jpype.isJVMStarted():
            jpype.startJVM()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
        config = KafkaConnectSourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
        """Get Kafka Connect connectors manifest using REST API.
        Enrich with lineages metadata.
        """

        connector_response = self.session.get(
            f"{self.config.connect_uri}/connectors",
        )

        payload = connector_response.json()

        for connector_name in payload:
            connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
            connector_manifest = self._get_connector_manifest(
                connector_name, connector_url
            )
            if (
                connector_manifest is None
                or not self.config.connector_patterns.allowed(connector_manifest.name)
            ):
                self.report.report_dropped(connector_name)
                continue

            if self.config.provided_configs:
                transform_connector_config(
                    connector_manifest.config, self.config.provided_configs
                )
            connector_manifest.url = connector_url
            connector_manifest.topic_names = self._get_connector_topics(connector_name)
            connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or ""

            class_type: Type[BaseConnector] = BaseConnector

            # Populate Source Connector metadata
            if connector_manifest.type == SOURCE:
                connector_manifest.tasks = self._get_connector_tasks(connector_name)

                # JDBC source connector lineages
                if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS:
                    class_type = ConfluentJDBCSourceConnector
                elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX):
                    class_type = DebeziumSourceConnector
                elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS:
                    class_type = MongoSourceConnector
                elif any(
                    [
                        connector.connector_name == connector_manifest.name
                        for connector in self.config.generic_connectors
                    ]
                ):
                    class_type = ConfigDrivenSourceConnector
                else:
                    self.report.report_dropped(connector_manifest.name)
                    self.report.warning(
                        "Lineage for Source Connector not supported. "
                        "Please refer to Kafka Connect docs to use `generic_connectors` config.",
                        context=f"{connector_manifest.name} of type {connector_class_value}",
                    )
                    continue
            elif connector_manifest.type == SINK:
                if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS:
                    class_type = BigQuerySinkConnector
                elif connector_class_value == S3_SINK_CONNECTOR_CLASS:
                    class_type = ConfluentS3SinkConnector
                elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS:
                    class_type = SnowflakeSinkConnector
                else:
                    self.report.report_dropped(connector_manifest.name)
                    self.report.warning(
                        "Lineage for Sink Connector not supported.",
                        context=f"{connector_manifest.name} of type {connector_class_value}",
                    )

            connector_class = class_type(connector_manifest, self.config, self.report)
            connector_manifest.lineages = connector_class.extract_lineages()
            connector_manifest.flow_property_bag = (
                connector_class.extract_flow_property_bag()
            )

            yield connector_manifest

    def _get_connector_manifest(
        self, connector_name: str, connector_url: str
    ) -> Optional[ConnectorManifest]:
        try:
            connector_response = self.session.get(connector_url)
            connector_response.raise_for_status()
        except Exception as e:
            self.report.warning(
                "Failed to get connector details", connector_name, exc=e
            )
            return None
        manifest = connector_response.json()
        connector_manifest = ConnectorManifest(**manifest)
        return connector_manifest

    def _get_connector_tasks(self, connector_name: str) -> dict:
        try:
            response = self.session.get(
                f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
            )
            response.raise_for_status()
        except Exception as e:
            self.report.warning(
                "Error getting connector tasks", context=connector_name, exc=e
            )
            return {}

        return response.json()

    def _get_connector_topics(self, connector_name: str) -> List[str]:
        try:
            response = self.session.get(
                f"{self.config.connect_uri}/connectors/{connector_name}/topics",
            )
            response.raise_for_status()
        except Exception as e:
            self.report.warning(
                "Error getting connector topics", context=connector_name, exc=e
            )
            return []

        return response.json()[connector_name]["topics"]

    def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
        connector_name = connector.name
        connector_type = connector.type
        connector_class = connector.config.get(CONNECTOR_CLASS)
        flow_property_bag = connector.flow_property_bag
        # connector_url = connector.url  # NOTE: this will expose connector credential when used
        flow_urn = builder.make_data_flow_urn(
            self.platform,
            connector_name,
            self.config.env,
            self.config.platform_instance,
        )

        return MetadataChangeProposalWrapper(
            entityUrn=flow_urn,
            aspect=models.DataFlowInfoClass(
                name=connector_name,
                description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
                customProperties=flow_property_bag,
                # externalUrl=connector_url, # NOTE: this will expose connector credential when used
            ),
        ).as_workunit()

    def construct_job_workunits(
        self, connector: ConnectorManifest
    ) -> Iterable[MetadataWorkUnit]:
        connector_name = connector.name
        flow_urn = builder.make_data_flow_urn(
            self.platform,
            connector_name,
            self.config.env,
            self.config.platform_instance,
        )

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform
                job_property_bag = lineage.job_property_bag

                source_platform_instance = get_platform_instance(
                    self.config, connector_name, source_platform
                )
                target_platform_instance = get_platform_instance(
                    self.config, connector_name, target_platform
                )

                job_id = self.get_job_id(lineage, connector, self.config)
                job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)

                inlets = (
                    [
                        self.make_lineage_dataset_urn(
                            source_platform, source_dataset, source_platform_instance
                        )
                    ]
                    if source_dataset
                    else []
                )
                outlets = [
                    self.make_lineage_dataset_urn(
                        target_platform, target_dataset, target_platform_instance
                    )
                ]

                yield MetadataChangeProposalWrapper(
                    entityUrn=job_urn,
                    aspect=models.DataJobInfoClass(
                        name=f"{connector_name}:{job_id}",
                        type="COMMAND",
                        customProperties=job_property_bag,
                    ),
                ).as_workunit()

                yield MetadataChangeProposalWrapper(
                    entityUrn=job_urn,
                    aspect=models.DataJobInputOutputClass(
                        inputDatasets=inlets,
                        outputDatasets=outlets,
                    ),
                ).as_workunit()

    def get_job_id(
        self,
        lineage: KafkaConnectLineage,
        connector: ConnectorManifest,
        config: KafkaConnectSourceConfig,
    ) -> str:
        connector_class = connector.config.get(CONNECTOR_CLASS)

        # Note - This block is only to maintain backward compatibility of Job URN
        if (
            connector_class
            and connector.type == SOURCE
            and (
                "JdbcSourceConnector" in connector_class
                or connector_class.startswith("io.debezium.connector")
            )
            and lineage.source_dataset
            and config.connect_to_platform_map
            and config.connect_to_platform_map.get(connector.name)
            and config.connect_to_platform_map[connector.name].get(
                lineage.source_platform
            )
        ):
            return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}"

        return (
            lineage.source_dataset
            if lineage.source_dataset
            else f"unknown_source.{lineage.target_dataset}"
        )

    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
        return [
            *super().get_workunit_processors(),
            StaleEntityRemovalHandler.create(
                self, self.config, self.ctx
            ).workunit_processor,
        ]

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        for connector in self.get_connectors_manifest():
            yield self.construct_flow_workunit(connector)
            yield from self.construct_job_workunits(connector)
            self.report.report_connector_scanned(connector.name)

    def get_report(self) -> KafkaConnectSourceReport:
        return self.report

    def make_lineage_dataset_urn(
        self, platform: str, name: str, platform_instance: Optional[str]
    ) -> str:
        if self.config.convert_lineage_urns_to_lowercase:
            name = name.lower()

        return builder.make_dataset_urn_with_platform_instance(
            platform, name, platform_instance, self.config.env
        )