acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (39)
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2470 -2470
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +38 -33
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  6. datahub/configuration/source_common.py +13 -0
  7. datahub/emitter/rest_emitter.py +16 -1
  8. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
  9. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  10. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  11. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  12. datahub/ingestion/source/kafka_connect/common.py +202 -0
  13. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  14. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  15. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  16. datahub/ingestion/source/looker/looker_common.py +54 -2
  17. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  18. datahub/ingestion/source/looker/looker_source.py +12 -1
  19. datahub/ingestion/source/mlflow.py +30 -5
  20. datahub/ingestion/source/powerbi/config.py +1 -14
  21. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  22. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  23. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
  24. datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
  25. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
  26. datahub/ingestion/source/snowflake/snowflake_v2.py +24 -28
  27. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  28. datahub/ingestion/source/sql/mssql/source.py +14 -0
  29. datahub/ingestion/source/tableau/tableau.py +4 -5
  30. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  31. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  32. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  33. datahub/ingestion/source/unity/source.py +4 -0
  34. datahub/ingestion/source_report/ingestion_stage.py +1 -0
  35. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  36. datahub/sql_parsing/tool_meta_extractor.py +116 -5
  37. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  38. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/kafka_connect/common.py (new file)
@@ -0,0 +1,202 @@
+ import logging
+ from dataclasses import dataclass, field
+ from typing import Dict, Iterable, List, Optional
+
+ from pydantic.fields import Field
+
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel
+ from datahub.configuration.source_common import (
+     DatasetLineageProviderConfigBase,
+     PlatformInstanceConfigMixin,
+ )
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
+     StaleEntityRemovalSourceReport,
+     StatefulStaleMetadataRemovalConfig,
+ )
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
+     StatefulIngestionConfigBase,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ KAFKA = "kafka"
+ SOURCE = "source"
+ SINK = "sink"
+ CONNECTOR_CLASS = "connector.class"
+
+
+ class ProvidedConfig(ConfigModel):
+     provider: str
+     path_key: str
+     value: str
+
+
+ class GenericConnectorConfig(ConfigModel):
+     connector_name: str
+     source_dataset: str
+     source_platform: str
+
+
+ class KafkaConnectSourceConfig(
+     PlatformInstanceConfigMixin,
+     DatasetLineageProviderConfigBase,
+     StatefulIngestionConfigBase,
+ ):
+     # See the Connect REST Interface for details
+     # https://docs.confluent.io/platform/current/connect/references/restapi.html#
+     connect_uri: str = Field(
+         default="http://localhost:8083/", description="URI to connect to."
+     )
+     username: Optional[str] = Field(default=None, description="Kafka Connect username.")
+     password: Optional[str] = Field(default=None, description="Kafka Connect password.")
+     cluster_name: Optional[str] = Field(
+         default="connect-cluster", description="Cluster to ingest from."
+     )
+     # convert lineage dataset's urns to lowercase
+     convert_lineage_urns_to_lowercase: bool = Field(
+         default=False,
+         description="Whether to convert the urns of ingested lineage dataset to lowercase",
+     )
+     connector_patterns: AllowDenyPattern = Field(
+         default=AllowDenyPattern.allow_all(),
+         description="regex patterns for connectors to filter for ingestion.",
+     )
+     provided_configs: Optional[List[ProvidedConfig]] = Field(
+         default=None, description="Provided Configurations"
+     )
+     connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field(
+         default=None,
+         description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`',
+     )
+     platform_instance_map: Optional[Dict[str, str]] = Field(
+         default=None,
+         description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`',
+     )
+     generic_connectors: List[GenericConnectorConfig] = Field(
+         default=[],
+         description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector",
+     )
+
+     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
+
+ @dataclass
+ class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
+     connectors_scanned: int = 0
+     filtered: List[str] = field(default_factory=list)
+
+     def report_connector_scanned(self, connector: str) -> None:
+         self.connectors_scanned += 1
+
+     def report_dropped(self, connector: str) -> None:
+         self.filtered.append(connector)
+
+
+ @dataclass
+ class KafkaConnectLineage:
+     """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob"""
+
+     source_platform: str
+     target_dataset: str
+     target_platform: str
+     job_property_bag: Optional[Dict[str, str]] = None
+     source_dataset: Optional[str] = None
+
+
+ @dataclass
+ class ConnectorManifest:
+     """Each instance is potential DataFlow"""
+
+     name: str
+     type: str
+     config: Dict
+     tasks: Dict
+     url: Optional[str] = None
+     flow_property_bag: Optional[Dict[str, str]] = None
+     lineages: List[KafkaConnectLineage] = field(default_factory=list)
+     topic_names: Iterable[str] = field(default_factory=list)
+
+
+ def remove_prefix(text: str, prefix: str) -> str:
+     if text.startswith(prefix):
+         index = len(prefix)
+         return text[index:]
+     return text
+
+
+ def unquote(
+     string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None
+ ) -> str:
+     """
+     If string starts and ends with a quote, unquote it
+     """
+     trailing_quote = trailing_quote if trailing_quote else leading_quote
+     if string.startswith(leading_quote) and string.endswith(trailing_quote):
+         string = string[1:-1]
+     return string
+
+
+ def get_dataset_name(
+     database_name: Optional[str],
+     source_table: str,
+ ) -> str:
+     if database_name:
+         dataset_name = database_name + "." + source_table
+     else:
+         dataset_name = source_table
+
+     return dataset_name
+
+
+ def get_platform_instance(
+     config: KafkaConnectSourceConfig, connector_name: str, platform: str
+ ) -> Optional[str]:
+     instance_name = None
+     if (
+         config.connect_to_platform_map
+         and config.connect_to_platform_map.get(connector_name)
+         and config.connect_to_platform_map[connector_name].get(platform)
+     ):
+         instance_name = config.connect_to_platform_map[connector_name][platform]
+         if config.platform_instance_map and config.platform_instance_map.get(platform):
+             logger.warning(
+                 f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map."
+                 "Will prefer connector specific platform instance from connect_to_platform_map."
+             )
+     elif config.platform_instance_map and config.platform_instance_map.get(platform):
+         instance_name = config.platform_instance_map[platform]
+     logger.info(
+         f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}"
+     )
+     return instance_name
+
+
+ def transform_connector_config(
+     connector_config: Dict, provided_configs: List[ProvidedConfig]
+ ) -> None:
+     """This method will update provided configs in connector config values, if any"""
+     lookupsByProvider = {}
+     for pconfig in provided_configs:
+         lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value
+     for k, v in connector_config.items():
+         for key, value in lookupsByProvider.items():
+             if key in v:
+                 connector_config[k] = connector_config[k].replace(key, value)
+
+
+ # TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy.
+ def has_three_level_hierarchy(platform: str) -> bool:
+     return platform in ["postgres", "trino", "redshift", "snowflake"]
+
+
+ @dataclass
+ class BaseConnector:
+     connector_manifest: ConnectorManifest
+     config: KafkaConnectSourceConfig
+     report: KafkaConnectSourceReport
+
+     def extract_lineages(self) -> List[KafkaConnectLineage]:
+         return []
+
+     def extract_flow_property_bag(self) -> Optional[Dict[str, str]]:
+         return None
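
The common.py module above carries the shared configuration model plus the helpers for platform-instance resolution and config-provider substitution. As a rough illustration only (not part of the released diff), the sketch below shows the intended behavior of get_platform_instance and transform_connector_config; the connector, instance, and host names are hypothetical, and it assumes a build where datahub.ingestion.source.kafka_connect.common is importable.

from datahub.ingestion.source.kafka_connect.common import (
    KafkaConnectSourceConfig,
    ProvidedConfig,
    get_platform_instance,
    transform_connector_config,
)

# Hypothetical connector/instance names, used only for illustration.
config = KafkaConnectSourceConfig(
    connect_uri="http://localhost:8083/",
    connect_to_platform_map={
        "postgres-connector-finance-db": {"postgres": "core_finance_instance"}
    },
    platform_instance_map={"postgres": "warehouse"},
)

# The connector-specific mapping wins over platform_instance_map
# (a warning is logged when both are set for the same platform).
assert (
    get_platform_instance(config, "postgres-connector-finance-db", "postgres")
    == "core_finance_instance"
)
# Connectors without an entry fall back to platform_instance_map.
assert get_platform_instance(config, "other-connector", "postgres") == "warehouse"

# ${provider:path_key} placeholders are replaced in place from provided_configs.
connector_config = {"connection.url": "jdbc:postgresql://${env:DB_HOST}:5432/finance"}
transform_connector_config(
    connector_config,
    [ProvidedConfig(provider="env", path_key="DB_HOST", value="db.internal")],
)
assert connector_config["connection.url"] == "jdbc:postgresql://db.internal:5432/finance"
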
datahub/ingestion/source/kafka_connect/kafka_connect.py (new file)
@@ -0,0 +1,367 @@
+ import logging
+ from typing import Iterable, List, Optional, Type
+
+ import jpype
+ import jpype.imports
+ import requests
+
+ import datahub.emitter.mce_builder as builder
+ import datahub.metadata.schema_classes as models
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.api.decorators import (
+     SourceCapability,
+     SupportStatus,
+     capability,
+     config_class,
+     platform_name,
+     support_status,
+ )
+ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.kafka_connect.common import (
+     CONNECTOR_CLASS,
+     SINK,
+     SOURCE,
+     BaseConnector,
+     ConnectorManifest,
+     KafkaConnectLineage,
+     KafkaConnectSourceConfig,
+     KafkaConnectSourceReport,
+     get_platform_instance,
+     transform_connector_config,
+ )
+ from datahub.ingestion.source.kafka_connect.sink_connectors import (
+     BIGQUERY_SINK_CONNECTOR_CLASS,
+     S3_SINK_CONNECTOR_CLASS,
+     SNOWFLAKE_SINK_CONNECTOR_CLASS,
+     BigQuerySinkConnector,
+     ConfluentS3SinkConnector,
+     SnowflakeSinkConnector,
+ )
+ from datahub.ingestion.source.kafka_connect.source_connectors import (
+     DEBEZIUM_SOURCE_CONNECTOR_PREFIX,
+     JDBC_SOURCE_CONNECTOR_CLASS,
+     MONGO_SOURCE_CONNECTOR_CLASS,
+     ConfigDrivenSourceConnector,
+     ConfluentJDBCSourceConnector,
+     DebeziumSourceConnector,
+     MongoSourceConnector,
+ )
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
+     StaleEntityRemovalHandler,
+ )
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
+     StatefulIngestionSourceBase,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @platform_name("Kafka Connect")
+ @config_class(KafkaConnectSourceConfig)
+ @support_status(SupportStatus.CERTIFIED)
+ @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+ @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+ @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+ class KafkaConnectSource(StatefulIngestionSourceBase):
+     config: KafkaConnectSourceConfig
+     report: KafkaConnectSourceReport
+     platform: str = "kafka-connect"
+
+     def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext):
+         super().__init__(config, ctx)
+         self.config = config
+         self.report = KafkaConnectSourceReport()
+         self.session = requests.Session()
+         self.session.headers.update(
+             {
+                 "Accept": "application/json",
+                 "Content-Type": "application/json",
+             }
+         )
+
+         # Test the connection
+         if self.config.username is not None and self.config.password is not None:
+             logger.info(
+                 f"Connecting to {self.config.connect_uri} with Authentication..."
+             )
+             self.session.auth = (self.config.username, self.config.password)
+
+         test_response = self.session.get(f"{self.config.connect_uri}/connectors")
+         test_response.raise_for_status()
+         logger.info(f"Connection to {self.config.connect_uri} is ok")
+         if not jpype.isJVMStarted():
+             jpype.startJVM()
+
+     @classmethod
+     def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
+         config = KafkaConnectSourceConfig.parse_obj(config_dict)
+         return cls(config, ctx)
+
+     def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
+         """Get Kafka Connect connectors manifest using REST API.
+         Enrich with lineages metadata.
+         """
+
+         connector_response = self.session.get(
+             f"{self.config.connect_uri}/connectors",
+         )
+
+         payload = connector_response.json()
+
+         for connector_name in payload:
+             connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+             connector_manifest = self._get_connector_manifest(
+                 connector_name, connector_url
+             )
+             if (
+                 connector_manifest is None
+                 or not self.config.connector_patterns.allowed(connector_manifest.name)
+             ):
+                 self.report.report_dropped(connector_name)
+                 continue
+
+             if self.config.provided_configs:
+                 transform_connector_config(
+                     connector_manifest.config, self.config.provided_configs
+                 )
+             connector_manifest.url = connector_url
+             connector_manifest.topic_names = self._get_connector_topics(connector_name)
+             connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or ""
+
+             class_type: Type[BaseConnector] = BaseConnector
+
+             # Populate Source Connector metadata
+             if connector_manifest.type == SOURCE:
+                 connector_manifest.tasks = self._get_connector_tasks(connector_name)
+
+                 # JDBC source connector lineages
+                 if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS:
+                     class_type = ConfluentJDBCSourceConnector
+                 elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX):
+                     class_type = DebeziumSourceConnector
+                 elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS:
+                     class_type = MongoSourceConnector
+                 elif any(
+                     [
+                         connector.connector_name == connector_manifest.name
+                         for connector in self.config.generic_connectors
+                     ]
+                 ):
+                     class_type = ConfigDrivenSourceConnector
+                 else:
+                     self.report.report_dropped(connector_manifest.name)
+                     self.report.warning(
+                         "Lineage for Source Connector not supported. "
+                         "Please refer to Kafka Connect docs to use `generic_connectors` config.",
+                         context=f"{connector_manifest.name} of type {connector_class_value}",
+                     )
+                     continue
+             elif connector_manifest.type == SINK:
+                 if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS:
+                     class_type = BigQuerySinkConnector
+                 elif connector_class_value == S3_SINK_CONNECTOR_CLASS:
+                     class_type = ConfluentS3SinkConnector
+                 elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS:
+                     class_type = SnowflakeSinkConnector
+                 else:
+                     self.report.report_dropped(connector_manifest.name)
+                     self.report.warning(
+                         "Lineage for Sink Connector not supported.",
+                         context=f"{connector_manifest.name} of type {connector_class_value}",
+                     )
+
+             connector_class = class_type(connector_manifest, self.config, self.report)
+             connector_manifest.lineages = connector_class.extract_lineages()
+             connector_manifest.flow_property_bag = (
+                 connector_class.extract_flow_property_bag()
+             )
+
+             yield connector_manifest
+
+     def _get_connector_manifest(
+         self, connector_name: str, connector_url: str
+     ) -> Optional[ConnectorManifest]:
+         try:
+             connector_response = self.session.get(connector_url)
+             connector_response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Failed to get connector details", connector_name, exc=e
+             )
+             return None
+         manifest = connector_response.json()
+         connector_manifest = ConnectorManifest(**manifest)
+         return connector_manifest
+
+     def _get_connector_tasks(self, connector_name: str) -> dict:
+         try:
+             response = self.session.get(
+                 f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+             )
+             response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Error getting connector tasks", context=connector_name, exc=e
+             )
+             return {}
+
+         return response.json()
+
+     def _get_connector_topics(self, connector_name: str) -> List[str]:
+         try:
+             response = self.session.get(
+                 f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+             )
+             response.raise_for_status()
+         except Exception as e:
+             self.report.warning(
+                 "Error getting connector topics", context=connector_name, exc=e
+             )
+             return []
+
+         return response.json()[connector_name]["topics"]
+
+     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
+         connector_name = connector.name
+         connector_type = connector.type
+         connector_class = connector.config.get(CONNECTOR_CLASS)
+         flow_property_bag = connector.flow_property_bag
+         # connector_url = connector.url  # NOTE: this will expose connector credential when used
+         flow_urn = builder.make_data_flow_urn(
+             self.platform,
+             connector_name,
+             self.config.env,
+             self.config.platform_instance,
+         )
+
+         return MetadataChangeProposalWrapper(
+             entityUrn=flow_urn,
+             aspect=models.DataFlowInfoClass(
+                 name=connector_name,
+                 description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
+                 customProperties=flow_property_bag,
+                 # externalUrl=connector_url,  # NOTE: this will expose connector credential when used
+             ),
+         ).as_workunit()
+
+     def construct_job_workunits(
+         self, connector: ConnectorManifest
+     ) -> Iterable[MetadataWorkUnit]:
+         connector_name = connector.name
+         flow_urn = builder.make_data_flow_urn(
+             self.platform,
+             connector_name,
+             self.config.env,
+             self.config.platform_instance,
+         )
+
+         lineages = connector.lineages
+         if lineages:
+             for lineage in lineages:
+                 source_dataset = lineage.source_dataset
+                 source_platform = lineage.source_platform
+                 target_dataset = lineage.target_dataset
+                 target_platform = lineage.target_platform
+                 job_property_bag = lineage.job_property_bag
+
+                 source_platform_instance = get_platform_instance(
+                     self.config, connector_name, source_platform
+                 )
+                 target_platform_instance = get_platform_instance(
+                     self.config, connector_name, target_platform
+                 )
+
+                 job_id = self.get_job_id(lineage, connector, self.config)
+                 job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)
+
+                 inlets = (
+                     [
+                         self.make_lineage_dataset_urn(
+                             source_platform, source_dataset, source_platform_instance
+                         )
+                     ]
+                     if source_dataset
+                     else []
+                 )
+                 outlets = [
+                     self.make_lineage_dataset_urn(
+                         target_platform, target_dataset, target_platform_instance
+                     )
+                 ]
+
+                 yield MetadataChangeProposalWrapper(
+                     entityUrn=job_urn,
+                     aspect=models.DataJobInfoClass(
+                         name=f"{connector_name}:{job_id}",
+                         type="COMMAND",
+                         customProperties=job_property_bag,
+                     ),
+                 ).as_workunit()
+
+                 yield MetadataChangeProposalWrapper(
+                     entityUrn=job_urn,
+                     aspect=models.DataJobInputOutputClass(
+                         inputDatasets=inlets,
+                         outputDatasets=outlets,
+                     ),
+                 ).as_workunit()
+
+     def get_job_id(
+         self,
+         lineage: KafkaConnectLineage,
+         connector: ConnectorManifest,
+         config: KafkaConnectSourceConfig,
+     ) -> str:
+         connector_class = connector.config.get(CONNECTOR_CLASS)
+
+         # Note - This block is only to maintain backward compatibility of Job URN
+         if (
+             connector_class
+             and connector.type == SOURCE
+             and (
+                 "JdbcSourceConnector" in connector_class
+                 or connector_class.startswith("io.debezium.connector")
+             )
+             and lineage.source_dataset
+             and config.connect_to_platform_map
+             and config.connect_to_platform_map.get(connector.name)
+             and config.connect_to_platform_map[connector.name].get(
+                 lineage.source_platform
+             )
+         ):
+             return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}"
+
+         return (
+             lineage.source_dataset
+             if lineage.source_dataset
+             else f"unknown_source.{lineage.target_dataset}"
+         )
+
+     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+         return [
+             *super().get_workunit_processors(),
+             StaleEntityRemovalHandler.create(
+                 self, self.config, self.ctx
+             ).workunit_processor,
+         ]
+
+     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         for connector in self.get_connectors_manifest():
+             yield self.construct_flow_workunit(connector)
+             yield from self.construct_job_workunits(connector)
+             self.report.report_connector_scanned(connector.name)
+
+     def get_report(self) -> KafkaConnectSourceReport:
+         return self.report
+
+     def make_lineage_dataset_urn(
+         self, platform: str, name: str, platform_instance: Optional[str]
+     ) -> str:
+         if self.config.convert_lineage_urns_to_lowercase:
+             name = name.lower()
+
+         return builder.make_dataset_urn_with_platform_instance(
+             platform, name, platform_instance, self.config.env
+         )
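
KafkaConnectSource is registered through the standard source registry (the one-line entry_points.txt change listed above likely repoints the `kafka-connect` source type to this new module), so the refactored code is exercised through the usual recipe/pipeline path. Below is a minimal sketch, not part of the diff, assuming the source keeps its `kafka-connect` type and that the Connect REST endpoint and DataHub server URLs shown (both placeholders) are reachable.

from datahub.ingestion.run.pipeline import Pipeline

# Placeholder endpoints: substitute your own Connect REST URI and DataHub GMS URL.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "kafka-connect",
            "config": {
                "connect_uri": "http://localhost:8083/",
                "connector_patterns": {"allow": ["postgres-connector-.*"]},
                "connect_to_platform_map": {
                    "postgres-connector-finance-db": {"postgres": "core_finance_instance"}
                },
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()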