acryl-datahub 0.15.0rc25-py3-none-any.whl → 0.15.0.1rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2470 -2470
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +38 -33
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/rest_emitter.py +16 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +54 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +12 -1
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
- datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +24 -28
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +14 -0
- datahub/ingestion/source/tableau/tableau.py +4 -5
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source_report/ingestion_stage.py +1 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/sql_parsing/tool_meta_extractor.py +116 -5
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0
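The headline change in this list is the Kafka Connect refactor: the monolithic datahub/ingestion/source/kafka/kafka_connect.py module is removed and replaced by the new datahub/ingestion/source/kafka_connect/ package (common.py, kafka_connect.py, sink_connectors.py, source_connectors.py). For orientation, below is a minimal sketch of driving the kafka-connect source through DataHub's programmatic Pipeline API; the URI, credentials-free config values, and console sink are placeholders, and the config field names are taken from the (removed) KafkaConnectSourceConfig summarized further down, so treat the exact recipe as an assumption rather than documentation of this release.

```python
from datahub.ingestion.run.pipeline import Pipeline

# Minimal sketch: field names under source.config follow the removed
# KafkaConnectSourceConfig; the concrete values are illustrative placeholders.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "kafka-connect",
            "config": {
                "connect_uri": "http://localhost:8083/",
                "cluster_name": "connect-cluster",
                "connector_patterns": {"allow": [".*"]},
                "convert_lineage_urns_to_lowercase": False,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()
```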
datahub/ingestion/source/kafka/kafka_connect.py (deleted)

@@ -1,1468 +0,0 @@

The entire 1,468-line module is removed; its functionality moves into the new datahub/ingestion/source/kafka_connect/ package listed above. The deleted module contained:

- Configuration models: ProvidedConfig, GenericConnectorConfig, and KafkaConnectSourceConfig (connect_uri, username, password, cluster_name, convert_lineage_urns_to_lowercase, connector_patterns, provided_configs, connect_to_platform_map, platform_instance_map, generic_connectors, stateful_ingestion).
- Reporting and lineage dataclasses: KafkaConnectSourceReport, KafkaConnectLineage, and ConnectorManifest.
- Helper functions: remove_prefix, unquote, get_dataset_name, get_platform_instance, transform_connector_config, and has_three_level_hierarchy.
- Source-connector lineage extractors: ConfluentJDBCSourceConnector (including RegexRouter topic-routing handling and table/schema resolution for three-level platforms), MongoSourceConnector, and DebeziumSourceConnector (MySQL, MongoDB, Postgres, Oracle, SQL Server, Db2, and Vitess).
- Sink-connector lineage extractors: BigQuerySinkConnector, SnowflakeSinkConnector, and ConfluentS3SinkConnector, each of which strips credential properties (keyfiles, private keys, AWS secrets) from the emitted flow properties.
- The KafkaConnectSource stateful-ingestion source (platform "kafka-connect"), which reads connector manifests, tasks, and topics from the Connect REST API, applies provided-config substitution and connector allow/deny filtering, builds DataFlow and DataJob work units with input/output lineage, and supports stale-entity removal.