acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2470 -2470
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +38 -33
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/rest_emitter.py +16 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +54 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +12 -1
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
- datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +24 -28
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +14 -0
- datahub/ingestion/source/tableau/tableau.py +4 -5
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source_report/ingestion_stage.py +1 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/sql_parsing/tool_meta_extractor.py +116 -5
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/kafka_connect/source_connectors.py (new file)

@@ -0,0 +1,570 @@
+import logging
+import re
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from sqlalchemy.engine.url import make_url
+
+from datahub.ingestion.source.kafka_connect.common import (
+    CONNECTOR_CLASS,
+    KAFKA,
+    BaseConnector,
+    ConnectorManifest,
+    KafkaConnectLineage,
+    get_dataset_name,
+    has_three_level_hierarchy,
+    remove_prefix,
+    unquote,
+)
+from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
+    get_platform_from_sqlalchemy_uri,
+)
+
+
+@dataclass
+class ConfluentJDBCSourceConnector(BaseConnector):
+    REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter"
+    KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER]
+    # https://kafka.apache.org/documentation/#connect_included_transformation
+    KAFKA_NONTOPICROUTING_TRANSFORMS = [
+        "InsertField",
+        "InsertField$Key",
+        "InsertField$Value",
+        "ReplaceField",
+        "ReplaceField$Key",
+        "ReplaceField$Value",
+        "MaskField",
+        "MaskField$Key",
+        "MaskField$Value",
+        "ValueToKey",
+        "ValueToKey$Key",
+        "ValueToKey$Value",
+        "HoistField",
+        "HoistField$Key",
+        "HoistField$Value",
+        "ExtractField",
+        "ExtractField$Key",
+        "ExtractField$Value",
+        "SetSchemaMetadata",
+        "SetSchemaMetadata$Key",
+        "SetSchemaMetadata$Value",
+        "Flatten",
+        "Flatten$Key",
+        "Flatten$Value",
+        "Cast",
+        "Cast$Key",
+        "Cast$Value",
+        "HeadersFrom",
+        "HeadersFrom$Key",
+        "HeadersFrom$Value",
+        "TimestampConverter",
+        "Filter",
+        "InsertHeader",
+        "DropHeaders",
+    ]
+    # https://docs.confluent.io/platform/current/connect/transforms/overview.html
+    CONFLUENT_NONTOPICROUTING_TRANSFORMS = [
+        "Drop",
+        "Drop$Key",
+        "Drop$Value",
+        "Filter",
+        "Filter$Key",
+        "Filter$Value",
+        "TombstoneHandler",
+    ]
+    KNOWN_NONTOPICROUTING_TRANSFORMS = (
+        KAFKA_NONTOPICROUTING_TRANSFORMS
+        + [
+            f"org.apache.kafka.connect.transforms.{t}"
+            for t in KAFKA_NONTOPICROUTING_TRANSFORMS
+        ]
+        + CONFLUENT_NONTOPICROUTING_TRANSFORMS
+        + [
+            f"io.confluent.connect.transforms.{t}"
+            for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS
+        ]
+    )
+
+    @dataclass
+    class JdbcParser:
+        db_connection_url: str
+        source_platform: str
+        database_name: str
+        topic_prefix: str
+        query: str
+        transforms: list
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> JdbcParser:
+        url = remove_prefix(
+            str(connector_manifest.config.get("connection.url")), "jdbc:"
+        )
+        url_instance = make_url(url)
+        source_platform = get_platform_from_sqlalchemy_uri(str(url_instance))
+        database_name = url_instance.database
+        assert database_name
+        db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"
+
+        topic_prefix = self.connector_manifest.config.get("topic.prefix", None)
+
+        query = self.connector_manifest.config.get("query", None)
+
+        transform_names = (
+            self.connector_manifest.config.get("transforms", "").split(",")
+            if self.connector_manifest.config.get("transforms")
+            else []
+        )
+
+        transforms = []
+        for name in transform_names:
+            transform = {"name": name}
+            transforms.append(transform)
+            for key in self.connector_manifest.config.keys():
+                if key.startswith(f"transforms.{name}."):
+                    transform[
+                        key.replace(f"transforms.{name}.", "")
+                    ] = self.connector_manifest.config[key]
+
+        return self.JdbcParser(
+            db_connection_url,
+            source_platform,
+            database_name,
+            topic_prefix,
+            query,
+            transforms,
+        )
+
+    def default_get_lineages(
+        self,
+        topic_prefix: str,
+        database_name: str,
+        source_platform: str,
+        topic_names: Optional[Iterable[str]] = None,
+        include_source_dataset: bool = True,
+    ) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = []
+        if not topic_names:
+            topic_names = self.connector_manifest.topic_names
+        table_name_tuples: List[Tuple] = self.get_table_names()
+        for topic in topic_names:
+            # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM))
+            source_table: str = (
+                remove_prefix(topic, topic_prefix) if topic_prefix else topic
+            )
+            # include schema name for three-level hierarchies
+            if has_three_level_hierarchy(source_platform):
+                table_name_tuple: Tuple = next(
+                    iter([t for t in table_name_tuples if t and t[-1] == source_table]),
+                    (),
+                )
+                if len(table_name_tuple) > 1:
+                    source_table = f"{table_name_tuple[-2]}.{source_table}"
+                else:
+                    include_source_dataset = False
+                    self.report.warning(
+                        "Could not find schema for table"
+                        f"{self.connector_manifest.name} : {source_table}",
+                    )
+            dataset_name: str = get_dataset_name(database_name, source_table)
+            lineage = KafkaConnectLineage(
+                source_dataset=dataset_name if include_source_dataset else None,
+                source_platform=source_platform,
+                target_dataset=topic,
+                target_platform=KAFKA,
+            )
+            lineages.append(lineage)
+        return lineages
+
+    def get_table_names(self) -> List[Tuple]:
+        sep: str = "."
+        leading_quote_char: str = '"'
+        trailing_quote_char: str = leading_quote_char
+
+        table_ids: List[str] = []
+        if self.connector_manifest.tasks:
+            table_ids = (
+                ",".join(
+                    [
+                        task["config"].get("tables")
+                        for task in self.connector_manifest.tasks
+                    ]
+                )
+            ).split(",")
+            quote_method = self.connector_manifest.config.get(
+                "quote.sql.identifiers", "always"
+            )
+            if (
+                quote_method == "always"
+                and table_ids
+                and table_ids[0]
+                and table_ids[-1]
+            ):
+                leading_quote_char = table_ids[0][0]
+                trailing_quote_char = table_ids[-1][-1]
+                # This will only work for single character quotes
+        elif self.connector_manifest.config.get("table.whitelist"):
+            table_ids = self.connector_manifest.config.get("table.whitelist").split(",")  # type: ignore
+
+        # List of Tuple containing (schema, table)
+        tables: List[Tuple] = [
+            (
+                (
+                    unquote(
+                        table_id.split(sep)[-2], leading_quote_char, trailing_quote_char
+                    )
+                    if len(table_id.split(sep)) > 1
+                    else ""
+                ),
+                unquote(
+                    table_id.split(sep)[-1], leading_quote_char, trailing_quote_char
+                ),
+            )
+            for table_id in table_ids
+        ]
+        return tables
+
+    def extract_flow_property_bag(self) -> Dict[str, str]:
+        flow_property_bag = {
+            k: v
+            for k, v in self.connector_manifest.config.items()
+            if k not in ["connection.password", "connection.user"]
+        }
+
+        # Mask/Remove properties that may reveal credentials
+        flow_property_bag["connection.url"] = self.get_parser(
+            self.connector_manifest
+        ).db_connection_url
+
+        return flow_property_bag
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+        parser = self.get_parser(self.connector_manifest)
+        source_platform = parser.source_platform
+        database_name = parser.database_name
+        query = parser.query
+        topic_prefix = parser.topic_prefix
+        transforms = parser.transforms
+
+        logging.debug(
+            f"Extracting source platform: {source_platform} and database name: {database_name} from connection url "
+        )
+
+        if not self.connector_manifest.topic_names:
+            return lineages
+
+        if query:
+            # Lineage source_table can be extracted by parsing query
+            for topic in self.connector_manifest.topic_names:
+                # default method - as per earlier implementation
+                dataset_name: str = get_dataset_name(database_name, topic)
+
+                lineage = KafkaConnectLineage(
+                    source_dataset=None,
+                    source_platform=source_platform,
+                    target_dataset=topic,
+                    target_platform=KAFKA,
+                )
+                lineages.append(lineage)
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
+                self.connector_manifest.name,
+            )
+            return lineages
+
+        SINGLE_TRANSFORM = len(transforms) == 1
+        NO_TRANSFORM = len(transforms) == 0
+        UNKNOWN_TRANSFORM = any(
+            [
+                transform["type"]
+                not in self.KNOWN_TOPICROUTING_TRANSFORMS
+                + self.KNOWN_NONTOPICROUTING_TRANSFORMS
+                for transform in transforms
+            ]
+        )
+        ALL_TRANSFORMS_NON_TOPICROUTING = all(
+            [
+                transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS
+                for transform in transforms
+            ]
+        )
+
+        if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING:
+            return self.default_get_lineages(
+                database_name=database_name,
+                source_platform=source_platform,
+                topic_prefix=topic_prefix,
+            )
+
+        if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER:
+            tables = self.get_table_names()
+            topic_names = list(self.connector_manifest.topic_names)
+
+            from java.util.regex import Pattern
+
+            for table in tables:
+                source_table: str = table[-1]
+                topic = topic_prefix + source_table if topic_prefix else source_table
+
+                transform_regex = Pattern.compile(transforms[0]["regex"])
+                transform_replacement = transforms[0]["replacement"]
+
+                matcher = transform_regex.matcher(topic)
+                if matcher.matches():
+                    topic = str(matcher.replaceFirst(transform_replacement))
+
+                # Additional check to confirm that the topic present
+                # in connector topics
+
+                if topic in self.connector_manifest.topic_names:
+                    # include schema name for three-level hierarchies
+                    if has_three_level_hierarchy(source_platform) and len(table) > 1:
+                        source_table = f"{table[-2]}.{table[-1]}"
+
+                    dataset_name = get_dataset_name(database_name, source_table)
+
+                    lineage = KafkaConnectLineage(
+                        source_dataset=dataset_name,
+                        source_platform=source_platform,
+                        target_dataset=topic,
+                        target_platform=KAFKA,
+                    )
+                    topic_names.remove(topic)
+                    lineages.append(lineage)
+
+            if topic_names:
+                lineages.extend(
+                    self.default_get_lineages(
+                        database_name=database_name,
+                        source_platform=source_platform,
+                        topic_prefix=topic_prefix,
+                        topic_names=topic_names,
+                        include_source_dataset=False,
+                    )
+                )
+                self.report.warning(
+                    "Could not find input dataset for connector topics",
+                    f"{self.connector_manifest.name} : {topic_names}",
+                )
+            return lineages
+        else:
+            include_source_dataset = True
+            if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
+                self.report.warning(
+                    "Could not find input dataset, connector has unknown transform",
+                    f"{self.connector_manifest.name} : {transforms[0]['type']}",
+                )
+                include_source_dataset = False
+            if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
+                self.report.warning(
+                    "Could not find input dataset, connector has one or more unknown transforms",
+                    self.connector_manifest.name,
+                )
+                include_source_dataset = False
+            lineages = self.default_get_lineages(
+                database_name=database_name,
+                source_platform=source_platform,
+                topic_prefix=topic_prefix,
+                include_source_dataset=include_source_dataset,
+            )
+            return lineages
+
+
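Illustration (not part of the diff): the sketch below approximates how ConfluentJDBCSourceConnector groups `transforms.<name>.*` config keys into per-transform dicts and how a single RegexRouter transform predicts the renamed topic. The connector itself uses `java.util.regex.Pattern`; Python's `re` and the config values here are assumed stand-ins for illustration only.

# Illustrative sketch only: java.util.regex semantics are approximated with
# Python's `re`, and the connector config below is invented.
import re
from typing import Dict, List

config: Dict[str, str] = {
    "transforms": "rename",
    "transforms.rename.type": "org.apache.kafka.connect.transforms.RegexRouter",
    "transforms.rename.regex": "my-prefix-(.*)",
    "transforms.rename.replacement": "$1",
    "topic.prefix": "my-prefix-",
}

# Group per-transform settings, mirroring the get_parser() loop above.
transforms: List[Dict[str, str]] = []
for name in config.get("transforms", "").split(","):
    transform = {"name": name}
    transforms.append(transform)
    for key, value in config.items():
        if key.startswith(f"transforms.{name}."):
            transform[key.replace(f"transforms.{name}.", "")] = value

source_table = "orders"
topic = config["topic.prefix"] + source_table  # "my-prefix-orders"

t = transforms[0]
if t["type"].endswith("RegexRouter") and re.fullmatch(t["regex"], topic):
    # java.util.regex uses $1 for backreferences; translate for Python's re.
    topic = re.sub(t["regex"], t["replacement"].replace("$", "\\"), topic, count=1)

print(topic)  # -> "orders"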
+@dataclass
+class MongoSourceConnector(BaseConnector):
+    # https://www.mongodb.com/docs/kafka-connector/current/source-connector/
+
+    @dataclass
+    class MongoSourceParser:
+        db_connection_url: Optional[str]
+        source_platform: str
+        database_name: Optional[str]
+        topic_prefix: Optional[str]
+        transforms: List[str]
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> MongoSourceParser:
+        parser = self.MongoSourceParser(
+            db_connection_url=connector_manifest.config.get("connection.uri"),
+            source_platform="mongodb",
+            database_name=connector_manifest.config.get("database"),
+            topic_prefix=connector_manifest.config.get("topic_prefix"),
+            transforms=(
+                connector_manifest.config["transforms"].split(",")
+                if "transforms" in connector_manifest.config
+                else []
+            ),
+        )
+
+        return parser
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+        parser = self.get_parser(self.connector_manifest)
+        source_platform = parser.source_platform
+        topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
+
+        if not self.connector_manifest.topic_names:
+            return lineages
+
+        for topic in self.connector_manifest.topic_names:
+            found = re.search(re.compile(topic_naming_pattern), topic)
+
+            if found:
+                table_name = get_dataset_name(found.group(1), found.group(2))
+
+                lineage = KafkaConnectLineage(
+                    source_dataset=table_name,
+                    source_platform=source_platform,
+                    target_dataset=topic,
+                    target_platform=KAFKA,
+                )
+                lineages.append(lineage)
+        return lineages
+
+
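Illustration (not part of the diff): MongoSourceConnector derives lineage purely from topic names matching `mongodb\.(\w+)\.(\w+)`. The topic names below are invented to show the mapping.

# Illustrative sketch only: the topics are made up.
import re

topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"

for topic in ["mongodb.shop.orders", "unrelated-topic"]:
    found = re.search(topic_naming_pattern, topic)
    if found:
        database, collection = found.group(1), found.group(2)
        print(f"{topic} -> source dataset {database}.{collection} (platform: mongodb)")
    else:
        print(f"{topic} -> no lineage emitted")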
+@dataclass
+class DebeziumSourceConnector(BaseConnector):
+    @dataclass
+    class DebeziumParser:
+        source_platform: str
+        server_name: Optional[str]
+        database_name: Optional[str]
+
+    def get_server_name(self, connector_manifest: ConnectorManifest) -> str:
+        if "topic.prefix" in connector_manifest.config:
+            return connector_manifest.config["topic.prefix"]
+        else:
+            return connector_manifest.config.get("database.server.name", "")
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> DebeziumParser:
+        connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
+
+        if connector_class == "io.debezium.connector.mysql.MySqlConnector":
+            parser = self.DebeziumParser(
+                source_platform="mysql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "MySqlConnector":
+            parser = self.DebeziumParser(
+                source_platform="mysql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector":
+            parser = self.DebeziumParser(
+                source_platform="mongodb",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "io.debezium.connector.postgresql.PostgresConnector":
+            parser = self.DebeziumParser(
+                source_platform="postgres",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.oracle.OracleConnector":
+            parser = self.DebeziumParser(
+                source_platform="oracle",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector":
+            database_name = connector_manifest.config.get(
+                "database.names"
+            ) or connector_manifest.config.get("database.dbname")
+
+            if "," in str(database_name):
+                raise Exception(
+                    f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}"
+                )
+
+            parser = self.DebeziumParser(
+                source_platform="mssql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=database_name,
+            )
+        elif connector_class == "io.debezium.connector.db2.Db2Connector":
+            parser = self.DebeziumParser(
+                source_platform="db2",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.vitess.VitessConnector":
+            parser = self.DebeziumParser(
+                source_platform="vitess",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("vitess.keyspace"),
+            )
+        else:
+            raise ValueError(f"Connector class '{connector_class}' is unknown.")
+
+        return parser
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+
+        try:
+            parser = self.get_parser(self.connector_manifest)
+            source_platform = parser.source_platform
+            server_name = parser.server_name
+            database_name = parser.database_name
+            topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
+
+            if not self.connector_manifest.topic_names:
+                return lineages
+
+            for topic in self.connector_manifest.topic_names:
+                found = re.search(re.compile(topic_naming_pattern), topic)
+
+                if found:
+                    table_name = get_dataset_name(database_name, found.group(2))
+
+                    lineage = KafkaConnectLineage(
+                        source_dataset=table_name,
+                        source_platform=source_platform,
+                        target_dataset=topic,
+                        target_platform=KAFKA,
+                    )
+                    lineages.append(lineage)
+            return lineages
+        except Exception as e:
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
+            )
+
+        return []
+
+
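Illustration (not part of the diff): DebeziumSourceConnector builds its topic regex from the configured server name and expects `<server>.<schema>.<table>` topics. The server, database, and topic values below are invented, and the dataset naming assumes get_dataset_name joins its arguments with a dot.

# Illustrative sketch only: configuration values and the dataset-name format
# are assumptions for the example.
import re

server_name = "pg-prod"      # from "topic.prefix" or "database.server.name"
database_name = "shop"       # e.g. from "database.dbname" for Postgres
topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"

topic = "pg-prod.public.orders"
found = re.search(topic_naming_pattern, topic)
if found:
    # assuming get_dataset_name() joins database and "schema.table" with a "."
    source_dataset = f"{database_name}.{found.group(2)}"
    print(f"{topic} -> {source_dataset} (platform: postgres)")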
+@dataclass
+class ConfigDrivenSourceConnector(BaseConnector):
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages = []
+        for connector in self.config.generic_connectors:
+            if connector.connector_name == self.connector_manifest.name:
+                target_connector = connector
+                break
+        for topic in self.connector_manifest.topic_names:
+            lineage = KafkaConnectLineage(
+                source_dataset=target_connector.source_dataset,
+                source_platform=target_connector.source_platform,
+                target_dataset=topic,
+                target_platform=KAFKA,
+            )
+            lineages.append(lineage)
+        return lineages
+
+
+JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector"
+DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector"
+MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector"
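Illustration (not part of the diff): one possible way the module-level constants above could be used to route a connector's `connector.class` value to a handler. The actual dispatch lives in kafka_connect/kafka_connect.py, which is not shown in this hunk, so treat this purely as an assumed usage sketch.

# Illustrative sketch only: the dispatch function and its return values are
# hypothetical, not DataHub's real routing code.
JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector"
DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector"
MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector"


def classify_source_connector(connector_class: str) -> str:
    if connector_class == JDBC_SOURCE_CONNECTOR_CLASS:
        return "ConfluentJDBCSourceConnector"
    if connector_class.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX):
        return "DebeziumSourceConnector"
    if connector_class == MONGO_SOURCE_CONNECTOR_CLASS:
        return "MongoSourceConnector"
    return "ConfigDrivenSourceConnector"  # fall back to user-provided generic config


print(classify_source_connector("io.debezium.connector.postgresql.PostgresConnector"))
# -> DebeziumSourceConnector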
datahub/ingestion/source/looker/looker_common.py

@@ -31,6 +31,10 @@ from looker_sdk.sdk.api40.models import (
 from pydantic.class_validators import validator

 import datahub.emitter.mce_builder as builder
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp
 from datahub.ingestion.api.report import Report

@@ -106,7 +110,7 @@ from datahub.utilities.lossy_collections import LossyList, LossySet
 from datahub.utilities.url_util import remove_port_from_url

 CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
-
+LOOKER = "looker"
 logger = logging.getLogger(__name__)



@@ -1411,6 +1415,7 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport):

     resolved_user_ids: int = 0
     email_ids_missing: int = 0  # resolved users with missing email addresses
+    looker_user_count: int = 0

     _looker_api: Optional[LookerAPI] = None
     query_latency: Dict[str, datetime.timedelta] = dataclasses_field(

@@ -1614,9 +1619,21 @@ class LookerDashboard:
 class LookerUserRegistry:
     looker_api_wrapper: LookerAPI
     fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"])
+    _user_cache: Dict[str, LookerUser] = {}

-    def __init__(self, looker_api: LookerAPI):
+    def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport):
         self.looker_api_wrapper = looker_api
+        self.report = report
+        self._initialize_user_cache()
+
+    def _initialize_user_cache(self) -> None:
+        raw_users: Sequence[User] = self.looker_api_wrapper.all_users(
+            user_fields=self.fields
+        )
+
+        for raw_user in raw_users:
+            looker_user = LookerUser.create_looker_user(raw_user)
+            self._user_cache[str(looker_user.id)] = looker_user

     def get_by_id(self, id_: str) -> Optional[LookerUser]:
         if not id_:

@@ -1624,6 +1641,9 @@ class LookerUserRegistry:

         logger.debug(f"Will get user {id_}")

+        if str(id_) in self._user_cache:
+            return self._user_cache.get(str(id_))
+
         raw_user: Optional[User] = self.looker_api_wrapper.get_user(
             str(id_), user_fields=self.fields
         )

@@ -1632,3 +1652,35 @@ class LookerUserRegistry:

         looker_user = LookerUser.create_looker_user(raw_user)
         return looker_user
+
+    def to_platform_resource(
+        self, platform_instance: Optional[str]
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        try:
+            platform_resource_key = PlatformResourceKey(
+                platform=LOOKER,
+                resource_type="USER_ID_MAPPING",
+                platform_instance=platform_instance,
+                primary_key="",
+            )
+
+            # Extract user email mappings
+            user_email_cache = {
+                user_id: user.email
+                for user_id, user in self._user_cache.items()
+                if user.email
+            }
+
+            platform_resource = PlatformResource.create(
+                key=platform_resource_key,
+                value=user_email_cache,
+            )
+
+            self.report.looker_user_count = len(user_email_cache)
+            yield from platform_resource.to_mcps()
+
+        except Exception as exc:
+            self.report.warning(
+                message="Failed to generate platform resource for looker id mappings",
+                exc=exc,
+            )
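Illustration (not part of the diff): the looker_common.py hunks above switch LookerUserRegistry to a bulk-fetched user cache with a per-user API fallback, and export the id-to-email map as a platform resource. The sketch below shows the same cache-then-fallback pattern with stand-in classes rather than the real Looker SDK or DataHub types.

# Illustrative sketch only: FakeLookerAPI and UserRegistry are stand-ins, not
# DataHub or Looker SDK classes.
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class User:
    id: str
    email: Optional[str]


class FakeLookerAPI:
    """Stand-in for the Looker API wrapper; returns canned users."""

    def all_users(self) -> List[User]:
        return [User("1", "a@example.com"), User("2", None)]

    def get_user(self, id_: str) -> Optional[User]:
        return None  # pretend the per-user endpoint found nothing


class UserRegistry:
    def __init__(self, api: FakeLookerAPI) -> None:
        self.api = api
        # Bulk-fetch once up front, as _initialize_user_cache() does above.
        self._user_cache: Dict[str, User] = {u.id: u for u in api.all_users()}

    def get_by_id(self, id_: str) -> Optional[User]:
        # Serve from the cache when possible; only hit the API on a miss.
        if id_ in self._user_cache:
            return self._user_cache[id_]
        return self.api.get_user(id_)

    def email_mapping(self) -> Dict[str, str]:
        # Analogous to the id -> email map exported via to_platform_resource().
        return {uid: u.email for uid, u in self._user_cache.items() if u.email}


registry = UserRegistry(FakeLookerAPI())
print(registry.get_by_id("1"))    # cache hit
print(registry.email_mapping())   # {'1': 'a@example.com'}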
datahub/ingestion/source/looker/looker_lib_wrapper.py

@@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel):
     get_look_calls: int = 0
     search_looks_calls: int = 0
     search_dashboards_calls: int = 0
+    all_user_calls: int = 0


 class LookerAPI:

@@ -135,7 +136,7 @@ class LookerAPI:

         return permissions

-    @lru_cache(maxsize=
+    @lru_cache(maxsize=5000)
     def get_user(self, id_: str, user_fields: str) -> Optional[User]:
         self.client_stats.user_calls += 1
         try:

@@ -154,6 +155,17 @@ class LookerAPI:
             # User not found
             return None

+    def all_users(self, user_fields: str) -> Sequence[User]:
+        self.client_stats.all_user_calls += 1
+        try:
+            return self.client.all_users(
+                fields=cast(str, user_fields),
+                transport_options=self.transport_options,
+            )
+        except SDKError as e:
+            logger.warning(f"Failure was {e}")
+            return []
+
     def execute_query(self, write_query: WriteQuery) -> List[Dict]:
         logger.debug(f"Executing query {write_query}")
         self.client_stats.query_calls += 1
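Illustration (not part of the diff): the looker_lib_wrapper.py hunks above bound get_user's memoization at `lru_cache(maxsize=5000)` and add a bulk all_users call. The generic sketch below shows how a bounded functools.lru_cache collapses repeated lookups for the same id; it is not the DataHub code.

# Illustrative sketch only: a generic lru_cache example, not LookerAPI itself.
from functools import lru_cache

calls = 0


@lru_cache(maxsize=5000)
def get_user(id_: str) -> str:
    global calls
    calls += 1                 # counts actual "API" calls, i.e. cache misses
    return f"user-{id_}"       # stand-in for the real API round trip


for _ in range(3):
    get_user("42")

print(calls)                   # 1 -- only the first lookup hit the "API"
print(get_user.cache_info())   # hits=2, misses=1, maxsize=5000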