acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of acryl-datahub has been flagged as a potentially problematic release.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
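The most visible refactor in the list above is the kafka-connect source: the monolithic datahub/ingestion/source/kafka/kafka_connect.py (removed, -1468 lines) is replaced by a dedicated datahub/ingestion/source/kafka_connect/ package (common.py, kafka_connect.py, sink_connectors.py, source_connectors.py), which also lines up with the one-line entry_points.txt change. A minimal sketch of what that move looks like on the import side; the class name KafkaConnectSource and its new module are assumptions, not confirmed by this diff:

# Old location, removed in this release (assumed):
#   from datahub.ingestion.source.kafka.kafka_connect import KafkaConnectSource
# New package layout introduced here (assuming the class keeps its name after the split):
from datahub.ingestion.source.kafka_connect.kafka_connect import KafkaConnectSource

print(KafkaConnectSource.__module__)

The full contents of the new source_connectors.py module appear in the hunk below.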
datahub/ingestion/source/kafka_connect/source_connectors.py (new file)
@@ -0,0 +1,570 @@
+import logging
+import re
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from sqlalchemy.engine.url import make_url
+
+from datahub.ingestion.source.kafka_connect.common import (
+    CONNECTOR_CLASS,
+    KAFKA,
+    BaseConnector,
+    ConnectorManifest,
+    KafkaConnectLineage,
+    get_dataset_name,
+    has_three_level_hierarchy,
+    remove_prefix,
+    unquote,
+)
+from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
+    get_platform_from_sqlalchemy_uri,
+)
+
+
+@dataclass
+class ConfluentJDBCSourceConnector(BaseConnector):
+    REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter"
+    KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER]
+    # https://kafka.apache.org/documentation/#connect_included_transformation
+    KAFKA_NONTOPICROUTING_TRANSFORMS = [
+        "InsertField",
+        "InsertField$Key",
+        "InsertField$Value",
+        "ReplaceField",
+        "ReplaceField$Key",
+        "ReplaceField$Value",
+        "MaskField",
+        "MaskField$Key",
+        "MaskField$Value",
+        "ValueToKey",
+        "ValueToKey$Key",
+        "ValueToKey$Value",
+        "HoistField",
+        "HoistField$Key",
+        "HoistField$Value",
+        "ExtractField",
+        "ExtractField$Key",
+        "ExtractField$Value",
+        "SetSchemaMetadata",
+        "SetSchemaMetadata$Key",
+        "SetSchemaMetadata$Value",
+        "Flatten",
+        "Flatten$Key",
+        "Flatten$Value",
+        "Cast",
+        "Cast$Key",
+        "Cast$Value",
+        "HeadersFrom",
+        "HeadersFrom$Key",
+        "HeadersFrom$Value",
+        "TimestampConverter",
+        "Filter",
+        "InsertHeader",
+        "DropHeaders",
+    ]
+    # https://docs.confluent.io/platform/current/connect/transforms/overview.html
+    CONFLUENT_NONTOPICROUTING_TRANSFORMS = [
+        "Drop",
+        "Drop$Key",
+        "Drop$Value",
+        "Filter",
+        "Filter$Key",
+        "Filter$Value",
+        "TombstoneHandler",
+    ]
+    KNOWN_NONTOPICROUTING_TRANSFORMS = (
+        KAFKA_NONTOPICROUTING_TRANSFORMS
+        + [
+            f"org.apache.kafka.connect.transforms.{t}"
+            for t in KAFKA_NONTOPICROUTING_TRANSFORMS
+        ]
+        + CONFLUENT_NONTOPICROUTING_TRANSFORMS
+        + [
+            f"io.confluent.connect.transforms.{t}"
+            for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS
+        ]
+    )
+
+    @dataclass
+    class JdbcParser:
+        db_connection_url: str
+        source_platform: str
+        database_name: str
+        topic_prefix: str
+        query: str
+        transforms: list
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> JdbcParser:
+        url = remove_prefix(
+            str(connector_manifest.config.get("connection.url")), "jdbc:"
+        )
+        url_instance = make_url(url)
+        source_platform = get_platform_from_sqlalchemy_uri(str(url_instance))
+        database_name = url_instance.database
+        assert database_name
+        db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"
+
+        topic_prefix = self.connector_manifest.config.get("topic.prefix", None)
+
+        query = self.connector_manifest.config.get("query", None)
+
+        transform_names = (
+            self.connector_manifest.config.get("transforms", "").split(",")
+            if self.connector_manifest.config.get("transforms")
+            else []
+        )
+
+        transforms = []
+        for name in transform_names:
+            transform = {"name": name}
+            transforms.append(transform)
+            for key in self.connector_manifest.config.keys():
+                if key.startswith(f"transforms.{name}."):
+                    transform[
+                        key.replace(f"transforms.{name}.", "")
+                    ] = self.connector_manifest.config[key]
+
+        return self.JdbcParser(
+            db_connection_url,
+            source_platform,
+            database_name,
+            topic_prefix,
+            query,
+            transforms,
+        )
+
+    def default_get_lineages(
+        self,
+        topic_prefix: str,
+        database_name: str,
+        source_platform: str,
+        topic_names: Optional[Iterable[str]] = None,
+        include_source_dataset: bool = True,
+    ) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = []
+        if not topic_names:
+            topic_names = self.connector_manifest.topic_names
+        table_name_tuples: List[Tuple] = self.get_table_names()
+        for topic in topic_names:
+            # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM))
+            source_table: str = (
+                remove_prefix(topic, topic_prefix) if topic_prefix else topic
+            )
+            # include schema name for three-level hierarchies
+            if has_three_level_hierarchy(source_platform):
+                table_name_tuple: Tuple = next(
+                    iter([t for t in table_name_tuples if t and t[-1] == source_table]),
+                    (),
+                )
+                if len(table_name_tuple) > 1:
+                    source_table = f"{table_name_tuple[-2]}.{source_table}"
+                else:
+                    include_source_dataset = False
+                    self.report.warning(
+                        "Could not find schema for table"
+                        f"{self.connector_manifest.name} : {source_table}",
+                    )
+            dataset_name: str = get_dataset_name(database_name, source_table)
+            lineage = KafkaConnectLineage(
+                source_dataset=dataset_name if include_source_dataset else None,
+                source_platform=source_platform,
+                target_dataset=topic,
+                target_platform=KAFKA,
+            )
+            lineages.append(lineage)
+        return lineages
+
+    def get_table_names(self) -> List[Tuple]:
+        sep: str = "."
+        leading_quote_char: str = '"'
+        trailing_quote_char: str = leading_quote_char
+
+        table_ids: List[str] = []
+        if self.connector_manifest.tasks:
+            table_ids = (
+                ",".join(
+                    [
+                        task["config"].get("tables")
+                        for task in self.connector_manifest.tasks
+                    ]
+                )
+            ).split(",")
+            quote_method = self.connector_manifest.config.get(
+                "quote.sql.identifiers", "always"
+            )
+            if (
+                quote_method == "always"
+                and table_ids
+                and table_ids[0]
+                and table_ids[-1]
+            ):
+                leading_quote_char = table_ids[0][0]
+                trailing_quote_char = table_ids[-1][-1]
+                # This will only work for single character quotes
+        elif self.connector_manifest.config.get("table.whitelist"):
+            table_ids = self.connector_manifest.config.get("table.whitelist").split(",")  # type: ignore
+
+        # List of Tuple containing (schema, table)
+        tables: List[Tuple] = [
+            (
+                (
+                    unquote(
+                        table_id.split(sep)[-2], leading_quote_char, trailing_quote_char
+                    )
+                    if len(table_id.split(sep)) > 1
+                    else ""
+                ),
+                unquote(
+                    table_id.split(sep)[-1], leading_quote_char, trailing_quote_char
+                ),
+            )
+            for table_id in table_ids
+        ]
+        return tables
+
+    def extract_flow_property_bag(self) -> Dict[str, str]:
+        flow_property_bag = {
+            k: v
+            for k, v in self.connector_manifest.config.items()
+            if k not in ["connection.password", "connection.user"]
+        }
+
+        # Mask/Remove properties that may reveal credentials
+        flow_property_bag["connection.url"] = self.get_parser(
+            self.connector_manifest
+        ).db_connection_url
+
+        return flow_property_bag
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+        parser = self.get_parser(self.connector_manifest)
+        source_platform = parser.source_platform
+        database_name = parser.database_name
+        query = parser.query
+        topic_prefix = parser.topic_prefix
+        transforms = parser.transforms
+
+        logging.debug(
+            f"Extracting source platform: {source_platform} and database name: {database_name} from connection url "
+        )
+
+        if not self.connector_manifest.topic_names:
+            return lineages
+
+        if query:
+            # Lineage source_table can be extracted by parsing query
+            for topic in self.connector_manifest.topic_names:
+                # default method - as per earlier implementation
+                dataset_name: str = get_dataset_name(database_name, topic)
+
+                lineage = KafkaConnectLineage(
+                    source_dataset=None,
+                    source_platform=source_platform,
+                    target_dataset=topic,
+                    target_platform=KAFKA,
+                )
+                lineages.append(lineage)
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
+                self.connector_manifest.name,
+            )
+            return lineages
+
+        SINGLE_TRANSFORM = len(transforms) == 1
+        NO_TRANSFORM = len(transforms) == 0
+        UNKNOWN_TRANSFORM = any(
+            [
+                transform["type"]
+                not in self.KNOWN_TOPICROUTING_TRANSFORMS
+                + self.KNOWN_NONTOPICROUTING_TRANSFORMS
+                for transform in transforms
+            ]
+        )
+        ALL_TRANSFORMS_NON_TOPICROUTING = all(
+            [
+                transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS
+                for transform in transforms
+            ]
+        )
+
+        if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING:
+            return self.default_get_lineages(
+                database_name=database_name,
+                source_platform=source_platform,
+                topic_prefix=topic_prefix,
+            )
+
+        if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER:
+            tables = self.get_table_names()
+            topic_names = list(self.connector_manifest.topic_names)
+
+            from java.util.regex import Pattern
+
+            for table in tables:
+                source_table: str = table[-1]
+                topic = topic_prefix + source_table if topic_prefix else source_table
+
+                transform_regex = Pattern.compile(transforms[0]["regex"])
+                transform_replacement = transforms[0]["replacement"]
+
+                matcher = transform_regex.matcher(topic)
+                if matcher.matches():
+                    topic = str(matcher.replaceFirst(transform_replacement))
+
+                # Additional check to confirm that the topic present
+                # in connector topics
+
+                if topic in self.connector_manifest.topic_names:
+                    # include schema name for three-level hierarchies
+                    if has_three_level_hierarchy(source_platform) and len(table) > 1:
+                        source_table = f"{table[-2]}.{table[-1]}"
+
+                    dataset_name = get_dataset_name(database_name, source_table)
+
+                    lineage = KafkaConnectLineage(
+                        source_dataset=dataset_name,
+                        source_platform=source_platform,
+                        target_dataset=topic,
+                        target_platform=KAFKA,
+                    )
+                    topic_names.remove(topic)
+                    lineages.append(lineage)
+
+            if topic_names:
+                lineages.extend(
+                    self.default_get_lineages(
+                        database_name=database_name,
+                        source_platform=source_platform,
+                        topic_prefix=topic_prefix,
+                        topic_names=topic_names,
+                        include_source_dataset=False,
+                    )
+                )
+                self.report.warning(
+                    "Could not find input dataset for connector topics",
+                    f"{self.connector_manifest.name} : {topic_names}",
+                )
+            return lineages
+        else:
+            include_source_dataset = True
+            if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
+                self.report.warning(
+                    "Could not find input dataset, connector has unknown transform",
+                    f"{self.connector_manifest.name} : {transforms[0]['type']}",
+                )
+                include_source_dataset = False
+            if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
+                self.report.warning(
+                    "Could not find input dataset, connector has one or more unknown transforms",
+                    self.connector_manifest.name,
+                )
+                include_source_dataset = False
+            lineages = self.default_get_lineages(
+                database_name=database_name,
+                source_platform=source_platform,
+                topic_prefix=topic_prefix,
+                include_source_dataset=include_source_dataset,
+            )
+            return lineages
+
+
+@dataclass
+class MongoSourceConnector(BaseConnector):
+    # https://www.mongodb.com/docs/kafka-connector/current/source-connector/
+
+    @dataclass
+    class MongoSourceParser:
+        db_connection_url: Optional[str]
+        source_platform: str
+        database_name: Optional[str]
+        topic_prefix: Optional[str]
+        transforms: List[str]
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> MongoSourceParser:
+        parser = self.MongoSourceParser(
+            db_connection_url=connector_manifest.config.get("connection.uri"),
+            source_platform="mongodb",
+            database_name=connector_manifest.config.get("database"),
+            topic_prefix=connector_manifest.config.get("topic_prefix"),
+            transforms=(
+                connector_manifest.config["transforms"].split(",")
+                if "transforms" in connector_manifest.config
+                else []
+            ),
+        )
+
+        return parser
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+        parser = self.get_parser(self.connector_manifest)
+        source_platform = parser.source_platform
+        topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
+
+        if not self.connector_manifest.topic_names:
+            return lineages
+
+        for topic in self.connector_manifest.topic_names:
+            found = re.search(re.compile(topic_naming_pattern), topic)
+
+            if found:
+                table_name = get_dataset_name(found.group(1), found.group(2))
+
+                lineage = KafkaConnectLineage(
+                    source_dataset=table_name,
+                    source_platform=source_platform,
+                    target_dataset=topic,
+                    target_platform=KAFKA,
+                )
+                lineages.append(lineage)
+        return lineages
+
+
+@dataclass
+class DebeziumSourceConnector(BaseConnector):
+    @dataclass
+    class DebeziumParser:
+        source_platform: str
+        server_name: Optional[str]
+        database_name: Optional[str]
+
+    def get_server_name(self, connector_manifest: ConnectorManifest) -> str:
+        if "topic.prefix" in connector_manifest.config:
+            return connector_manifest.config["topic.prefix"]
+        else:
+            return connector_manifest.config.get("database.server.name", "")
+
+    def get_parser(
+        self,
+        connector_manifest: ConnectorManifest,
+    ) -> DebeziumParser:
+        connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
+
+        if connector_class == "io.debezium.connector.mysql.MySqlConnector":
+            parser = self.DebeziumParser(
+                source_platform="mysql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "MySqlConnector":
+            parser = self.DebeziumParser(
+                source_platform="mysql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector":
+            parser = self.DebeziumParser(
+                source_platform="mongodb",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=None,
+            )
+        elif connector_class == "io.debezium.connector.postgresql.PostgresConnector":
+            parser = self.DebeziumParser(
+                source_platform="postgres",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.oracle.OracleConnector":
+            parser = self.DebeziumParser(
+                source_platform="oracle",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector":
+            database_name = connector_manifest.config.get(
+                "database.names"
+            ) or connector_manifest.config.get("database.dbname")
+
+            if "," in str(database_name):
+                raise Exception(
+                    f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}"
+                )
+
+            parser = self.DebeziumParser(
+                source_platform="mssql",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=database_name,
+            )
+        elif connector_class == "io.debezium.connector.db2.Db2Connector":
+            parser = self.DebeziumParser(
+                source_platform="db2",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("database.dbname"),
+            )
+        elif connector_class == "io.debezium.connector.vitess.VitessConnector":
+            parser = self.DebeziumParser(
+                source_platform="vitess",
+                server_name=self.get_server_name(connector_manifest),
+                database_name=connector_manifest.config.get("vitess.keyspace"),
+            )
+        else:
+            raise ValueError(f"Connector class '{connector_class}' is unknown.")
+
+        return parser
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages: List[KafkaConnectLineage] = list()
+
+        try:
+            parser = self.get_parser(self.connector_manifest)
+            source_platform = parser.source_platform
+            server_name = parser.server_name
+            database_name = parser.database_name
+            topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
+
+            if not self.connector_manifest.topic_names:
+                return lineages
+
+            for topic in self.connector_manifest.topic_names:
+                found = re.search(re.compile(topic_naming_pattern), topic)
+
+                if found:
+                    table_name = get_dataset_name(database_name, found.group(2))
+
+                    lineage = KafkaConnectLineage(
+                        source_dataset=table_name,
+                        source_platform=source_platform,
+                        target_dataset=topic,
+                        target_platform=KAFKA,
+                    )
+                    lineages.append(lineage)
+            return lineages
+        except Exception as e:
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
+            )
+
+        return []
+
+
+@dataclass
+class ConfigDrivenSourceConnector(BaseConnector):
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        lineages = []
+        for connector in self.config.generic_connectors:
+            if connector.connector_name == self.connector_manifest.name:
+                target_connector = connector
+                break
+        for topic in self.connector_manifest.topic_names:
+            lineage = KafkaConnectLineage(
+                source_dataset=target_connector.source_dataset,
+                source_platform=target_connector.source_platform,
+                target_dataset=topic,
+                target_platform=KAFKA,
+            )
+            lineages.append(lineage)
+        return lineages
+
+
+JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector"
+DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector"
+MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector"
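The RegexRouter branch in ConfluentJDBCSourceConnector.extract_lineages above applies the connector's topic-routing regex through java.util.regex (available via a Java bridge) so that renaming matches Kafka Connect's own semantics. A minimal sketch of the same renaming step using only Python's re module; the connector config values here (regex, replacement, prefix, table name) are made up for illustration and are not taken from this diff:

import re

transform = {
    "type": "org.apache.kafka.connect.transforms.RegexRouter",
    "regex": "(.*)cdc\\.(.*)",
    "replacement": "$1$2",
}
topic_prefix = "cdc."
source_table = "orders"
topic = topic_prefix + source_table  # "cdc.orders", the pre-transform topic name

# java.util.regex replacements use $1-style group references; rewrite them for re.
replacement = re.sub(r"\$(\d+)", r"\\\1", transform["replacement"])
match = re.fullmatch(transform["regex"], topic)
if match:
    # matches() followed by replaceFirst() on a full match amounts to expanding the groups once.
    topic = match.expand(replacement)

print(topic)  # "orders", the renamed topic that is matched back against the connector's topic list

Only topics that survive this renaming and still appear in connector_manifest.topic_names are paired with a (schema, table) tuple and emitted as lineage; the rest fall through to default_get_lineages with include_source_dataset=False.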
datahub/ingestion/source/looker/looker_common.py
@@ -31,6 +31,10 @@ from looker_sdk.sdk.api40.models import (
 from pydantic.class_validators import validator
 
 import datahub.emitter.mce_builder as builder
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp
 from datahub.ingestion.api.report import Report
@@ -106,7 +110,7 @@ from datahub.utilities.lossy_collections import LossyList, LossySet
 from datahub.utilities.url_util import remove_port_from_url
 
 CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
-
+LOOKER = "looker"
 logger = logging.getLogger(__name__)
 
 
@@ -1404,6 +1408,15 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport):
     dashboards_with_activity: LossySet[str] = dataclasses_field(
         default_factory=LossySet
     )
+
+    # Entities that don't seem to exist, so we don't emit usage aspects for them despite having usage data
+    dashboards_skipped_for_usage: LossySet[str] = dataclasses_field(
+        default_factory=LossySet
+    )
+    charts_skipped_for_usage: LossySet[str] = dataclasses_field(
+        default_factory=LossySet
+    )
+
     stage_latency: List[StageLatency] = dataclasses_field(default_factory=list)
     _looker_explore_registry: Optional[LookerExploreRegistry] = None
     total_explores: int = 0
@@ -1411,6 +1424,7 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport):
 
     resolved_user_ids: int = 0
     email_ids_missing: int = 0  # resolved users with missing email addresses
+    looker_user_count: int = 0
 
     _looker_api: Optional[LookerAPI] = None
     query_latency: Dict[str, datetime.timedelta] = dataclasses_field(
@@ -1614,9 +1628,21 @@ class LookerDashboard:
 class LookerUserRegistry:
     looker_api_wrapper: LookerAPI
     fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"])
+    _user_cache: Dict[str, LookerUser] = {}
 
-    def __init__(self, looker_api: LookerAPI):
+    def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport):
         self.looker_api_wrapper = looker_api
+        self.report = report
+        self._initialize_user_cache()
+
+    def _initialize_user_cache(self) -> None:
+        raw_users: Sequence[User] = self.looker_api_wrapper.all_users(
+            user_fields=self.fields
+        )
+
+        for raw_user in raw_users:
+            looker_user = LookerUser.create_looker_user(raw_user)
+            self._user_cache[str(looker_user.id)] = looker_user
 
     def get_by_id(self, id_: str) -> Optional[LookerUser]:
         if not id_:
@@ -1624,6 +1650,9 @@ class LookerUserRegistry:
 
         logger.debug(f"Will get user {id_}")
 
+        if str(id_) in self._user_cache:
+            return self._user_cache.get(str(id_))
+
         raw_user: Optional[User] = self.looker_api_wrapper.get_user(
             str(id_), user_fields=self.fields
         )
@@ -1632,3 +1661,35 @@ class LookerUserRegistry:
 
         looker_user = LookerUser.create_looker_user(raw_user)
         return looker_user
+
+    def to_platform_resource(
+        self, platform_instance: Optional[str]
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        try:
+            platform_resource_key = PlatformResourceKey(
+                platform=LOOKER,
+                resource_type="USER_ID_MAPPING",
+                platform_instance=platform_instance,
+                primary_key="",
+            )
+
+            # Extract user email mappings
+            user_email_cache = {
+                user_id: user.email
+                for user_id, user in self._user_cache.items()
+                if user.email
+            }
+
+            platform_resource = PlatformResource.create(
+                key=platform_resource_key,
+                value=user_email_cache,
+            )
+
+            self.report.looker_user_count = len(user_email_cache)
+            yield from platform_resource.to_mcps()
+
+        except Exception as exc:
+            self.report.warning(
+                message="Failed to generate platform resource for looker id mappings",
+                exc=exc,
+            )
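The new LookerUserRegistry code above eagerly caches every Looker user and then publishes the id-to-email mapping as a DataHub platform resource. A minimal standalone sketch of that last step, using the same PlatformResource and PlatformResourceKey calls the diff introduces; the ids, emails, and platform_instance below are made up, and nothing here contacts a Looker or DataHub server (it only builds the MCPs locally):

from datahub.api.entities.platformresource.platform_resource import (
    PlatformResource,
    PlatformResourceKey,
)

key = PlatformResourceKey(
    platform="looker",
    resource_type="USER_ID_MAPPING",
    platform_instance=None,  # hypothetical: no platform instance configured
    primary_key="",
)
user_email_cache = {"42": "ada@example.com", "43": "grace@example.com"}  # fake data

platform_resource = PlatformResource.create(key=key, value=user_email_cache)
mcps = list(platform_resource.to_mcps())  # MetadataChangeProposalWrappers, ready to emit
print(len(mcps), mcps[0].entityUrn)

The intent, as suggested by the USER_ID_MAPPING resource type and the new looker_user_count report field, appears to be making the Looker id-to-email mapping queryable elsewhere without another Looker API round trip.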