acryl-datahub: acryl_datahub-0.15.0rc24-py3-none-any.whl → acryl_datahub-0.15.0.1rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (32)
  1. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/METADATA +2456 -2456
  2. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/RECORD +31 -27
  3. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/entry_points.txt +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  6. datahub/configuration/source_common.py +13 -0
  7. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  8. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  9. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  10. datahub/ingestion/source/kafka_connect/common.py +202 -0
  11. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  12. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  13. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  14. datahub/ingestion/source/looker/looker_common.py +54 -2
  15. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  16. datahub/ingestion/source/looker/looker_source.py +12 -1
  17. datahub/ingestion/source/mlflow.py +30 -5
  18. datahub/ingestion/source/powerbi/config.py +1 -14
  19. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  20. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  21. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -0
  22. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  23. datahub/ingestion/source/sql/mssql/source.py +14 -0
  24. datahub/ingestion/source/tableau/tableau.py +4 -5
  25. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  26. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  27. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  28. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  29. datahub/sql_parsing/tool_meta_extractor.py +116 -5
  30. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  31. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/WHEEL +0 -0
  32. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1rc1.dist-info}/top_level.txt +0 -0
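The listing above shows the kafka-connect source being refactored out of the single datahub/ingestion/source/kafka/kafka_connect.py module (item 30, removed) into a dedicated datahub/ingestion/source/kafka_connect package (items 9–13). A minimal sketch of the import paths this layout implies, using only names visible in this diff; whether the new package re-exports these symbols elsewhere is not shown here:

# Hypothetical usage sketch based on the file paths listed above and the
# import block of sink_connectors.py shown below; not part of the package diff.
from datahub.ingestion.source.kafka_connect.common import (
    BaseConnector,
    ConnectorManifest,
    KafkaConnectLineage,
)
from datahub.ingestion.source.kafka_connect.sink_connectors import (
    BigQuerySinkConnector,
    ConfluentS3SinkConnector,
    SnowflakeSinkConnector,
)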
datahub/ingestion/source/kafka_connect/sink_connectors.py (new file)
@@ -0,0 +1,341 @@
+ import re
+ from dataclasses import dataclass
+ from typing import Dict, Iterable, List, Optional, Tuple
+
+ from datahub.ingestion.source.kafka_connect.common import (
+     KAFKA,
+     BaseConnector,
+     ConnectorManifest,
+     KafkaConnectLineage,
+ )
+
+
+ @dataclass
+ class ConfluentS3SinkConnector(BaseConnector):
+     @dataclass
+     class S3SinkParser:
+         target_platform: str
+         bucket: str
+         topics_dir: str
+         topics: Iterable[str]
+
+     def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
+         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
+         bucket = connector_manifest.config.get("s3.bucket.name")
+         if not bucket:
+             raise ValueError(
+                 "Could not find 's3.bucket.name' in connector configuration"
+             )
+
+         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
+         topics_dir = connector_manifest.config.get("topics.dir", "topics")
+
+         return self.S3SinkParser(
+             target_platform="s3",
+             bucket=bucket,
+             topics_dir=topics_dir,
+             topics=connector_manifest.topic_names,
+         )
+
+     def extract_flow_property_bag(self) -> Dict[str, str]:
+         # Mask/Remove properties that may reveal credentials
+         flow_property_bag = {
+             k: v
+             for k, v in self.connector_manifest.config.items()
+             if k
+             not in [
+                 "aws.access.key.id",
+                 "aws.secret.access.key",
+                 "s3.sse.customer.key",
+                 "s3.proxy.password",
+             ]
+         }
+         return flow_property_bag
+
+     def extract_lineages(self) -> List[KafkaConnectLineage]:
+         try:
+             parser = self._get_parser(self.connector_manifest)
+
+             lineages: List[KafkaConnectLineage] = list()
+             for topic in parser.topics:
+                 target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}"
+
+                 lineages.append(
+                     KafkaConnectLineage(
+                         source_dataset=topic,
+                         source_platform="kafka",
+                         target_dataset=target_dataset,
+                         target_platform=parser.target_platform,
+                     )
+                 )
+             return lineages
+         except Exception as e:
+             self.report.warning(
+                 "Error resolving lineage for connector",
+                 self.connector_manifest.name,
+                 exc=e,
+             )
+
+         return []
+
+
+ @dataclass
+ class SnowflakeSinkConnector(BaseConnector):
+     @dataclass
+     class SnowflakeParser:
+         database_name: str
+         schema_name: str
+         topics_to_tables: Dict[str, str]
+
+     def get_table_name_from_topic_name(self, topic_name: str) -> str:
+         """
+         This function converts the topic name to a valid Snowflake table name using some rules.
+         Refer below link for more info
+         https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
+         """
+         table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
+         if re.match("^[^a-zA-Z_].*", table_name):
+             table_name = "_" + table_name
+         # Connector may append original topic's hash code as suffix for conflict resolution
+         # if generated table names for 2 topics are similar. This corner case is not handled here.
+         # Note that Snowflake recommends to choose topic names that follow the rules for
+         # Snowflake identifier names so this case is not recommended by snowflake.
+         return table_name
+
+     def get_parser(
+         self,
+         connector_manifest: ConnectorManifest,
+     ) -> SnowflakeParser:
+         database_name = connector_manifest.config["snowflake.database.name"]
+         schema_name = connector_manifest.config["snowflake.schema.name"]
+
+         # Fetch user provided topic to table map
+         provided_topics_to_tables: Dict[str, str] = {}
+         if connector_manifest.config.get("snowflake.topic2table.map"):
+             for each in connector_manifest.config["snowflake.topic2table.map"].split(
+                 ","
+             ):
+                 topic, table = each.split(":")
+                 provided_topics_to_tables[topic.strip()] = table.strip()
+
+         topics_to_tables: Dict[str, str] = {}
+         # Extract lineage for only those topics whose data ingestion started
+         for topic in connector_manifest.topic_names:
+             if topic in provided_topics_to_tables:
+                 # If user provided which table to get mapped with this topic
+                 topics_to_tables[topic] = provided_topics_to_tables[topic]
+             else:
+                 # Else connector converts topic name to a valid Snowflake table name.
+                 topics_to_tables[topic] = self.get_table_name_from_topic_name(topic)
+
+         return self.SnowflakeParser(
+             database_name=database_name,
+             schema_name=schema_name,
+             topics_to_tables=topics_to_tables,
+         )
+
+     def extract_flow_property_bag(self) -> Dict[str, str]:
+         # For all snowflake sink connector properties, refer below link
+         # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
+         # remove private keys, secrets from properties
+         flow_property_bag = {
+             k: v
+             for k, v in self.connector_manifest.config.items()
+             if k
+             not in [
+                 "snowflake.private.key",
+                 "snowflake.private.key.passphrase",
+                 "value.converter.basic.auth.user.info",
+             ]
+         }
+
+         return flow_property_bag
+
+     def extract_lineages(self) -> List[KafkaConnectLineage]:
+         lineages: List[KafkaConnectLineage] = list()
+         parser = self.get_parser(self.connector_manifest)
+
+         for topic, table in parser.topics_to_tables.items():
+             target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
+             lineages.append(
+                 KafkaConnectLineage(
+                     source_dataset=topic,
+                     source_platform=KAFKA,
+                     target_dataset=target_dataset,
+                     target_platform="snowflake",
+                 )
+             )
+
+         return lineages
+
+
+ @dataclass
+ class BigQuerySinkConnector(BaseConnector):
+     @dataclass
+     class BQParser:
+         project: str
+         target_platform: str
+         sanitizeTopics: str
+         transforms: list
+         topicsToTables: Optional[str] = None
+         datasets: Optional[str] = None
+         defaultDataset: Optional[str] = None
+         version: str = "v1"
+
+     def get_parser(
+         self,
+         connector_manifest: ConnectorManifest,
+     ) -> BQParser:
+         project = connector_manifest.config["project"]
+         sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false")
+         transform_names = (
+             self.connector_manifest.config.get("transforms", "").split(",")
+             if self.connector_manifest.config.get("transforms")
+             else []
+         )
+         transforms = []
+         for name in transform_names:
+             transform = {"name": name}
+             transforms.append(transform)
+             for key in self.connector_manifest.config.keys():
+                 if key.startswith(f"transforms.{name}."):
+                     transform[
+                         key.replace(f"transforms.{name}.", "")
+                     ] = self.connector_manifest.config[key]
+
+         if "defaultDataset" in connector_manifest.config:
+             defaultDataset = connector_manifest.config["defaultDataset"]
+             return self.BQParser(
+                 project=project,
+                 defaultDataset=defaultDataset,
+                 target_platform="bigquery",
+                 sanitizeTopics=sanitizeTopics.lower() == "true",
+                 version="v2",
+                 transforms=transforms,
+             )
+         else:
+             # version 1.6.x and similar configs supported
+             datasets = connector_manifest.config["datasets"]
+             topicsToTables = connector_manifest.config.get("topicsToTables")
+
+             return self.BQParser(
+                 project=project,
+                 topicsToTables=topicsToTables,
+                 datasets=datasets,
+                 target_platform="bigquery",
+                 sanitizeTopics=sanitizeTopics.lower() == "true",
+                 transforms=transforms,
+             )
+
+     def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
+         entries = property.split(",")
+         for entry in entries:
+             key, val = entry.rsplit("=")
+             yield (key.strip(), val.strip())
+
+     def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]:
+         topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets))  # type: ignore
+         from java.util.regex import Pattern
+
+         for pattern, dataset in topicregex_dataset_map.items():
+             patternMatcher = Pattern.compile(pattern).matcher(topic)
+             if patternMatcher.matches():
+                 return dataset
+         return None
+
+     def sanitize_table_name(self, table_name):
+         table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
+         if re.match("^[^a-zA-Z_].*", table_name):
+             table_name = "_" + table_name
+
+         return table_name
+
+     def get_dataset_table_for_topic(
+         self, topic: str, parser: BQParser
+     ) -> Optional[str]:
+         if parser.version == "v2":
+             dataset = parser.defaultDataset
+             parts = topic.split(":")
+             if len(parts) == 2:
+                 dataset = parts[0]
+                 table = parts[1]
+             else:
+                 table = parts[0]
+         else:
+             dataset = self.get_dataset_for_topic_v1(topic, parser)
+             if dataset is None:
+                 return None
+
+             table = topic
+             if parser.topicsToTables:
+                 topicregex_table_map: Dict[str, str] = dict(
+                     self.get_list(parser.topicsToTables)  # type: ignore
+                 )
+                 from java.util.regex import Pattern
+
+                 for pattern, tbl in topicregex_table_map.items():
+                     patternMatcher = Pattern.compile(pattern).matcher(topic)
+                     if patternMatcher.matches():
+                         table = tbl
+                         break
+
+         if parser.sanitizeTopics:
+             table = self.sanitize_table_name(table)
+         return f"{dataset}.{table}"
+
+     def apply_transformations(
+         self, topic: str, transforms: List[Dict[str, str]]
+     ) -> str:
+         for transform in transforms:
+             if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
+                 regex = transform["regex"]
+                 replacement = transform["replacement"]
+                 pattern = re.compile(regex)
+                 if pattern.match(topic):
+                     topic = pattern.sub(replacement, topic, count=1)
+         return topic
+
+     def extract_flow_property_bag(self) -> Dict[str, str]:
+         # Mask/Remove properties that may reveal credentials
+         flow_property_bag = {
+             k: v
+             for k, v in self.connector_manifest.config.items()
+             if k not in ["keyfile"]
+         }
+
+         return flow_property_bag
+
+     def extract_lineages(self) -> List[KafkaConnectLineage]:
+         lineages: List[KafkaConnectLineage] = list()
+         parser = self.get_parser(self.connector_manifest)
+         if not parser:
+             return lineages
+         target_platform = parser.target_platform
+         project = parser.project
+         transforms = parser.transforms
+
+         for topic in self.connector_manifest.topic_names:
+             transformed_topic = self.apply_transformations(topic, transforms)
+             dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
+             if dataset_table is None:
+                 self.report.warning(
+                     "Could not find target dataset for topic, please check your connector configuration"
+                     f"{self.connector_manifest.name} : {transformed_topic} ",
+                 )
+                 continue
+             target_dataset = f"{project}.{dataset_table}"
+
+             lineages.append(
+                 KafkaConnectLineage(
+                     source_dataset=transformed_topic,
+                     source_platform=KAFKA,
+                     target_dataset=target_dataset,
+                     target_platform=target_platform,
+                 )
+             )
+         return lineages
+
+
+ BIGQUERY_SINK_CONNECTOR_CLASS = "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector"
+ S3_SINK_CONNECTOR_CLASS = "io.confluent.connect.s3.S3SinkConnector"
+ SNOWFLAKE_SINK_CONNECTOR_CLASS = "com.snowflake.kafka.connector.SnowflakeSinkConnector"
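
Taken together, the three sink connectors reduce lineage extraction to a topic-to-target-dataset naming rule: Snowflake sanitizes the topic into a table name under <database>.<schema>, S3 builds a <bucket>/<topics.dir>/<topic> path, and BigQuery (v2 config) resolves <dataset>.<table> from a defaultDataset or a "dataset:table" topic name. The standalone sketch below is illustrative only; it re-implements those rules outside the package, the helper names are hypothetical, and the BigQuery case covers only the defaultDataset path without sanitizeTopics:

import re


def snowflake_table_for_topic(topic: str) -> str:
    # Mirrors SnowflakeSinkConnector.get_table_name_from_topic_name above:
    # any character outside [a-zA-Z0-9_] becomes "_", and a name that does not
    # start with a letter or "_" gets an extra leading "_".
    table = re.sub("[^a-zA-Z0-9_]", "_", topic)
    if re.match("^[^a-zA-Z_].*", table):
        table = "_" + table
    return table


def bigquery_dataset_table(topic: str, default_dataset: str) -> str:
    # Mirrors the v2 branch of BigQuerySinkConnector.get_dataset_table_for_topic:
    # a "dataset:table" topic overrides the default dataset.
    parts = topic.split(":")
    if len(parts) == 2:
        dataset, table = parts
    else:
        dataset, table = default_dataset, parts[0]
    return f"{dataset}.{table}"


# Snowflake sink: <database>.<schema>.<sanitized table>
assert snowflake_table_for_topic("orders.created") == "orders_created"
assert snowflake_table_for_topic("2024-orders") == "_2024_orders"

# S3 sink: <bucket>/<topics.dir>/<topic>, with topics.dir defaulting to "topics"
bucket, topics_dir, topic = "my-bucket", "topics", "orders.created"
assert f"{bucket}/{topics_dir}/{topic}" == "my-bucket/topics/orders.created"

# BigQuery sink (v2 config): defaultDataset unless the topic names its own dataset
assert bigquery_dataset_table("orders", "raw") == "raw.orders"
assert bigquery_dataset_table("sales:orders", "raw") == "sales.orders"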