acryl-datahub 1.1.0.5rc2__py3-none-any.whl → 1.1.0.5rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/METADATA +2550 -2550
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/RECORD +42 -35
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/ingestion/api/report.py +123 -2
- datahub/ingestion/api/source.py +45 -44
- datahub/ingestion/autogenerated/lineage_helper.py +193 -0
- datahub/ingestion/run/pipeline.py +6 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +4 -4
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +5 -1
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/vertica.py +2 -1
- datahub/ingestion/source/unity/source.py +36 -20
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/metadata/_internal_schema_classes.py +601 -0
- datahub/metadata/_urns/urn_defs.py +112 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +383 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
- datahub/sdk/datajob.py +39 -15
- datahub/specific/dataproduct.py +4 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/kafka_connect/sink_connectors.py
+++ b/datahub/ingestion/source/kafka_connect/sink_connectors.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Tuple
@@ -9,6 +10,81 @@ from datahub.ingestion.source.kafka_connect.common import (
     KafkaConnectLineage,
 )
 
+logger = logging.getLogger(__name__)
+
+
+class RegexRouterTransform:
+    """Helper class to handle RegexRouter transformations for topic/table names."""
+
+    def __init__(self, config: Dict[str, str]) -> None:
+        self.transforms = self._parse_transforms(config)
+
+    def _parse_transforms(self, config: Dict[str, str]) -> List[Dict[str, str]]:
+        """Parse transforms configuration from connector config."""
+        transforms_list: List[Dict[str, str]] = []
+
+        # Get the transforms parameter
+        transforms_param: str = config.get("transforms", "")
+        if not transforms_param:
+            return transforms_list
+
+        # Parse individual transforms
+        transform_names: List[str] = [
+            name.strip() for name in transforms_param.split(",")
+        ]
+
+        for transform_name in transform_names:
+            if not transform_name:
+                continue
+            transform_config: Dict[str, str] = {}
+            transform_prefix: str = f"transforms.{transform_name}."
+
+            # Extract transform configuration
+            for key, value in config.items():
+                if key.startswith(transform_prefix):
+                    config_key: str = key[len(transform_prefix) :]
+                    transform_config[config_key] = value
+
+            # Only process RegexRouter transforms
+            if (
+                transform_config.get("type")
+                == "org.apache.kafka.connect.transforms.RegexRouter"
+            ):
+                transform_config["name"] = transform_name
+                transforms_list.append(transform_config)
+
+        return transforms_list
+
+    def apply_transforms(self, topic_name: str) -> str:
+        """Apply RegexRouter transforms to the topic name using Java regex."""
+        result: str = topic_name
+
+        for transform in self.transforms:
+            regex_pattern: Optional[str] = transform.get("regex")
+            replacement: str = transform.get("replacement", "")
+
+            if regex_pattern:
+                try:
+                    # Use Java Pattern and Matcher for exact Kafka Connect compatibility
+                    from java.util.regex import Pattern
+
+                    pattern = Pattern.compile(regex_pattern)
+                    matcher = pattern.matcher(result)
+
+                    if matcher.find():
+                        # Reset matcher to beginning for replaceFirst
+                        matcher.reset()
+                        result = matcher.replaceFirst(replacement)
+                        logger.debug(
+                            f"Applied transform {transform['name']}: {topic_name} -> {result}"
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Invalid regex pattern in transform {transform['name']}: {e}"
+                    )
+
+        return str(result)
+
 
 @dataclass
 class ConfluentS3SinkConnector(BaseConnector):
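The `RegexRouterTransform` class above mirrors Kafka Connect's `org.apache.kafka.connect.transforms.RegexRouter` single-message transform, which renames a topic by replacing the first regex match with a replacement string. Below is a minimal sketch of that renaming behavior using Python's `re` module; it is an illustration only (the class itself routes through Java's `Pattern`/`Matcher`, where group references use `$1` rather than Python's `\1`), and the connector config values are made up.

```python
import re

# Hypothetical connector config declaring one RegexRouter transform named "route".
config = {
    "transforms": "route",
    "transforms.route.type": "org.apache.kafka.connect.transforms.RegexRouter",
    "transforms.route.regex": r"orders-(.*)",
    "transforms.route.replacement": r"prod_orders_\1",  # Python-style group ref; Kafka Connect uses $1
}


def apply_regex_router(topic: str, regex: str, replacement: str) -> str:
    # RegexRouter replaces the first match of the pattern in the topic name.
    return re.sub(regex, replacement, topic, count=1)


print(
    apply_regex_router(
        "orders-eu",
        config["transforms.route.regex"],
        config["transforms.route.replacement"],
    )
)  # -> prod_orders_eu
```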
@@ -18,28 +94,35 @@ class ConfluentS3SinkConnector(BaseConnector):
         bucket: str
         topics_dir: str
         topics: Iterable[str]
+        regex_router: RegexRouterTransform
 
     def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
-        bucket = connector_manifest.config.get("s3.bucket.name")
+        bucket: Optional[str] = connector_manifest.config.get("s3.bucket.name")
         if not bucket:
             raise ValueError(
                 "Could not find 's3.bucket.name' in connector configuration"
             )
 
         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
-        topics_dir = connector_manifest.config.get("topics.dir", "topics")
+        topics_dir: str = connector_manifest.config.get("topics.dir", "topics")
+
+        # Create RegexRouterTransform instance
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )
 
         return self.S3SinkParser(
             target_platform="s3",
             bucket=bucket,
             topics_dir=topics_dir,
             topics=connector_manifest.topic_names,
+            regex_router=regex_router,
         )
 
     def extract_flow_property_bag(self) -> Dict[str, str]:
         # Mask/Remove properties that may reveal credentials
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k
@@ -54,11 +137,17 @@ class ConfluentS3SinkConnector(BaseConnector):
 
     def extract_lineages(self) -> List[KafkaConnectLineage]:
         try:
-            parser = self._get_parser(self.connector_manifest)
+            parser: ConfluentS3SinkConnector.S3SinkParser = self._get_parser(
+                self.connector_manifest
+            )
 
             lineages: List[KafkaConnectLineage] = list()
             for topic in parser.topics:
-                target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}"
+                # Apply RegexRouter transformations using the RegexRouterTransform class
+                transformed_topic: str = parser.regex_router.apply_transforms(topic)
+                target_dataset: str = (
+                    f"{parser.bucket}/{parser.topics_dir}/{transformed_topic}"
+                )
 
                 lineages.append(
                     KafkaConnectLineage(
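For the S3 sink, the lineage target is built from the bucket, the `topics.dir` setting, and the (possibly RegexRouter-renamed) topic. A small sketch of that path construction, with made-up values:

```python
bucket = "analytics-bucket"            # from s3.bucket.name
topics_dir = "topics"                  # default value of topics.dir
transformed_topic = "prod_orders_eu"   # output of RegexRouterTransform.apply_transforms

target_dataset = f"{bucket}/{topics_dir}/{transformed_topic}"
print(target_dataset)  # analytics-bucket/topics/prod_orders_eu
```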
@@ -86,6 +175,7 @@ class SnowflakeSinkConnector(BaseConnector):
         database_name: str
         schema_name: str
         topics_to_tables: Dict[str, str]
+        regex_router: RegexRouterTransform
 
     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
@@ -93,7 +183,7 @@ class SnowflakeSinkConnector(BaseConnector):
         Refer below link for more info
         https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
         """
-        table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
+        table_name: str = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
         if re.match("^[^a-zA-Z_].*", table_name):
             table_name = "_" + table_name
         # Connector may append original topic's hash code as suffix for conflict resolution
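The sanitization in `get_table_name_from_topic_name` follows the two rules visible above: replace anything that is not a letter, digit, or underscore with `_`, and prefix `_` when the result does not start with a letter or underscore. A standalone sketch of just those two rules (the real connector may additionally append a hash suffix for conflicting topics, as the comment notes):

```python
import re


def topic_to_snowflake_table(topic_name: str) -> str:
    # Rule 1: any character that is not [a-zA-Z0-9_] becomes an underscore.
    table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
    # Rule 2: names must start with a letter or underscore.
    if re.match("^[^a-zA-Z_].*", table_name):
        table_name = "_" + table_name
    return table_name


print(topic_to_snowflake_table("orders.eu-west"))  # orders_eu_west
print(topic_to_snowflake_table("1-metrics"))       # _1_metrics
```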
@@ -106,8 +196,13 @@ class SnowflakeSinkConnector(BaseConnector):
         self,
         connector_manifest: ConnectorManifest,
     ) -> SnowflakeParser:
-        database_name = connector_manifest.config["snowflake.database.name"]
-        schema_name = connector_manifest.config["snowflake.schema.name"]
+        database_name: str = connector_manifest.config["snowflake.database.name"]
+        schema_name: str = connector_manifest.config["snowflake.schema.name"]
+
+        # Create RegexRouterTransform instance
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )
 
         # Fetch user provided topic to table map
         provided_topics_to_tables: Dict[str, str] = {}
@@ -121,24 +216,30 @@ class SnowflakeSinkConnector(BaseConnector):
         topics_to_tables: Dict[str, str] = {}
         # Extract lineage for only those topics whose data ingestion started
         for topic in connector_manifest.topic_names:
+            # Apply transforms first to get the transformed topic name
+            transformed_topic: str = regex_router.apply_transforms(topic)
+
             if topic in provided_topics_to_tables:
                 # If user provided which table to get mapped with this topic
                 topics_to_tables[topic] = provided_topics_to_tables[topic]
             else:
-                #
-                topics_to_tables[topic] = self.get_table_name_from_topic_name(topic)
+                # Use the transformed topic name to generate table name
+                topics_to_tables[topic] = self.get_table_name_from_topic_name(
+                    transformed_topic
+                )
 
         return self.SnowflakeParser(
             database_name=database_name,
             schema_name=schema_name,
             topics_to_tables=topics_to_tables,
+            regex_router=regex_router,
         )
 
     def extract_flow_property_bag(self) -> Dict[str, str]:
         # For all snowflake sink connector properties, refer below link
         # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
         # remove private keys, secrets from properties
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k
@@ -153,10 +254,12 @@ class SnowflakeSinkConnector(BaseConnector):
 
     def extract_lineages(self) -> List[KafkaConnectLineage]:
         lineages: List[KafkaConnectLineage] = list()
-        parser = self.get_parser(self.connector_manifest)
+        parser: SnowflakeSinkConnector.SnowflakeParser = self.get_parser(
+            self.connector_manifest
+        )
 
         for topic, table in parser.topics_to_tables.items():
-            target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
+            target_dataset: str = f"{parser.database_name}.{parser.schema_name}.{table}"
             lineages.append(
                 KafkaConnectLineage(
                     source_dataset=topic,
@@ -176,7 +279,8 @@ class BigQuerySinkConnector(BaseConnector):
         project: str
         target_platform: str
         sanitizeTopics: bool
-        transforms:
+        transforms: List[Dict[str, str]]
+        regex_router: RegexRouterTransform
         topicsToTables: Optional[str] = None
         datasets: Optional[str] = None
         defaultDataset: Optional[str] = None
@@ -186,16 +290,18 @@ class BigQuerySinkConnector(BaseConnector):
         self,
         connector_manifest: ConnectorManifest,
     ) -> BQParser:
-        project = connector_manifest.config["project"]
-        sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
-        transform_names = (
+        project: str = connector_manifest.config["project"]
+        sanitizeTopics: str = connector_manifest.config.get("sanitizeTopics") or "false"
+
+        # Parse ALL transforms (original BigQuery logic)
+        transform_names: List[str] = (
             self.connector_manifest.config.get("transforms", "").split(",")
             if self.connector_manifest.config.get("transforms")
             else []
         )
-        transforms = []
+        transforms: List[Dict[str, str]] = []
         for name in transform_names:
-            transform = {"name": name}
+            transform: Dict[str, str] = {"name": name}
             transforms.append(transform)
             for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
@@ -203,8 +309,13 @@ class BigQuerySinkConnector(BaseConnector):
                         self.connector_manifest.config[key]
                     )
 
+        # Create RegexRouterTransform instance for RegexRouter-specific handling
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )
+
         if "defaultDataset" in connector_manifest.config:
-            defaultDataset = connector_manifest.config["defaultDataset"]
+            defaultDataset: str = connector_manifest.config["defaultDataset"]
             return self.BQParser(
                 project=project,
                 defaultDataset=defaultDataset,
@@ -212,11 +323,14 @@ class BigQuerySinkConnector(BaseConnector):
                 sanitizeTopics=sanitizeTopics.lower() == "true",
                 version="v2",
                 transforms=transforms,
+                regex_router=regex_router,
             )
         else:
             # version 1.6.x and similar configs supported
-            datasets = connector_manifest.config["datasets"]
-            topicsToTables = connector_manifest.config.get("topicsToTables")
+            datasets: str = connector_manifest.config["datasets"]
+            topicsToTables: Optional[str] = connector_manifest.config.get(
+                "topicsToTables"
+            )
 
             return self.BQParser(
                 project=project,
@@ -225,10 +339,11 @@ class BigQuerySinkConnector(BaseConnector):
                 target_platform="bigquery",
                 sanitizeTopics=sanitizeTopics.lower() == "true",
                 transforms=transforms,
+                regex_router=regex_router,
             )
 
     def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
-        entries = property.split(",")
+        entries: List[str] = property.split(",")
        for entry in entries:
             key, val = entry.rsplit("=")
             yield (key.strip(), val.strip())
@@ -243,7 +358,7 @@ class BigQuerySinkConnector(BaseConnector):
                 return dataset
         return None
 
-    def sanitize_table_name(self, table_name):
+    def sanitize_table_name(self, table_name: str) -> str:
         table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
         if re.match("^[^a-zA-Z_].*", table_name):
             table_name = "_" + table_name
@@ -254,8 +369,8 @@ class BigQuerySinkConnector(BaseConnector):
         self, topic: str, parser: BQParser
     ) -> Optional[str]:
         if parser.version == "v2":
-            dataset = parser.defaultDataset
-            parts = topic.split(":")
+            dataset: Optional[str] = parser.defaultDataset
+            parts: List[str] = topic.split(":")
             if len(parts) == 2:
                 dataset = parts[0]
                 table = parts[1]
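In the BigQuery v2 path, a topic of the form `dataset:table` carries its own dataset; otherwise `defaultDataset` is used and the whole topic name becomes the table. A hedged sketch of that split (the function name here is mine, not the connector's):

```python
from typing import Optional, Tuple


def resolve_dataset_and_table(
    topic: str, default_dataset: Optional[str]
) -> Tuple[Optional[str], str]:
    parts = topic.split(":")
    if len(parts) == 2:
        # "mydataset:mytable" -> dataset comes from the topic itself
        return parts[0], parts[1]
    # plain topic name -> fall back to defaultDataset
    return default_dataset, topic


print(resolve_dataset_and_table("mydataset:mytable", "fallback"))  # ('mydataset', 'mytable')
print(resolve_dataset_and_table("events", "fallback"))             # ('fallback', 'events')
```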
@@ -283,21 +398,9 @@ class BigQuerySinkConnector(BaseConnector):
             table = self.sanitize_table_name(table)
         return f"{dataset}.{table}"
 
-    def apply_transformations(
-        self, topic: str, transforms: List[Dict[str, str]]
-    ) -> str:
-        for transform in transforms:
-            if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
-                regex = transform["regex"]
-                replacement = transform["replacement"]
-                pattern = re.compile(regex)
-                if pattern.match(topic):
-                    topic = pattern.sub(replacement, topic, count=1)
-        return topic
-
     def extract_flow_property_bag(self) -> Dict[str, str]:
         # Mask/Remove properties that may reveal credentials
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k not in ["keyfile"]
@@ -307,27 +410,33 @@ class BigQuerySinkConnector(BaseConnector):
 
     def extract_lineages(self) -> List[KafkaConnectLineage]:
         lineages: List[KafkaConnectLineage] = list()
-        parser = self.get_parser(self.connector_manifest)
+        parser: BigQuerySinkConnector.BQParser = self.get_parser(
+            self.connector_manifest
+        )
         if not parser:
             return lineages
-        target_platform = parser.target_platform
-        project = parser.project
-        transforms = parser.transforms
+        target_platform: str = parser.target_platform
+        project: str = parser.project
 
         for topic in self.connector_manifest.topic_names:
-            transformed_topic = self.apply_transformations(topic, transforms)
-            dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
+            # Apply RegexRouter transformations using the RegexRouterTransform class
+            transformed_topic: str = parser.regex_router.apply_transforms(topic)
+
+            # Use the transformed topic to determine dataset/table
+            dataset_table: Optional[str] = self.get_dataset_table_for_topic(
+                transformed_topic, parser
+            )
             if dataset_table is None:
                 self.report.warning(
                     "Could not find target dataset for topic, please check your connector configuration"
                     f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
-            target_dataset = f"{project}.{dataset_table}"
+            target_dataset: str = f"{project}.{dataset_table}"
 
             lineages.append(
                 KafkaConnectLineage(
-                    source_dataset=transformed_topic,
+                    source_dataset=topic,  # Keep original topic as source
                     source_platform=KAFKA,
                     target_dataset=target_dataset,
                     target_platform=target_platform,
--- a/datahub/ingestion/source/mock_data/datahub_mock_data.py
+++ b/datahub/ingestion/source/mock_data/datahub_mock_data.py
@@ -15,6 +15,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
     DataHubMockDataReport,
 )
@@ -211,15 +212,19 @@ class DataHubMockDataSource(Source):
         pattern = self.config.gen_1.subtype_pattern
 
         if pattern == SubTypePattern.ALTERNATING:
-            return
+            return (
+                DatasetSubTypes.TABLE if table_index % 2 == 0 else DatasetSubTypes.VIEW
+            )
         elif pattern == SubTypePattern.LEVEL_BASED:
-            return self.config.gen_1.level_subtypes.get(
+            return self.config.gen_1.level_subtypes.get(
+                table_level, DatasetSubTypes.TABLE
+            )
         elif pattern == SubTypePattern.ALL_TABLE:
-            return
+            return DatasetSubTypes.TABLE
         elif pattern == SubTypePattern.ALL_VIEW:
-            return
+            return DatasetSubTypes.VIEW
         else:
-            return
+            return DatasetSubTypes.TABLE  # default
 
     def _get_subtypes_aspect(
         self, table_name: str, table_level: int, table_index: int
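The mock-data fix above replaces bare returns with concrete `DatasetSubTypes` values. A self-contained sketch of the selection rules, using a stand-in enum and plain string pattern names in place of datahub's `SubTypePattern`/`DatasetSubTypes` constants (the enum values here are assumptions for illustration):

```python
from enum import Enum
from typing import Dict


class DatasetSubTypes(str, Enum):
    # Stand-in values; datahub's real constants live in
    # datahub.ingestion.source.common.subtypes.
    TABLE = "Table"
    VIEW = "View"


def pick_subtype(
    pattern: str,
    table_level: int,
    table_index: int,
    level_subtypes: Dict[int, DatasetSubTypes],
) -> DatasetSubTypes:
    if pattern == "ALTERNATING":
        return DatasetSubTypes.TABLE if table_index % 2 == 0 else DatasetSubTypes.VIEW
    if pattern == "LEVEL_BASED":
        return level_subtypes.get(table_level, DatasetSubTypes.TABLE)
    if pattern == "ALL_TABLE":
        return DatasetSubTypes.TABLE
    if pattern == "ALL_VIEW":
        return DatasetSubTypes.VIEW
    return DatasetSubTypes.TABLE  # default


print(pick_subtype("ALTERNATING", table_level=0, table_index=3, level_subtypes={}))
# -> DatasetSubTypes.VIEW
```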
@@ -261,11 +266,8 @@ class DataHubMockDataSource(Source):
             fan_out, hops, fan_out_after_first
         )
 
-        logger.info(
-            f"About to create {tables_to_be_created} tables for lineage testing"
-        )
+        logger.info(f"About to create {tables_to_be_created} datasets mock data")
 
-        current_progress = 0
         for i in range(hops + 1):
             tables_at_level = tables_at_levels[i]
 
@@ -286,12 +288,6 @@ class DataHubMockDataSource(Source):
                     tables_at_levels=tables_at_levels,
                 )
 
-                current_progress += 1
-                if current_progress % 1000 == 0:
-                    logger.info(
-                        f"Progress: {current_progress}/{tables_to_be_created} tables processed"
-                    )
-
     def _generate_lineage_for_table(
         self,
         table_name: str,
--- a/datahub/ingestion/source/slack/slack.py
+++ b/datahub/ingestion/source/slack/slack.py
@@ -23,6 +23,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -493,7 +494,7 @@ class SlackSource(StatefulIngestionSourceBase):
             mcp=MetadataChangeProposalWrapper(
                 entityUrn=urn_channel,
                 aspect=SubTypesClass(
-                    typeNames=[
+                    typeNames=[DatasetSubTypes.SLACK_CHANNEL],
                 ),
             ),
         )
--- a/datahub/ingestion/source/snowflake/snowflake_queries.py
+++ b/datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -127,6 +127,8 @@ class SnowflakeQueriesExtractorReport(Report):
     users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+
     sql_aggregator: Optional[SqlAggregatorReport] = None
 
     num_ddl_queries_dropped: int = 0
@@ -282,7 +284,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
             self.aggregator.add(query)
 
-        yield from auto_workunit(self.aggregator.gen_metadata())
+        with self.report.aggregator_generate_timer:
+            yield from auto_workunit(self.aggregator.gen_metadata())
 
     def fetch_users(self) -> UsersMapping:
         users: UsersMapping = dict()
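The new `aggregator_generate_timer` wraps `gen_metadata()` in a `with` block, so `PerfTimer` is being used as a context manager. A rough, self-contained equivalent of that timing pattern (not datahub's `PerfTimer` implementation, just an illustration of accumulating elapsed time around a block):

```python
import time
from types import TracebackType
from typing import Optional, Type


class SimpleTimer:
    """Accumulates wall-clock time spent inside `with` blocks."""

    def __init__(self) -> None:
        self.elapsed = 0.0
        self._start: Optional[float] = None

    def __enter__(self) -> "SimpleTimer":
        self._start = time.monotonic()
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType],
    ) -> None:
        assert self._start is not None
        self.elapsed += time.monotonic() - self._start
        self._start = None


timer = SimpleTimer()
with timer:
    sum(range(1_000_000))  # stand-in for aggregator.gen_metadata()
print(f"spent {timer.elapsed:.3f}s generating metadata")
```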
@@ -660,6 +663,7 @@ class SnowflakeQueriesSource(Source):
     def close(self) -> None:
         self.connection.close()
         self.queries_extractor.close()
+        super().close()
 
 
 # Make sure we don't try to generate too much info for a single query.
--- a/datahub/ingestion/source/sql/sql_generic_profiler.py
+++ b/datahub/ingestion/source/sql/sql_generic_profiler.py
@@ -57,10 +57,11 @@ class GenericProfiler:
         platform: Optional[str] = None,
         profiler_args: Optional[Dict] = None,
     ) -> Iterable[MetadataWorkUnit]:
+        # We don't run ge profiling queries if table profiling is enabled or if the row count is 0.
         ge_profile_requests: List[GEProfilerRequest] = [
             cast(GEProfilerRequest, request)
             for request in requests
-            if not request.profile_table_level_only
+            if not request.profile_table_level_only or request.table.rows_count == 0
         ]
         table_level_profile_requests: List[TableProfilerRequest] = [
             request for request in requests if request.profile_table_level_only
--- a/datahub/ingestion/source/sql/vertica.py
+++ b/datahub/ingestion/source/sql/vertica.py
@@ -25,6 +25,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -497,7 +498,7 @@ class VerticaSource(SQLAlchemySource):
                 changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
                 aspectName="subTypes",
-                aspect=SubTypesClass(typeNames=[
+                aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
             ).as_workunit()
 
             if self.config.domain:
--- a/datahub/ingestion/source/unity/source.py
+++ b/datahub/ingestion/source/unity/source.py
@@ -1020,29 +1020,45 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     ) -> Iterable[MetadataWorkUnit]:
         if self.ctx.graph and self.platform_resource_repository:
             for tag in tags:
-
-
-
-
-
-
+                try:
+                    platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
+                        platform_instance=self.platform_instance_name,
+                        platform_resource_repository=self.platform_resource_repository,
+                        tag=tag,
+                    )
+                    logger.debug(f"Created platform resource {platform_resource_id}")
 
-
-
-
-
-
-                ):
-                    unity_catalog_tag.datahub_linked_resources().add(
-                        tag.to_datahub_tag_urn().urn()
+                    unity_catalog_tag = (
+                        UnityCatalogTagPlatformResource.get_from_datahub(
+                            platform_resource_id,
+                            self.platform_resource_repository,
+                            False,
+                        )
                     )
-
-
-
-
-
+                    if (
+                        tag.to_datahub_tag_urn().urn()
+                        not in unity_catalog_tag.datahub_linked_resources().urns
+                    ):
+                        unity_catalog_tag.datahub_linked_resources().add(
+                            tag.to_datahub_tag_urn().urn()
                         )
+                        platform_resource = unity_catalog_tag.as_platform_resource()
+                        for mcp in platform_resource.to_mcps():
+                            yield MetadataWorkUnit(
+                                id=f"platform_resource-{platform_resource.id}",
+                                mcp=mcp,
+                            )
+                except Exception as e:
+                    logger.exception(
+                        f"Error processing platform resource for tag {tag}"
+                    )
+                    self.report.report_warning(
+                        message="Error processing platform resource for tag",
+                        context=str(tag),
+                        title="Error processing platform resource for tag",
+                        exc=e,
+                    )
+                    continue
 
     def _create_schema_metadata_aspect(
         self, table: Table
--- a/datahub/ingestion/transformer/add_dataset_ownership.py
+++ b/datahub/ingestion/transformer/add_dataset_ownership.py
@@ -71,8 +71,24 @@ class AddDatasetOwnership(OwnershipTransformer):
 
         server_ownership = graph.get_ownership(entity_urn=urn)
         if server_ownership:
-            owners = {
-
+            owners = {
+                (
+                    owner.owner,
+                    owner.type,
+                    owner.typeUrn,
+                ): owner
+                for owner in server_ownership.owners
+            }
+            owners.update(
+                {
+                    (
+                        owner.owner,
+                        owner.type,
+                        owner.typeUrn,
+                    ): owner
+                    for owner in mce_ownership.owners
+                }
+            )
             mce_ownership.owners = list(owners.values())
 
         return mce_ownership