acryl-datahub 1.1.0.5rc2__py3-none-any.whl → 1.1.0.5rc4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (42):
  1. {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/METADATA +2550 -2550
  2. {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/RECORD +42 -35
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +1 -1
  5. datahub/ingestion/api/report.py +123 -2
  6. datahub/ingestion/api/source.py +45 -44
  7. datahub/ingestion/autogenerated/lineage_helper.py +193 -0
  8. datahub/ingestion/run/pipeline.py +6 -0
  9. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  10. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  11. datahub/ingestion/source/bigquery_v2/queries.py +4 -4
  12. datahub/ingestion/source/common/subtypes.py +2 -0
  13. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  14. datahub/ingestion/source/hex/api.py +26 -1
  15. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  16. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
  17. datahub/ingestion/source/slack/slack.py +2 -1
  18. datahub/ingestion/source/snowflake/snowflake_queries.py +5 -1
  19. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  20. datahub/ingestion/source/sql/vertica.py +2 -1
  21. datahub/ingestion/source/unity/source.py +36 -20
  22. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  23. datahub/metadata/_internal_schema_classes.py +601 -0
  24. datahub/metadata/_urns/urn_defs.py +112 -0
  25. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  26. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  27. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  28. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  29. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  30. datahub/metadata/schema.avsc +383 -0
  31. datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
  32. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  33. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
  34. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  35. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  36. datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
  37. datahub/sdk/datajob.py +39 -15
  38. datahub/specific/dataproduct.py +4 -0
  39. {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/WHEEL +0 -0
  40. {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/entry_points.txt +0 -0
  41. {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/licenses/LICENSE +0 -0
  42. {acryl_datahub-1.1.0.5rc2.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/kafka_connect/sink_connectors.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Tuple
@@ -9,6 +10,81 @@ from datahub.ingestion.source.kafka_connect.common import (
     KafkaConnectLineage,
 )
 
+logger = logging.getLogger(__name__)
+
+
+class RegexRouterTransform:
+    """Helper class to handle RegexRouter transformations for topic/table names."""
+
+    def __init__(self, config: Dict[str, str]) -> None:
+        self.transforms = self._parse_transforms(config)
+
+    def _parse_transforms(self, config: Dict[str, str]) -> List[Dict[str, str]]:
+        """Parse transforms configuration from connector config."""
+        transforms_list: List[Dict[str, str]] = []
+
+        # Get the transforms parameter
+        transforms_param: str = config.get("transforms", "")
+        if not transforms_param:
+            return transforms_list
+
+        # Parse individual transforms
+        transform_names: List[str] = [
+            name.strip() for name in transforms_param.split(",")
+        ]
+
+        for transform_name in transform_names:
+            if not transform_name:
+                continue
+            transform_config: Dict[str, str] = {}
+            transform_prefix: str = f"transforms.{transform_name}."
+
+            # Extract transform configuration
+            for key, value in config.items():
+                if key.startswith(transform_prefix):
+                    config_key: str = key[len(transform_prefix) :]
+                    transform_config[config_key] = value
+
+            # Only process RegexRouter transforms
+            if (
+                transform_config.get("type")
+                == "org.apache.kafka.connect.transforms.RegexRouter"
+            ):
+                transform_config["name"] = transform_name
+                transforms_list.append(transform_config)
+
+        return transforms_list
+
+    def apply_transforms(self, topic_name: str) -> str:
+        """Apply RegexRouter transforms to the topic name using Java regex."""
+        result: str = topic_name
+
+        for transform in self.transforms:
+            regex_pattern: Optional[str] = transform.get("regex")
+            replacement: str = transform.get("replacement", "")
+
+            if regex_pattern:
+                try:
+                    # Use Java Pattern and Matcher for exact Kafka Connect compatibility
+                    from java.util.regex import Pattern
+
+                    pattern = Pattern.compile(regex_pattern)
+                    matcher = pattern.matcher(result)
+
+                    if matcher.find():
+                        # Reset matcher to beginning for replaceFirst
+                        matcher.reset()
+                        result = matcher.replaceFirst(replacement)
+                        logger.debug(
+                            f"Applied transform {transform['name']}: {topic_name} -> {result}"
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Invalid regex pattern in transform {transform['name']}: {e}"
+                    )
+
+        return str(result)
+
 
 @dataclass
 class ConfluentS3SinkConnector(BaseConnector):
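RegexRouter is the Kafka Connect single message transform that rewrites topic names; the class above collects the transforms.<name>.* keys from the connector config and keeps only the RegexRouter entries. Below is a minimal sketch of the resulting renaming, assuming a hypothetical connector config and using Python's re as a CPython stand-in for java.util.regex (Java-style "$1" group references are rewritten to Python's "\1"):

    import re
    from typing import Dict

    # Hypothetical connector config: strip a "db.server." prefix from topic names.
    config: Dict[str, str] = {
        "transforms": "Reroute",
        "transforms.Reroute.type": "org.apache.kafka.connect.transforms.RegexRouter",
        "transforms.Reroute.regex": r"db\.server\.(.*)",
        "transforms.Reroute.replacement": "$1",
    }

    def apply_regex_router(topic: str, regex: str, replacement: str) -> str:
        # Approximate java.util.regex replaceFirst with re.sub(count=1),
        # translating Java "$1" group references into Python's "\1" form.
        py_replacement = re.sub(r"\$(\d+)", r"\\\1", replacement)
        return re.sub(regex, py_replacement, topic, count=1)

    print(apply_regex_router(
        "db.server.orders",
        config["transforms.Reroute.regex"],
        config["transforms.Reroute.replacement"],
    ))  # -> orders

If the regex does not match, the topic name passes through unchanged, which is also what apply_transforms does when matcher.find() returns False.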
@@ -18,28 +94,35 @@ class ConfluentS3SinkConnector(BaseConnector):
         bucket: str
         topics_dir: str
         topics: Iterable[str]
+        regex_router: RegexRouterTransform
 
     def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
-        bucket = connector_manifest.config.get("s3.bucket.name")
+        bucket: Optional[str] = connector_manifest.config.get("s3.bucket.name")
         if not bucket:
             raise ValueError(
                 "Could not find 's3.bucket.name' in connector configuration"
             )
 
         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
-        topics_dir = connector_manifest.config.get("topics.dir", "topics")
+        topics_dir: str = connector_manifest.config.get("topics.dir", "topics")
+
+        # Create RegexRouterTransform instance
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )
 
         return self.S3SinkParser(
             target_platform="s3",
             bucket=bucket,
             topics_dir=topics_dir,
             topics=connector_manifest.topic_names,
+            regex_router=regex_router,
         )
 
     def extract_flow_property_bag(self) -> Dict[str, str]:
         # Mask/Remove properties that may reveal credentials
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k
@@ -54,11 +137,17 @@ class ConfluentS3SinkConnector(BaseConnector):
 
     def extract_lineages(self) -> List[KafkaConnectLineage]:
         try:
-            parser = self._get_parser(self.connector_manifest)
+            parser: ConfluentS3SinkConnector.S3SinkParser = self._get_parser(
+                self.connector_manifest
+            )
 
             lineages: List[KafkaConnectLineage] = list()
             for topic in parser.topics:
-                target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}"
+                # Apply RegexRouter transformations using the RegexRouterTransform class
+                transformed_topic: str = parser.regex_router.apply_transforms(topic)
+                target_dataset: str = (
+                    f"{parser.bucket}/{parser.topics_dir}/{transformed_topic}"
+                )
 
                 lineages.append(
                     KafkaConnectLineage(
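With the hypothetical Reroute transform from the sketch above, an S3 sink writing to bucket my-bucket with the default topics.dir would now record my-bucket/topics/orders as the lineage target for the Kafka topic db.server.orders, rather than my-bucket/topics/db.server.orders. A quick illustration (all values hypothetical):

    bucket = "my-bucket"          # s3.bucket.name
    topics_dir = "topics"         # default topics.dir
    transformed_topic = "orders"  # db.server.orders after the RegexRouter example
    print(f"{bucket}/{topics_dir}/{transformed_topic}")  # -> my-bucket/topics/orders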
@@ -86,6 +175,7 @@ class SnowflakeSinkConnector(BaseConnector):
         database_name: str
         schema_name: str
         topics_to_tables: Dict[str, str]
+        regex_router: RegexRouterTransform
 
     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
@@ -93,7 +183,7 @@
         Refer below link for more info
         https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
         """
-        table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
+        table_name: str = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
         if re.match("^[^a-zA-Z_].*", table_name):
             table_name = "_" + table_name
         # Connector may append original topic's hash code as suffix for conflict resolution
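The two regex steps above can be checked in isolation; a small sketch with hypothetical topic names (when a RegexRouter transform is configured, the renaming now happens before this conversion):

    import re

    def to_snowflake_table_name(topic_name: str) -> str:
        # Replace characters that are not legal in an unquoted Snowflake identifier.
        table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
        # Prefix an underscore if the name does not start with a letter or underscore.
        if re.match("^[^a-zA-Z_].*", table_name):
            table_name = "_" + table_name
        return table_name

    print(to_snowflake_table_name("web.orders-v2"))  # -> web_orders_v2
    print(to_snowflake_table_name("2024-metrics"))   # -> _2024_metrics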
@@ -106,8 +196,13 @@
         self,
         connector_manifest: ConnectorManifest,
     ) -> SnowflakeParser:
-        database_name = connector_manifest.config["snowflake.database.name"]
-        schema_name = connector_manifest.config["snowflake.schema.name"]
+        database_name: str = connector_manifest.config["snowflake.database.name"]
+        schema_name: str = connector_manifest.config["snowflake.schema.name"]
+
+        # Create RegexRouterTransform instance
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )
 
         # Fetch user provided topic to table map
         provided_topics_to_tables: Dict[str, str] = {}
@@ -121,24 +216,30 @@
         topics_to_tables: Dict[str, str] = {}
         # Extract lineage for only those topics whose data ingestion started
         for topic in connector_manifest.topic_names:
+            # Apply transforms first to get the transformed topic name
+            transformed_topic: str = regex_router.apply_transforms(topic)
+
             if topic in provided_topics_to_tables:
                 # If user provided which table to get mapped with this topic
                 topics_to_tables[topic] = provided_topics_to_tables[topic]
             else:
-                # Else connector converts topic name to a valid Snowflake table name.
-                topics_to_tables[topic] = self.get_table_name_from_topic_name(topic)
+                # Use the transformed topic name to generate table name
+                topics_to_tables[topic] = self.get_table_name_from_topic_name(
+                    transformed_topic
+                )
 
         return self.SnowflakeParser(
             database_name=database_name,
             schema_name=schema_name,
             topics_to_tables=topics_to_tables,
+            regex_router=regex_router,
         )
 
     def extract_flow_property_bag(self) -> Dict[str, str]:
         # For all snowflake sink connector properties, refer below link
         # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
         # remove private keys, secrets from properties
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k
@@ -153,10 +254,12 @@ class SnowflakeSinkConnector(BaseConnector):
 
     def extract_lineages(self) -> List[KafkaConnectLineage]:
         lineages: List[KafkaConnectLineage] = list()
-        parser = self.get_parser(self.connector_manifest)
+        parser: SnowflakeSinkConnector.SnowflakeParser = self.get_parser(
+            self.connector_manifest
+        )
 
         for topic, table in parser.topics_to_tables.items():
-            target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
+            target_dataset: str = f"{parser.database_name}.{parser.schema_name}.{table}"
             lineages.append(
                 KafkaConnectLineage(
                     source_dataset=topic,
@@ -176,7 +279,8 @@ class BigQuerySinkConnector(BaseConnector):
         project: str
         target_platform: str
         sanitizeTopics: bool
-        transforms: list
+        transforms: List[Dict[str, str]]
+        regex_router: RegexRouterTransform
         topicsToTables: Optional[str] = None
         datasets: Optional[str] = None
         defaultDataset: Optional[str] = None
@@ -186,16 +290,18 @@
         self,
         connector_manifest: ConnectorManifest,
     ) -> BQParser:
-        project = connector_manifest.config["project"]
-        sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
-        transform_names = (
+        project: str = connector_manifest.config["project"]
+        sanitizeTopics: str = connector_manifest.config.get("sanitizeTopics") or "false"
+
+        # Parse ALL transforms (original BigQuery logic)
+        transform_names: List[str] = (
             self.connector_manifest.config.get("transforms", "").split(",")
             if self.connector_manifest.config.get("transforms")
             else []
         )
-        transforms = []
+        transforms: List[Dict[str, str]] = []
         for name in transform_names:
-            transform = {"name": name}
+            transform: Dict[str, str] = {"name": name}
             transforms.append(transform)
             for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
@@ -203,8 +309,13 @@
                         self.connector_manifest.config[key]
                     )
 
+        # Create RegexRouterTransform instance for RegexRouter-specific handling
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )
+
         if "defaultDataset" in connector_manifest.config:
-            defaultDataset = connector_manifest.config["defaultDataset"]
+            defaultDataset: str = connector_manifest.config["defaultDataset"]
             return self.BQParser(
                 project=project,
                 defaultDataset=defaultDataset,
@@ -212,11 +323,14 @@
                 sanitizeTopics=sanitizeTopics.lower() == "true",
                 version="v2",
                 transforms=transforms,
+                regex_router=regex_router,
             )
         else:
             # version 1.6.x and similar configs supported
-            datasets = connector_manifest.config["datasets"]
-            topicsToTables = connector_manifest.config.get("topicsToTables")
+            datasets: str = connector_manifest.config["datasets"]
+            topicsToTables: Optional[str] = connector_manifest.config.get(
+                "topicsToTables"
+            )
 
             return self.BQParser(
                 project=project,
@@ -225,10 +339,11 @@
                 target_platform="bigquery",
                 sanitizeTopics=sanitizeTopics.lower() == "true",
                 transforms=transforms,
+                regex_router=regex_router,
             )
 
     def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
-        entries = property.split(",")
+        entries: List[str] = property.split(",")
         for entry in entries:
             key, val = entry.rsplit("=")
             yield (key.strip(), val.strip())
@@ -243,7 +358,7 @@
                 return dataset
         return None
 
-    def sanitize_table_name(self, table_name):
+    def sanitize_table_name(self, table_name: str) -> str:
         table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
         if re.match("^[^a-zA-Z_].*", table_name):
             table_name = "_" + table_name
@@ -254,8 +369,8 @@
         self, topic: str, parser: BQParser
     ) -> Optional[str]:
         if parser.version == "v2":
-            dataset = parser.defaultDataset
-            parts = topic.split(":")
+            dataset: Optional[str] = parser.defaultDataset
+            parts: List[str] = topic.split(":")
             if len(parts) == 2:
                 dataset = parts[0]
                 table = parts[1]
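In the v2 branch above, a topic of the form dataset:table overrides defaultDataset. A brief sketch with hypothetical values; only the two-segment case mirrors the hunk shown, and the fallback to the raw topic name is an assumption for illustration:

    from typing import List, Optional, Tuple

    def split_topic(topic: str, default_dataset: Optional[str]) -> Tuple[Optional[str], str]:
        dataset: Optional[str] = default_dataset
        parts: List[str] = topic.split(":")
        if len(parts) == 2:
            dataset, table = parts[0], parts[1]
        else:
            table = topic  # assumed fallback, not part of the hunk above
        return dataset, table

    print(split_topic("analytics:events", "staging"))  # -> ('analytics', 'events')
    print(split_topic("events", "staging"))            # -> ('staging', 'events')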
@@ -283,21 +398,9 @@
             table = self.sanitize_table_name(table)
         return f"{dataset}.{table}"
 
-    def apply_transformations(
-        self, topic: str, transforms: List[Dict[str, str]]
-    ) -> str:
-        for transform in transforms:
-            if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
-                regex = transform["regex"]
-                replacement = transform["replacement"]
-                pattern = re.compile(regex)
-                if pattern.match(topic):
-                    topic = pattern.sub(replacement, topic, count=1)
-        return topic
-
     def extract_flow_property_bag(self) -> Dict[str, str]:
         # Mask/Remove properties that may reveal credentials
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k not in ["keyfile"]
@@ -307,27 +410,33 @@
 
     def extract_lineages(self) -> List[KafkaConnectLineage]:
         lineages: List[KafkaConnectLineage] = list()
-        parser = self.get_parser(self.connector_manifest)
+        parser: BigQuerySinkConnector.BQParser = self.get_parser(
+            self.connector_manifest
+        )
         if not parser:
             return lineages
-        target_platform = parser.target_platform
-        project = parser.project
-        transforms = parser.transforms
+        target_platform: str = parser.target_platform
+        project: str = parser.project
 
         for topic in self.connector_manifest.topic_names:
-            transformed_topic = self.apply_transformations(topic, transforms)
-            dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
+            # Apply RegexRouter transformations using the RegexRouterTransform class
+            transformed_topic: str = parser.regex_router.apply_transforms(topic)
+
+            # Use the transformed topic to determine dataset/table
+            dataset_table: Optional[str] = self.get_dataset_table_for_topic(
+                transformed_topic, parser
+            )
             if dataset_table is None:
                 self.report.warning(
                     "Could not find target dataset for topic, please check your connector configuration"
                     f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
-            target_dataset = f"{project}.{dataset_table}"
+            target_dataset: str = f"{project}.{dataset_table}"
 
             lineages.append(
                 KafkaConnectLineage(
-                    source_dataset=transformed_topic,
+                    source_dataset=topic,  # Keep original topic as source
                     source_platform=KAFKA,
                     target_dataset=target_dataset,
                     target_platform=target_platform,
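Net effect of this hunk: the Kafka-side source_dataset stays the original topic name while the BigQuery target is still built from the transformed name; rc2 reported the transformed name on both sides. A hypothetical before/after for the Reroute example, shown as plain dicts rather than KafkaConnectLineage objects and assuming sanitizeTopics is off:

    topic = "db.server.orders"                     # original Kafka topic
    transformed_topic = "orders"                   # after RegexRouter
    project, dataset = "my-project", "mydataset"   # hypothetical parser values

    lineage_rc2 = {
        "source_dataset": transformed_topic,       # old behaviour
        "target_dataset": f"{project}.{dataset}.{transformed_topic}",
    }
    lineage_rc4 = {
        "source_dataset": topic,                   # keep original topic as source
        "target_dataset": f"{project}.{dataset}.{transformed_topic}",
    }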
datahub/ingestion/source/mock_data/datahub_mock_data.py
@@ -15,6 +15,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
     DataHubMockDataReport,
 )
@@ -211,15 +212,19 @@ class DataHubMockDataSource(Source):
         pattern = self.config.gen_1.subtype_pattern
 
         if pattern == SubTypePattern.ALTERNATING:
-            return "Table" if table_index % 2 == 0 else "View"
+            return (
+                DatasetSubTypes.TABLE if table_index % 2 == 0 else DatasetSubTypes.VIEW
+            )
         elif pattern == SubTypePattern.LEVEL_BASED:
-            return self.config.gen_1.level_subtypes.get(table_level, "Table")
+            return self.config.gen_1.level_subtypes.get(
+                table_level, DatasetSubTypes.TABLE
+            )
         elif pattern == SubTypePattern.ALL_TABLE:
-            return "Table"
+            return DatasetSubTypes.TABLE
         elif pattern == SubTypePattern.ALL_VIEW:
-            return "View"
+            return DatasetSubTypes.VIEW
         else:
-            return "Table"  # default
+            return DatasetSubTypes.TABLE  # default
 
     def _get_subtypes_aspect(
         self, table_name: str, table_level: int, table_index: int
@@ -261,11 +266,8 @@ class DataHubMockDataSource(Source):
             fan_out, hops, fan_out_after_first
         )
 
-        logger.info(
-            f"About to create {tables_to_be_created} tables for lineage testing"
-        )
+        logger.info(f"About to create {tables_to_be_created} datasets mock data")
 
-        current_progress = 0
         for i in range(hops + 1):
             tables_at_level = tables_at_levels[i]
 
@@ -286,12 +288,6 @@ class DataHubMockDataSource(Source):
                     tables_at_levels=tables_at_levels,
                 )
 
-                current_progress += 1
-                if current_progress % 1000 == 0:
-                    logger.info(
-                        f"Progress: {current_progress}/{tables_to_be_created} tables processed"
-                    )
-
     def _generate_lineage_for_table(
         self,
         table_name: str,
datahub/ingestion/source/slack/slack.py
@@ -23,6 +23,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -493,7 +494,7 @@ class SlackSource(StatefulIngestionSourceBase):
                 mcp=MetadataChangeProposalWrapper(
                     entityUrn=urn_channel,
                     aspect=SubTypesClass(
-                        typeNames=["Slack Channel"],
+                        typeNames=[DatasetSubTypes.SLACK_CHANNEL],
                    ),
                ),
            )
datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -127,6 +127,8 @@ class SnowflakeQueriesExtractorReport(Report):
     users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+
     sql_aggregator: Optional[SqlAggregatorReport] = None
 
     num_ddl_queries_dropped: int = 0
@@ -282,7 +284,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
             self.aggregator.add(query)
 
-        yield from auto_workunit(self.aggregator.gen_metadata())
+        with self.report.aggregator_generate_timer:
+            yield from auto_workunit(self.aggregator.gen_metadata())
 
     def fetch_users(self) -> UsersMapping:
         users: UsersMapping = dict()
@@ -660,6 +663,7 @@ class SnowflakeQueriesSource(Source):
     def close(self) -> None:
         self.connection.close()
         self.queries_extractor.close()
+        super().close()
 
 
 # Make sure we don't try to generate too much info for a single query.
datahub/ingestion/source/sql/sql_generic_profiler.py
@@ -57,10 +57,11 @@ class GenericProfiler:
         platform: Optional[str] = None,
         profiler_args: Optional[Dict] = None,
     ) -> Iterable[MetadataWorkUnit]:
+        # We don't run ge profiling queries if table profiling is enabled or if the row count is 0.
         ge_profile_requests: List[GEProfilerRequest] = [
             cast(GEProfilerRequest, request)
             for request in requests
-            if not request.profile_table_level_only
+            if not request.profile_table_level_only or request.table.rows_count == 0
         ]
         table_level_profile_requests: List[TableProfilerRequest] = [
             request for request in requests if request.profile_table_level_only
datahub/ingestion/source/sql/vertica.py
@@ -25,6 +25,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -497,7 +498,7 @@ class VerticaSource(SQLAlchemySource):
                 changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
                 aspectName="subTypes",
-                aspect=SubTypesClass(typeNames=["Projections"]),
+                aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
             ).as_workunit()
 
             if self.config.domain:
datahub/ingestion/source/unity/source.py
@@ -1020,29 +1020,45 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     ) -> Iterable[MetadataWorkUnit]:
         if self.ctx.graph and self.platform_resource_repository:
             for tag in tags:
-                platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
-                    platform_instance=self.platform_instance_name,
-                    platform_resource_repository=self.platform_resource_repository,
-                    tag=tag,
-                )
-                logger.debug(f"Created platform resource {platform_resource_id}")
+                try:
+                    platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
+                        platform_instance=self.platform_instance_name,
+                        platform_resource_repository=self.platform_resource_repository,
+                        tag=tag,
+                    )
+                    logger.debug(f"Created platform resource {platform_resource_id}")
 
-                unity_catalog_tag = UnityCatalogTagPlatformResource.get_from_datahub(
-                    platform_resource_id, self.platform_resource_repository, False
-                )
-                if (
-                    tag.to_datahub_tag_urn().urn()
-                    not in unity_catalog_tag.datahub_linked_resources().urns
-                ):
-                    unity_catalog_tag.datahub_linked_resources().add(
-                        tag.to_datahub_tag_urn().urn()
+                    unity_catalog_tag = (
+                        UnityCatalogTagPlatformResource.get_from_datahub(
+                            platform_resource_id,
+                            self.platform_resource_repository,
+                            False,
+                        )
                     )
-                    platform_resource = unity_catalog_tag.as_platform_resource()
-                    for mcp in platform_resource.to_mcps():
-                        yield MetadataWorkUnit(
-                            id=f"platform_resource-{platform_resource.id}",
-                            mcp=mcp,
+                    if (
+                        tag.to_datahub_tag_urn().urn()
+                        not in unity_catalog_tag.datahub_linked_resources().urns
+                    ):
+                        unity_catalog_tag.datahub_linked_resources().add(
+                            tag.to_datahub_tag_urn().urn()
                         )
+                        platform_resource = unity_catalog_tag.as_platform_resource()
+                        for mcp in platform_resource.to_mcps():
+                            yield MetadataWorkUnit(
+                                id=f"platform_resource-{platform_resource.id}",
+                                mcp=mcp,
+                            )
+                except Exception as e:
+                    logger.exception(
+                        f"Error processing platform resource for tag {tag}"
+                    )
+                    self.report.report_warning(
+                        message="Error processing platform resource for tag",
+                        context=str(tag),
+                        title="Error processing platform resource for tag",
+                        exc=e,
+                    )
+                    continue
 
     def _create_schema_metadata_aspect(
         self, table: Table
datahub/ingestion/transformer/add_dataset_ownership.py
@@ -71,8 +71,24 @@ class AddDatasetOwnership(OwnershipTransformer):
 
         server_ownership = graph.get_ownership(entity_urn=urn)
         if server_ownership:
-            owners = {owner.owner: owner for owner in server_ownership.owners}
-            owners.update({owner.owner: owner for owner in mce_ownership.owners})
+            owners = {
+                (
+                    owner.owner,
+                    owner.type,
+                    owner.typeUrn,
+                ): owner
+                for owner in server_ownership.owners
+            }
+            owners.update(
+                {
+                    (
+                        owner.owner,
+                        owner.type,
+                        owner.typeUrn,
+                    ): owner
+                    for owner in mce_ownership.owners
+                }
+            )
             mce_ownership.owners = list(owners.values())
 
         return mce_ownership
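The transformer now deduplicates owners on the (owner, type, typeUrn) triple instead of the owner URN alone, so the same corp user can be kept under two different ownership types. A small sketch of the merge using hypothetical stand-in records (plain dataclasses rather than the generated OwnershipClass/OwnerClass):

    from dataclasses import dataclass
    from typing import Dict, List, Optional, Tuple

    @dataclass
    class Owner:  # hypothetical stand-in for the generated OwnerClass
        owner: str
        type: str
        typeUrn: Optional[str] = None

    server_owners: List[Owner] = [Owner("urn:li:corpuser:alice", "TECHNICAL_OWNER")]
    mce_owners: List[Owner] = [Owner("urn:li:corpuser:alice", "BUSINESS_OWNER")]

    # Keyed on the (owner, type, typeUrn) triple, as in the rc4 transformer above.
    merged: Dict[Tuple[str, str, Optional[str]], Owner] = {
        (o.owner, o.type, o.typeUrn): o for o in server_owners
    }
    merged.update({(o.owner, o.type, o.typeUrn): o for o in mce_owners})

    print(len(merged))  # -> 2; keying on owner alone would collapse this to 1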