acryl-datahub 1.1.0.5rc3__py3-none-any.whl → 1.1.0.5rc5__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub may be problematic.

Files changed (52)
  1. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/METADATA +2575 -2575
  2. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/RECORD +52 -45
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +21 -4
  5. datahub/ingestion/api/decorators.py +14 -3
  6. datahub/ingestion/api/report.py +123 -2
  7. datahub/ingestion/api/source.py +45 -44
  8. datahub/ingestion/autogenerated/lineage_helper.py +193 -0
  9. datahub/ingestion/graph/client.py +71 -28
  10. datahub/ingestion/run/pipeline.py +6 -0
  11. datahub/ingestion/source/aws/glue.py +1 -1
  12. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  13. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  14. datahub/ingestion/source/bigquery_v2/queries.py +4 -4
  15. datahub/ingestion/source/common/subtypes.py +43 -0
  16. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  17. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  18. datahub/ingestion/source/hex/api.py +26 -1
  19. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  20. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
  21. datahub/ingestion/source/salesforce.py +6 -3
  22. datahub/ingestion/source/slack/slack.py +2 -1
  23. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -0
  24. datahub/ingestion/source/sql/athena.py +15 -3
  25. datahub/ingestion/source/sql/mssql/source.py +9 -0
  26. datahub/ingestion/source/sql/sql_common.py +3 -0
  27. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  28. datahub/ingestion/source/sql/teradata.py +4 -1
  29. datahub/ingestion/source/sql/vertica.py +9 -1
  30. datahub/ingestion/source/tableau/tableau.py +6 -1
  31. datahub/ingestion/source/unity/source.py +36 -20
  32. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  33. datahub/metadata/_internal_schema_classes.py +601 -0
  34. datahub/metadata/_urns/urn_defs.py +112 -0
  35. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  36. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  37. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  38. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  39. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  40. datahub/metadata/schema.avsc +383 -0
  41. datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
  42. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  43. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
  44. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  45. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  46. datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
  47. datahub/sdk/datajob.py +39 -15
  48. datahub/specific/dataproduct.py +4 -0
  49. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/licenses/LICENSE +0 -0
  52. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/fivetran/fivetran.py

@@ -1,8 +1,8 @@
  import logging
- from typing import Dict, Iterable, List, Optional
+ from typing import Dict, Iterable, List, Optional, Union

  import datahub.emitter.mce_builder as builder
- from datahub.api.entities.datajob import DataFlow, DataJob
+ from datahub.api.entities.datajob import DataJob as DataJobV1
  from datahub.api.entities.dataprocess.dataprocess_instance import (
  DataProcessInstance,
  InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
  FineGrainedLineageDownstreamType,
  FineGrainedLineageUpstreamType,
  )
- from datahub.utilities.urns.data_flow_urn import DataFlowUrn
- from datahub.utilities.urns.dataset_urn import DatasetUrn
+ from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+ from datahub.sdk.dataflow import DataFlow
+ from datahub.sdk.datajob import DataJob
+ from datahub.sdk.entity import Entity

  # Logger instance
  logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
  self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)

  def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
- input_dataset_urn_list: List[DatasetUrn] = []
- output_dataset_urn_list: List[DatasetUrn] = []
+ input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+ output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
  fine_grained_lineage: List[FineGrainedLineage] = []

  # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
  )
  )

- datajob.inlets.extend(input_dataset_urn_list)
- datajob.outlets.extend(output_dataset_urn_list)
- datajob.fine_grained_lineages.extend(fine_grained_lineage)
+ datajob.set_inlets(input_dataset_urn_list)
+ datajob.set_outlets(output_dataset_urn_list)
+ datajob.set_fine_grained_lineages(fine_grained_lineage)

  return dict(
  **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):

  def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
  return DataFlow(
- orchestrator=Constant.ORCHESTRATOR,
- id=connector.connector_id,
+ platform=Constant.ORCHESTRATOR,
+ name=connector.connector_id,
  env=self.config.env,
- name=connector.connector_name,
+ display_name=connector.connector_name,
  platform_instance=self.config.platform_instance,
  )

@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
  )
  owner_email = self.audit_log.get_user_email(connector.user_id)
  datajob = DataJob(
- id=connector.connector_id,
+ name=connector.connector_id,
  flow_urn=dataflow_urn,
  platform_instance=self.config.platform_instance,
- name=connector.connector_name,
- owners={owner_email} if owner_email else set(),
+ display_name=connector.connector_name,
+ owners=[CorpUserUrn(owner_email)] if owner_email else None,
  )

  # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
  "sync_frequency": str(connector.sync_frequency),
  "destination_id": connector.destination_id,
  }
- datajob.properties = {
- **connector_properties,
- **lineage_properties,
- }
+
+ datajob.set_custom_properties({**connector_properties, **lineage_properties})

  return datajob

  def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+ # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+ datajob_v1 = DataJobV1(
+ id=datajob.name,
+ flow_urn=datajob.flow_urn,
+ platform_instance=self.config.platform_instance,
+ name=datajob.name,
+ inlets=datajob.inlets,
+ outlets=datajob.outlets,
+ fine_grained_lineages=datajob.fine_grained_lineages,
+ )
  return DataProcessInstance.from_datajob(
- datajob=datajob,
+ datajob=datajob_v1,
  id=job.job_id,
  clone_inlets=True,
  clone_outlets=True,
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):

  def _get_connector_workunits(
  self, connector: Connector
- ) -> Iterable[MetadataWorkUnit]:
+ ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
  self.report.report_connectors_scanned()
  # Create dataflow entity with same name as connector name
  dataflow = self._generate_dataflow_from_connector(connector)
- for mcp in dataflow.generate_mcp():
- yield mcp.as_workunit()
+ yield dataflow

  # Map Fivetran's connector entity with Datahub's datajob entity
  datajob = self._generate_datajob_from_connector(connector)
- for mcp in datajob.generate_mcp(materialize_iolets=False):
- yield mcp.as_workunit()
+ yield datajob

  # Map Fivetran's job/sync history entity with Datahub's data process entity
  if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
  ).workunit_processor,
  ]

- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
  """
  Datahub Ingestion framework invoke this method
  """
datahub/ingestion/source/hex/api.py

@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union

  import requests
  from pydantic import BaseModel, Field, ValidationError, validator
+ from requests.adapters import HTTPAdapter
  from typing_extensions import assert_never
+ from urllib3.util.retry import Retry

  from datahub.ingestion.api.source import SourceReport
  from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
  self.base_url = base_url
  self.report = report
  self.page_size = page_size
+ self.session = self._create_retry_session()

  def _list_projects_url(self):
  return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
  def _auth_header(self):
  return {"Authorization": f"Bearer {self.token}"}

+ def _create_retry_session(self) -> requests.Session:
+ """Create a requests session with retry logic for rate limiting.
+
+ Hex API rate limit: 60 requests per minute
+ https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+ """
+ session = requests.Session()
+
+ # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+ retry_strategy = Retry(
+ total=5, # Maximum number of retries
+ status_forcelist=[429], # Only retry on 429 status code
+ backoff_factor=2, # Exponential backoff: 2, 4, 8, 16, 32 seconds
+ raise_on_status=True, # Raise exception after max retries
+ )
+
+ adapter = HTTPAdapter(max_retries=retry_strategy)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+
+ return session
+
  def fetch_projects(
  self,
  include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
  logger.debug(f"Fetching projects page with params: {params}")
  self.report.fetch_projects_page_calls += 1
  try:
- response = requests.get(
+ response = self.session.get(
  url=self._list_projects_url(),
  headers=self._auth_header(),
  params=params,
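
The hex/api.py change routes project-list calls through a requests.Session configured with urllib3's Retry, so HTTP 429 responses from the Hex API are retried with exponential backoff instead of failing immediately. The same pattern in isolation, against a placeholder endpoint (the URL below is illustrative, not the Hex API):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    retries = Retry(
        total=5,                 # give up after five retry attempts
        status_forcelist=[429],  # retry only on "Too Many Requests"
        backoff_factor=2,        # exponential backoff between attempts
        raise_on_status=True,    # raise once retries are exhausted
    )
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Every request made through this session is transparently retried on 429.
    response = session.get("https://api.example.com/v1/projects", timeout=30)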
datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -1,3 +1,4 @@
+ import logging
  import re
  from dataclasses import dataclass
  from typing import Dict, Iterable, List, Optional, Tuple
@@ -9,6 +10,81 @@ from datahub.ingestion.source.kafka_connect.common import (
  KafkaConnectLineage,
  )

+ logger = logging.getLogger(__name__)
+
+
+ class RegexRouterTransform:
+ """Helper class to handle RegexRouter transformations for topic/table names."""
+
+ def __init__(self, config: Dict[str, str]) -> None:
+ self.transforms = self._parse_transforms(config)
+
+ def _parse_transforms(self, config: Dict[str, str]) -> List[Dict[str, str]]:
+ """Parse transforms configuration from connector config."""
+ transforms_list: List[Dict[str, str]] = []
+
+ # Get the transforms parameter
+ transforms_param: str = config.get("transforms", "")
+ if not transforms_param:
+ return transforms_list
+
+ # Parse individual transforms
+ transform_names: List[str] = [
+ name.strip() for name in transforms_param.split(",")
+ ]
+
+ for transform_name in transform_names:
+ if not transform_name:
+ continue
+ transform_config: Dict[str, str] = {}
+ transform_prefix: str = f"transforms.{transform_name}."
+
+ # Extract transform configuration
+ for key, value in config.items():
+ if key.startswith(transform_prefix):
+ config_key: str = key[len(transform_prefix) :]
+ transform_config[config_key] = value
+
+ # Only process RegexRouter transforms
+ if (
+ transform_config.get("type")
+ == "org.apache.kafka.connect.transforms.RegexRouter"
+ ):
+ transform_config["name"] = transform_name
+ transforms_list.append(transform_config)
+
+ return transforms_list
+
+ def apply_transforms(self, topic_name: str) -> str:
+ """Apply RegexRouter transforms to the topic name using Java regex."""
+ result: str = topic_name
+
+ for transform in self.transforms:
+ regex_pattern: Optional[str] = transform.get("regex")
+ replacement: str = transform.get("replacement", "")
+
+ if regex_pattern:
+ try:
+ # Use Java Pattern and Matcher for exact Kafka Connect compatibility
+ from java.util.regex import Pattern
+
+ pattern = Pattern.compile(regex_pattern)
+ matcher = pattern.matcher(result)
+
+ if matcher.find():
+ # Reset matcher to beginning for replaceFirst
+ matcher.reset()
+ result = matcher.replaceFirst(replacement)
+ logger.debug(
+ f"Applied transform {transform['name']}: {topic_name} -> {result}"
+ )
+ except Exception as e:
+ logger.warning(
+ f"Invalid regex pattern in transform {transform['name']}: {e}"
+ )
+
+ return str(result)
+

  @dataclass
  class ConfluentS3SinkConnector(BaseConnector):
@@ -18,28 +94,35 @@ class ConfluentS3SinkConnector(BaseConnector):
  bucket: str
  topics_dir: str
  topics: Iterable[str]
+ regex_router: RegexRouterTransform

  def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
  # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
- bucket = connector_manifest.config.get("s3.bucket.name")
+ bucket: Optional[str] = connector_manifest.config.get("s3.bucket.name")
  if not bucket:
  raise ValueError(
  "Could not find 's3.bucket.name' in connector configuration"
  )

  # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
- topics_dir = connector_manifest.config.get("topics.dir", "topics")
+ topics_dir: str = connector_manifest.config.get("topics.dir", "topics")
+
+ # Create RegexRouterTransform instance
+ regex_router: RegexRouterTransform = RegexRouterTransform(
+ connector_manifest.config
+ )

  return self.S3SinkParser(
  target_platform="s3",
  bucket=bucket,
  topics_dir=topics_dir,
  topics=connector_manifest.topic_names,
+ regex_router=regex_router,
  )

  def extract_flow_property_bag(self) -> Dict[str, str]:
  # Mask/Remove properties that may reveal credentials
- flow_property_bag = {
+ flow_property_bag: Dict[str, str] = {
  k: v
  for k, v in self.connector_manifest.config.items()
  if k
@@ -54,11 +137,17 @@ class ConfluentS3SinkConnector(BaseConnector):

  def extract_lineages(self) -> List[KafkaConnectLineage]:
  try:
- parser = self._get_parser(self.connector_manifest)
+ parser: ConfluentS3SinkConnector.S3SinkParser = self._get_parser(
+ self.connector_manifest
+ )

  lineages: List[KafkaConnectLineage] = list()
  for topic in parser.topics:
- target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}"
+ # Apply RegexRouter transformations using the RegexRouterTransform class
+ transformed_topic: str = parser.regex_router.apply_transforms(topic)
+ target_dataset: str = (
+ f"{parser.bucket}/{parser.topics_dir}/{transformed_topic}"
+ )

  lineages.append(
  KafkaConnectLineage(
@@ -86,6 +175,7 @@ class SnowflakeSinkConnector(BaseConnector):
  database_name: str
  schema_name: str
  topics_to_tables: Dict[str, str]
+ regex_router: RegexRouterTransform

  def get_table_name_from_topic_name(self, topic_name: str) -> str:
  """
@@ -93,7 +183,7 @@ class SnowflakeSinkConnector(BaseConnector):
  Refer below link for more info
  https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
  """
- table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
+ table_name: str = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
  if re.match("^[^a-zA-Z_].*", table_name):
  table_name = "_" + table_name
  # Connector may append original topic's hash code as suffix for conflict resolution
@@ -106,8 +196,13 @@ class SnowflakeSinkConnector(BaseConnector):
  self,
  connector_manifest: ConnectorManifest,
  ) -> SnowflakeParser:
- database_name = connector_manifest.config["snowflake.database.name"]
- schema_name = connector_manifest.config["snowflake.schema.name"]
+ database_name: str = connector_manifest.config["snowflake.database.name"]
+ schema_name: str = connector_manifest.config["snowflake.schema.name"]
+
+ # Create RegexRouterTransform instance
+ regex_router: RegexRouterTransform = RegexRouterTransform(
+ connector_manifest.config
+ )

  # Fetch user provided topic to table map
  provided_topics_to_tables: Dict[str, str] = {}
@@ -121,24 +216,30 @@ class SnowflakeSinkConnector(BaseConnector):
  topics_to_tables: Dict[str, str] = {}
  # Extract lineage for only those topics whose data ingestion started
  for topic in connector_manifest.topic_names:
+ # Apply transforms first to get the transformed topic name
+ transformed_topic: str = regex_router.apply_transforms(topic)
+
  if topic in provided_topics_to_tables:
  # If user provided which table to get mapped with this topic
  topics_to_tables[topic] = provided_topics_to_tables[topic]
  else:
- # Else connector converts topic name to a valid Snowflake table name.
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic)
+ # Use the transformed topic name to generate table name
+ topics_to_tables[topic] = self.get_table_name_from_topic_name(
+ transformed_topic
+ )

  return self.SnowflakeParser(
  database_name=database_name,
  schema_name=schema_name,
  topics_to_tables=topics_to_tables,
+ regex_router=regex_router,
  )

  def extract_flow_property_bag(self) -> Dict[str, str]:
  # For all snowflake sink connector properties, refer below link
  # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
  # remove private keys, secrets from properties
- flow_property_bag = {
+ flow_property_bag: Dict[str, str] = {
  k: v
  for k, v in self.connector_manifest.config.items()
  if k
@@ -153,10 +254,12 @@ class SnowflakeSinkConnector(BaseConnector):

  def extract_lineages(self) -> List[KafkaConnectLineage]:
  lineages: List[KafkaConnectLineage] = list()
- parser = self.get_parser(self.connector_manifest)
+ parser: SnowflakeSinkConnector.SnowflakeParser = self.get_parser(
+ self.connector_manifest
+ )

  for topic, table in parser.topics_to_tables.items():
- target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
+ target_dataset: str = f"{parser.database_name}.{parser.schema_name}.{table}"
  lineages.append(
  KafkaConnectLineage(
  source_dataset=topic,
@@ -176,7 +279,8 @@ class BigQuerySinkConnector(BaseConnector):
  project: str
  target_platform: str
  sanitizeTopics: bool
- transforms: list
+ transforms: List[Dict[str, str]]
+ regex_router: RegexRouterTransform
  topicsToTables: Optional[str] = None
  datasets: Optional[str] = None
  defaultDataset: Optional[str] = None
@@ -186,16 +290,18 @@ class BigQuerySinkConnector(BaseConnector):
  self,
  connector_manifest: ConnectorManifest,
  ) -> BQParser:
- project = connector_manifest.config["project"]
- sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
- transform_names = (
+ project: str = connector_manifest.config["project"]
+ sanitizeTopics: str = connector_manifest.config.get("sanitizeTopics") or "false"
+
+ # Parse ALL transforms (original BigQuery logic)
+ transform_names: List[str] = (
  self.connector_manifest.config.get("transforms", "").split(",")
  if self.connector_manifest.config.get("transforms")
  else []
  )
- transforms = []
+ transforms: List[Dict[str, str]] = []
  for name in transform_names:
- transform = {"name": name}
+ transform: Dict[str, str] = {"name": name}
  transforms.append(transform)
  for key in self.connector_manifest.config:
  if key.startswith(f"transforms.{name}."):
@@ -203,8 +309,13 @@ class BigQuerySinkConnector(BaseConnector):
  self.connector_manifest.config[key]
  )

+ # Create RegexRouterTransform instance for RegexRouter-specific handling
+ regex_router: RegexRouterTransform = RegexRouterTransform(
+ connector_manifest.config
+ )
+
  if "defaultDataset" in connector_manifest.config:
- defaultDataset = connector_manifest.config["defaultDataset"]
+ defaultDataset: str = connector_manifest.config["defaultDataset"]
  return self.BQParser(
  project=project,
  defaultDataset=defaultDataset,
@@ -212,11 +323,14 @@ class BigQuerySinkConnector(BaseConnector):
  sanitizeTopics=sanitizeTopics.lower() == "true",
  version="v2",
  transforms=transforms,
+ regex_router=regex_router,
  )
  else:
  # version 1.6.x and similar configs supported
- datasets = connector_manifest.config["datasets"]
- topicsToTables = connector_manifest.config.get("topicsToTables")
+ datasets: str = connector_manifest.config["datasets"]
+ topicsToTables: Optional[str] = connector_manifest.config.get(
+ "topicsToTables"
+ )

  return self.BQParser(
  project=project,
@@ -225,10 +339,11 @@ class BigQuerySinkConnector(BaseConnector):
  target_platform="bigquery",
  sanitizeTopics=sanitizeTopics.lower() == "true",
  transforms=transforms,
+ regex_router=regex_router,
  )

  def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
- entries = property.split(",")
+ entries: List[str] = property.split(",")
  for entry in entries:
  key, val = entry.rsplit("=")
  yield (key.strip(), val.strip())
@@ -243,7 +358,7 @@ class BigQuerySinkConnector(BaseConnector):
  return dataset
  return None

- def sanitize_table_name(self, table_name):
+ def sanitize_table_name(self, table_name: str) -> str:
  table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
  if re.match("^[^a-zA-Z_].*", table_name):
  table_name = "_" + table_name
@@ -254,8 +369,8 @@ class BigQuerySinkConnector(BaseConnector):
  self, topic: str, parser: BQParser
  ) -> Optional[str]:
  if parser.version == "v2":
- dataset = parser.defaultDataset
- parts = topic.split(":")
+ dataset: Optional[str] = parser.defaultDataset
+ parts: List[str] = topic.split(":")
  if len(parts) == 2:
  dataset = parts[0]
  table = parts[1]
@@ -283,21 +398,9 @@ class BigQuerySinkConnector(BaseConnector):
  table = self.sanitize_table_name(table)
  return f"{dataset}.{table}"

- def apply_transformations(
- self, topic: str, transforms: List[Dict[str, str]]
- ) -> str:
- for transform in transforms:
- if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
- regex = transform["regex"]
- replacement = transform["replacement"]
- pattern = re.compile(regex)
- if pattern.match(topic):
- topic = pattern.sub(replacement, topic, count=1)
- return topic
-
  def extract_flow_property_bag(self) -> Dict[str, str]:
  # Mask/Remove properties that may reveal credentials
- flow_property_bag = {
+ flow_property_bag: Dict[str, str] = {
  k: v
  for k, v in self.connector_manifest.config.items()
  if k not in ["keyfile"]
@@ -307,27 +410,33 @@ class BigQuerySinkConnector(BaseConnector):

  def extract_lineages(self) -> List[KafkaConnectLineage]:
  lineages: List[KafkaConnectLineage] = list()
- parser = self.get_parser(self.connector_manifest)
+ parser: BigQuerySinkConnector.BQParser = self.get_parser(
+ self.connector_manifest
+ )
  if not parser:
  return lineages
- target_platform = parser.target_platform
- project = parser.project
- transforms = parser.transforms
+ target_platform: str = parser.target_platform
+ project: str = parser.project

  for topic in self.connector_manifest.topic_names:
- transformed_topic = self.apply_transformations(topic, transforms)
- dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
+ # Apply RegexRouter transformations using the RegexRouterTransform class
+ transformed_topic: str = parser.regex_router.apply_transforms(topic)
+
+ # Use the transformed topic to determine dataset/table
+ dataset_table: Optional[str] = self.get_dataset_table_for_topic(
+ transformed_topic, parser
+ )
  if dataset_table is None:
  self.report.warning(
  "Could not find target dataset for topic, please check your connector configuration"
  f"{self.connector_manifest.name} : {transformed_topic} ",
  )
  continue
- target_dataset = f"{project}.{dataset_table}"
+ target_dataset: str = f"{project}.{dataset_table}"

  lineages.append(
  KafkaConnectLineage(
- source_dataset=transformed_topic,
+ source_dataset=topic, # Keep original topic as source
  source_platform=KAFKA,
  target_dataset=target_dataset,
  target_platform=target_platform,
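
The sink_connectors.py hunks introduce RegexRouterTransform so that S3, Snowflake, and BigQuery sink lineage is computed against the topic name after any Kafka Connect RegexRouter SMT has been applied (the class's java.util.regex import appears to assume a JVM-backed runtime; under CPython the except block would log a warning and leave the topic unchanged). A minimal Python-re approximation of the replaceFirst behaviour it emulates, with an illustrative transforms.* config rather than one from the package:

    import re

    # Illustrative connector config using the RegexRouter SMT naming convention.
    config = {
        "transforms": "route",
        "transforms.route.type": "org.apache.kafka.connect.transforms.RegexRouter",
        "transforms.route.regex": "(.*)-dev",
        "transforms.route.replacement": "$1",
    }

    def apply_regex_router(topic: str, regex: str, replacement: str) -> str:
        # Approximates Java's Matcher.replaceFirst: rewrite only the first match,
        # translating Java-style $1 group references to Python's \1 syntax.
        return re.sub(regex, replacement.replace("$", "\\"), topic, count=1)

    renamed = apply_regex_router(
        "orders-dev",
        config["transforms.route.regex"],
        config["transforms.route.replacement"],
    )
    # renamed == "orders": the sink connectors above would then build lineage
    # against the renamed topic (e.g. bucket/topics/orders) instead of orders-dev.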
datahub/ingestion/source/mock_data/datahub_mock_data.py

@@ -15,6 +15,7 @@ from datahub.ingestion.api.decorators import (
  )
  from datahub.ingestion.api.source import Source, SourceReport
  from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.common.subtypes import DatasetSubTypes
  from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
  DataHubMockDataReport,
  )
@@ -211,15 +212,19 @@ class DataHubMockDataSource(Source):
  pattern = self.config.gen_1.subtype_pattern

  if pattern == SubTypePattern.ALTERNATING:
- return "Table" if table_index % 2 == 0 else "View"
+ return (
+ DatasetSubTypes.TABLE if table_index % 2 == 0 else DatasetSubTypes.VIEW
+ )
  elif pattern == SubTypePattern.LEVEL_BASED:
- return self.config.gen_1.level_subtypes.get(table_level, "Table")
+ return self.config.gen_1.level_subtypes.get(
+ table_level, DatasetSubTypes.TABLE
+ )
  elif pattern == SubTypePattern.ALL_TABLE:
- return "Table"
+ return DatasetSubTypes.TABLE
  elif pattern == SubTypePattern.ALL_VIEW:
- return "View"
+ return DatasetSubTypes.VIEW
  else:
- return "Table" # default
+ return DatasetSubTypes.TABLE # default

  def _get_subtypes_aspect(
  self, table_name: str, table_level: int, table_index: int
@@ -261,11 +266,8 @@ class DataHubMockDataSource(Source):
  fan_out, hops, fan_out_after_first
  )

- logger.info(
- f"About to create {tables_to_be_created} tables for lineage testing"
- )
+ logger.info(f"About to create {tables_to_be_created} datasets mock data")

- current_progress = 0
  for i in range(hops + 1):
  tables_at_level = tables_at_levels[i]

@@ -286,12 +288,6 @@ class DataHubMockDataSource(Source):
  tables_at_levels=tables_at_levels,
  )

- current_progress += 1
- if current_progress % 1000 == 0:
- logger.info(
- f"Progress: {current_progress}/{tables_to_be_created} tables processed"
- )
-
  def _generate_lineage_for_table(
  self,
  table_name: str,
datahub/ingestion/source/salesforce.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.decorators import (
  )
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
  from datahub.ingestion.api.workunit import MetadataWorkUnit
- from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+ from datahub.ingestion.source.common.subtypes import (
+ DatasetSubTypes,
+ SourceCapabilityModifier,
+ )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
  StaleEntityRemovalHandler,
  StaleEntityRemovalSourceReport,
@@ -532,11 +535,11 @@ class SalesforceApi:
  @capability(
  capability_name=SourceCapability.DATA_PROFILING,
  description="Only table level profiling is supported via `profiling.enabled` config field",
+ subtype_modifier=[SourceCapabilityModifier.TABLE],
  )
  @capability(
  capability_name=SourceCapability.DELETION_DETECTION,
- description="Not supported yet",
- supported=False,
+ description="Enabled by default via stateful ingestion",
  )
  @capability(
  capability_name=SourceCapability.SCHEMA_METADATA,
datahub/ingestion/source/slack/slack.py

@@ -23,6 +23,7 @@ from datahub.ingestion.api.source import (
  SourceReport,
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.common.subtypes import DatasetSubTypes
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
  StaleEntityRemovalHandler,
  StaleEntityRemovalSourceReport,
@@ -493,7 +494,7 @@ class SlackSource(StatefulIngestionSourceBase):
  mcp=MetadataChangeProposalWrapper(
  entityUrn=urn_channel,
  aspect=SubTypesClass(
- typeNames=["Slack Channel"],
+ typeNames=[DatasetSubTypes.SLACK_CHANNEL],
  ),
  ),
  )