acryl-datahub 1.1.0.5rc3__py3-none-any.whl → 1.1.0.5rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/METADATA +2575 -2575
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/RECORD +52 -45
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +21 -4
- datahub/ingestion/api/decorators.py +14 -3
- datahub/ingestion/api/report.py +123 -2
- datahub/ingestion/api/source.py +45 -44
- datahub/ingestion/autogenerated/lineage_helper.py +193 -0
- datahub/ingestion/graph/client.py +71 -28
- datahub/ingestion/run/pipeline.py +6 -0
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +4 -4
- datahub/ingestion/source/common/subtypes.py +43 -0
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +1 -0
- datahub/ingestion/source/sql/athena.py +15 -3
- datahub/ingestion/source/sql/mssql/source.py +9 -0
- datahub/ingestion/source/sql/sql_common.py +3 -0
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -1
- datahub/ingestion/source/sql/vertica.py +9 -1
- datahub/ingestion/source/tableau/tableau.py +6 -1
- datahub/ingestion/source/unity/source.py +36 -20
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/metadata/_internal_schema_classes.py +601 -0
- datahub/metadata/_urns/urn_defs.py +112 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +383 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
- datahub/sdk/datajob.py +39 -15
- datahub/specific/dataproduct.py +4 -0
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/fivetran/fivetran.py

@@ -1,8 +1,8 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union

 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
 )
-from datahub.
-from datahub.
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.entity import Entity

 # Logger instance
 logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)

     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []

         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
             )
         )

-        datajob.
-        datajob.
-        datajob.
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)

         return dict(
             **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-
-
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )

@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-
-            owners=
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )

         # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-
-
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})

         return datajob

     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):

     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield dataflow

         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-
-        yield mcp.as_workunit()
+        yield datajob

         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """
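The Fivetran hunks above move the source onto the SDK entity classes and yield them directly from `get_workunits_internal()` instead of hand-building MCP workunits. A minimal sketch of that pattern follows; the keyword arguments mirror the calls visible in the diff, while the connector name, dataset URNs, and the three-part `DataFlowUrn` constructor are illustrative assumptions rather than code from the package.

```python
# Illustrative sketch only; names and URNs are invented.
from datahub.metadata.urns import DataFlowUrn
from datahub.sdk.dataflow import DataFlow
from datahub.sdk.datajob import DataJob

dataflow = DataFlow(
    platform="fivetran",        # hypothetical orchestrator/platform name
    name="my_connector_id",     # hypothetical connector id
    env="PROD",
    display_name="My Connector",
)

datajob = DataJob(
    name="my_connector_id",
    # assumed (orchestrator, flow_id, cluster) argument order
    flow_urn=DataFlowUrn("fivetran", "my_connector_id", "PROD"),
    display_name="My Connector",
)

# The diff widens inlets/outlets to accept plain URN strings as well as DatasetUrn.
datajob.set_inlets(
    ["urn:li:dataset:(urn:li:dataPlatform:postgres,db.schema.src_table,PROD)"]
)
datajob.set_outlets(
    ["urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.dst_table,PROD)"]
)

# A source can now yield these Entity objects alongside regular MetadataWorkUnits.
```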
datahub/ingestion/source/hex/api.py

@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union

 import requests
 from pydantic import BaseModel, Field, ValidationError, validator
+from requests.adapters import HTTPAdapter
 from typing_extensions import assert_never
+from urllib3.util.retry import Retry

 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
         self.base_url = base_url
         self.report = report
         self.page_size = page_size
+        self.session = self._create_retry_session()

     def _list_projects_url(self):
         return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
     def _auth_header(self):
         return {"Authorization": f"Bearer {self.token}"}

+    def _create_retry_session(self) -> requests.Session:
+        """Create a requests session with retry logic for rate limiting.
+
+        Hex API rate limit: 60 requests per minute
+        https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+        """
+        session = requests.Session()
+
+        # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+        retry_strategy = Retry(
+            total=5,  # Maximum number of retries
+            status_forcelist=[429],  # Only retry on 429 status code
+            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+            raise_on_status=True,  # Raise exception after max retries
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+
+        return session
+
     def fetch_projects(
         self,
         include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
             logger.debug(f"Fetching projects page with params: {params}")
             self.report.fetch_projects_page_calls += 1
             try:
-                response =
+                response = self.session.get(
                     url=self._list_projects_url(),
                     headers=self._auth_header(),
                     params=params,
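For reference, the retry pattern introduced above can be reproduced standalone with stock requests/urllib3. The sketch below is not taken from the package; the URL and token are placeholders.

```python
# Minimal sketch: a requests Session that retries HTTP 429 with exponential backoff.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_retry_session(max_retries: int = 5, backoff_factor: float = 2.0) -> requests.Session:
    session = requests.Session()
    retry = Retry(
        total=max_retries,
        status_forcelist=[429],   # retry only on "Too Many Requests"
        backoff_factor=backoff_factor,
        raise_on_status=True,     # raise once the retry budget is exhausted
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


session = make_retry_session()
response = session.get(
    "https://app.hex.tech/api/v1/projects",       # placeholder URL
    headers={"Authorization": "Bearer <token>"},  # placeholder token
    timeout=30,
)
```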
datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -1,3 +1,4 @@
+import logging
 import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Tuple
@@ -9,6 +10,81 @@ from datahub.ingestion.source.kafka_connect.common import (
     KafkaConnectLineage,
 )

+logger = logging.getLogger(__name__)
+
+
+class RegexRouterTransform:
+    """Helper class to handle RegexRouter transformations for topic/table names."""
+
+    def __init__(self, config: Dict[str, str]) -> None:
+        self.transforms = self._parse_transforms(config)
+
+    def _parse_transforms(self, config: Dict[str, str]) -> List[Dict[str, str]]:
+        """Parse transforms configuration from connector config."""
+        transforms_list: List[Dict[str, str]] = []
+
+        # Get the transforms parameter
+        transforms_param: str = config.get("transforms", "")
+        if not transforms_param:
+            return transforms_list
+
+        # Parse individual transforms
+        transform_names: List[str] = [
+            name.strip() for name in transforms_param.split(",")
+        ]
+
+        for transform_name in transform_names:
+            if not transform_name:
+                continue
+            transform_config: Dict[str, str] = {}
+            transform_prefix: str = f"transforms.{transform_name}."
+
+            # Extract transform configuration
+            for key, value in config.items():
+                if key.startswith(transform_prefix):
+                    config_key: str = key[len(transform_prefix) :]
+                    transform_config[config_key] = value
+
+            # Only process RegexRouter transforms
+            if (
+                transform_config.get("type")
+                == "org.apache.kafka.connect.transforms.RegexRouter"
+            ):
+                transform_config["name"] = transform_name
+                transforms_list.append(transform_config)
+
+        return transforms_list
+
+    def apply_transforms(self, topic_name: str) -> str:
+        """Apply RegexRouter transforms to the topic name using Java regex."""
+        result: str = topic_name
+
+        for transform in self.transforms:
+            regex_pattern: Optional[str] = transform.get("regex")
+            replacement: str = transform.get("replacement", "")
+
+            if regex_pattern:
+                try:
+                    # Use Java Pattern and Matcher for exact Kafka Connect compatibility
+                    from java.util.regex import Pattern
+
+                    pattern = Pattern.compile(regex_pattern)
+                    matcher = pattern.matcher(result)
+
+                    if matcher.find():
+                        # Reset matcher to beginning for replaceFirst
+                        matcher.reset()
+                        result = matcher.replaceFirst(replacement)
+                        logger.debug(
+                            f"Applied transform {transform['name']}: {topic_name} -> {result}"
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Invalid regex pattern in transform {transform['name']}: {e}"
+                    )
+
+        return str(result)
+

 @dataclass
 class ConfluentS3SinkConnector(BaseConnector):
@@ -18,28 +94,35 @@ class ConfluentS3SinkConnector(BaseConnector):
         bucket: str
         topics_dir: str
         topics: Iterable[str]
+        regex_router: RegexRouterTransform

     def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
-        bucket = connector_manifest.config.get("s3.bucket.name")
+        bucket: Optional[str] = connector_manifest.config.get("s3.bucket.name")
         if not bucket:
             raise ValueError(
                 "Could not find 's3.bucket.name' in connector configuration"
             )

         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
-        topics_dir = connector_manifest.config.get("topics.dir", "topics")
+        topics_dir: str = connector_manifest.config.get("topics.dir", "topics")
+
+        # Create RegexRouterTransform instance
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )

         return self.S3SinkParser(
             target_platform="s3",
             bucket=bucket,
             topics_dir=topics_dir,
             topics=connector_manifest.topic_names,
+            regex_router=regex_router,
         )

     def extract_flow_property_bag(self) -> Dict[str, str]:
         # Mask/Remove properties that may reveal credentials
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k
@@ -54,11 +137,17 @@ class ConfluentS3SinkConnector(BaseConnector):

     def extract_lineages(self) -> List[KafkaConnectLineage]:
         try:
-            parser = self._get_parser(
+            parser: ConfluentS3SinkConnector.S3SinkParser = self._get_parser(
+                self.connector_manifest
+            )

             lineages: List[KafkaConnectLineage] = list()
             for topic in parser.topics:
-
+                # Apply RegexRouter transformations using the RegexRouterTransform class
+                transformed_topic: str = parser.regex_router.apply_transforms(topic)
+                target_dataset: str = (
+                    f"{parser.bucket}/{parser.topics_dir}/{transformed_topic}"
+                )

                 lineages.append(
                     KafkaConnectLineage(
@@ -86,6 +175,7 @@ class SnowflakeSinkConnector(BaseConnector):
         database_name: str
         schema_name: str
         topics_to_tables: Dict[str, str]
+        regex_router: RegexRouterTransform

     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
@@ -93,7 +183,7 @@ class SnowflakeSinkConnector(BaseConnector):
         Refer below link for more info
         https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
         """
-        table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
+        table_name: str = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
         if re.match("^[^a-zA-Z_].*", table_name):
             table_name = "_" + table_name
         # Connector may append original topic's hash code as suffix for conflict resolution
@@ -106,8 +196,13 @@ class SnowflakeSinkConnector(BaseConnector):
         self,
         connector_manifest: ConnectorManifest,
     ) -> SnowflakeParser:
-        database_name = connector_manifest.config["snowflake.database.name"]
-        schema_name = connector_manifest.config["snowflake.schema.name"]
+        database_name: str = connector_manifest.config["snowflake.database.name"]
+        schema_name: str = connector_manifest.config["snowflake.schema.name"]
+
+        # Create RegexRouterTransform instance
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )

         # Fetch user provided topic to table map
         provided_topics_to_tables: Dict[str, str] = {}
@@ -121,24 +216,30 @@ class SnowflakeSinkConnector(BaseConnector):
         topics_to_tables: Dict[str, str] = {}
         # Extract lineage for only those topics whose data ingestion started
         for topic in connector_manifest.topic_names:
+            # Apply transforms first to get the transformed topic name
+            transformed_topic: str = regex_router.apply_transforms(topic)
+
             if topic in provided_topics_to_tables:
                 # If user provided which table to get mapped with this topic
                 topics_to_tables[topic] = provided_topics_to_tables[topic]
             else:
-                #
-                topics_to_tables[topic] = self.get_table_name_from_topic_name(
+                # Use the transformed topic name to generate table name
+                topics_to_tables[topic] = self.get_table_name_from_topic_name(
+                    transformed_topic
+                )

         return self.SnowflakeParser(
             database_name=database_name,
             schema_name=schema_name,
             topics_to_tables=topics_to_tables,
+            regex_router=regex_router,
         )

     def extract_flow_property_bag(self) -> Dict[str, str]:
         # For all snowflake sink connector properties, refer below link
         # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
         # remove private keys, secrets from properties
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k
@@ -153,10 +254,12 @@ class SnowflakeSinkConnector(BaseConnector):

     def extract_lineages(self) -> List[KafkaConnectLineage]:
         lineages: List[KafkaConnectLineage] = list()
-        parser = self.get_parser(
+        parser: SnowflakeSinkConnector.SnowflakeParser = self.get_parser(
+            self.connector_manifest
+        )

         for topic, table in parser.topics_to_tables.items():
-            target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
+            target_dataset: str = f"{parser.database_name}.{parser.schema_name}.{table}"
             lineages.append(
                 KafkaConnectLineage(
                     source_dataset=topic,
@@ -176,7 +279,8 @@ class BigQuerySinkConnector(BaseConnector):
         project: str
         target_platform: str
         sanitizeTopics: bool
-        transforms:
+        transforms: List[Dict[str, str]]
+        regex_router: RegexRouterTransform
         topicsToTables: Optional[str] = None
         datasets: Optional[str] = None
         defaultDataset: Optional[str] = None
@@ -186,16 +290,18 @@ class BigQuerySinkConnector(BaseConnector):
         self,
         connector_manifest: ConnectorManifest,
     ) -> BQParser:
-        project = connector_manifest.config["project"]
-        sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
-
+        project: str = connector_manifest.config["project"]
+        sanitizeTopics: str = connector_manifest.config.get("sanitizeTopics") or "false"
+
+        # Parse ALL transforms (original BigQuery logic)
+        transform_names: List[str] = (
             self.connector_manifest.config.get("transforms", "").split(",")
             if self.connector_manifest.config.get("transforms")
             else []
         )
-        transforms = []
+        transforms: List[Dict[str, str]] = []
         for name in transform_names:
-            transform = {"name": name}
+            transform: Dict[str, str] = {"name": name}
             transforms.append(transform)
             for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
@@ -203,8 +309,13 @@ class BigQuerySinkConnector(BaseConnector):
                         self.connector_manifest.config[key]
                     )

+        # Create RegexRouterTransform instance for RegexRouter-specific handling
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )
+
         if "defaultDataset" in connector_manifest.config:
-            defaultDataset = connector_manifest.config["defaultDataset"]
+            defaultDataset: str = connector_manifest.config["defaultDataset"]
             return self.BQParser(
                 project=project,
                 defaultDataset=defaultDataset,
@@ -212,11 +323,14 @@ class BigQuerySinkConnector(BaseConnector):
                 sanitizeTopics=sanitizeTopics.lower() == "true",
                 version="v2",
                 transforms=transforms,
+                regex_router=regex_router,
             )
         else:
             # version 1.6.x and similar configs supported
-            datasets = connector_manifest.config["datasets"]
-            topicsToTables = connector_manifest.config.get(
+            datasets: str = connector_manifest.config["datasets"]
+            topicsToTables: Optional[str] = connector_manifest.config.get(
+                "topicsToTables"
+            )

             return self.BQParser(
                 project=project,
@@ -225,10 +339,11 @@ class BigQuerySinkConnector(BaseConnector):
                 target_platform="bigquery",
                 sanitizeTopics=sanitizeTopics.lower() == "true",
                 transforms=transforms,
+                regex_router=regex_router,
             )

     def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
-        entries = property.split(",")
+        entries: List[str] = property.split(",")
         for entry in entries:
             key, val = entry.rsplit("=")
             yield (key.strip(), val.strip())
@@ -243,7 +358,7 @@ class BigQuerySinkConnector(BaseConnector):
                 return dataset
         return None

-    def sanitize_table_name(self, table_name):
+    def sanitize_table_name(self, table_name: str) -> str:
         table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
         if re.match("^[^a-zA-Z_].*", table_name):
             table_name = "_" + table_name
@@ -254,8 +369,8 @@ class BigQuerySinkConnector(BaseConnector):
         self, topic: str, parser: BQParser
     ) -> Optional[str]:
         if parser.version == "v2":
-            dataset = parser.defaultDataset
-            parts = topic.split(":")
+            dataset: Optional[str] = parser.defaultDataset
+            parts: List[str] = topic.split(":")
             if len(parts) == 2:
                 dataset = parts[0]
                 table = parts[1]
@@ -283,21 +398,9 @@ class BigQuerySinkConnector(BaseConnector):
             table = self.sanitize_table_name(table)
         return f"{dataset}.{table}"

-    def apply_transformations(
-        self, topic: str, transforms: List[Dict[str, str]]
-    ) -> str:
-        for transform in transforms:
-            if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
-                regex = transform["regex"]
-                replacement = transform["replacement"]
-                pattern = re.compile(regex)
-                if pattern.match(topic):
-                    topic = pattern.sub(replacement, topic, count=1)
-        return topic
-
     def extract_flow_property_bag(self) -> Dict[str, str]:
         # Mask/Remove properties that may reveal credentials
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k not in ["keyfile"]
@@ -307,27 +410,33 @@ class BigQuerySinkConnector(BaseConnector):

     def extract_lineages(self) -> List[KafkaConnectLineage]:
         lineages: List[KafkaConnectLineage] = list()
-        parser = self.get_parser(
+        parser: BigQuerySinkConnector.BQParser = self.get_parser(
+            self.connector_manifest
+        )
         if not parser:
             return lineages
-        target_platform = parser.target_platform
-        project = parser.project
-        transforms = parser.transforms
+        target_platform: str = parser.target_platform
+        project: str = parser.project

         for topic in self.connector_manifest.topic_names:
-
-
+            # Apply RegexRouter transformations using the RegexRouterTransform class
+            transformed_topic: str = parser.regex_router.apply_transforms(topic)
+
+            # Use the transformed topic to determine dataset/table
+            dataset_table: Optional[str] = self.get_dataset_table_for_topic(
+                transformed_topic, parser
+            )
             if dataset_table is None:
                 self.report.warning(
                     "Could not find target dataset for topic, please check your connector configuration"
                     f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
-            target_dataset = f"{project}.{dataset_table}"
+            target_dataset: str = f"{project}.{dataset_table}"

             lineages.append(
                 KafkaConnectLineage(
-                    source_dataset=
+                    source_dataset=topic,  # Keep original topic as source
                     source_platform=KAFKA,
                     target_dataset=target_dataset,
                     target_platform=target_platform,
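For readers unfamiliar with RegexRouter, the sketch below shows a hypothetical connector configuration and an approximate pure-Python application of it. The new RegexRouterTransform helper above deliberately goes through java.util.regex to match Kafka Connect's semantics exactly, so this is illustrative only and the config values are invented.

```python
# Illustrative approximation of RegexRouter topic renaming using Python's re.
import re
from typing import Dict

connector_config: Dict[str, str] = {
    "transforms": "route",
    "transforms.route.type": "org.apache.kafka.connect.transforms.RegexRouter",
    "transforms.route.regex": "dev\\.(.*)",
    "transforms.route.replacement": "prod.$1",
}


def apply_regex_router(config: Dict[str, str], topic: str) -> str:
    for name in (n.strip() for n in config.get("transforms", "").split(",") if n.strip()):
        prefix = f"transforms.{name}."
        if config.get(prefix + "type") != "org.apache.kafka.connect.transforms.RegexRouter":
            continue
        pattern = config[prefix + "regex"]
        # Translate Java-style $1 backreferences to Python's \1
        replacement = re.sub(r"\$(\d+)", r"\\\1", config.get(prefix + "replacement", ""))
        topic = re.sub(pattern, replacement, topic, count=1)
    return topic


print(apply_regex_router(connector_config, "dev.orders"))  # -> "prod.orders"
```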
datahub/ingestion/source/mock_data/datahub_mock_data.py

@@ -15,6 +15,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
     DataHubMockDataReport,
 )
@@ -211,15 +212,19 @@ class DataHubMockDataSource(Source):
         pattern = self.config.gen_1.subtype_pattern

         if pattern == SubTypePattern.ALTERNATING:
-            return
+            return (
+                DatasetSubTypes.TABLE if table_index % 2 == 0 else DatasetSubTypes.VIEW
+            )
         elif pattern == SubTypePattern.LEVEL_BASED:
-            return self.config.gen_1.level_subtypes.get(
+            return self.config.gen_1.level_subtypes.get(
+                table_level, DatasetSubTypes.TABLE
+            )
         elif pattern == SubTypePattern.ALL_TABLE:
-            return
+            return DatasetSubTypes.TABLE
         elif pattern == SubTypePattern.ALL_VIEW:
-            return
+            return DatasetSubTypes.VIEW
         else:
-            return
+            return DatasetSubTypes.TABLE  # default

     def _get_subtypes_aspect(
         self, table_name: str, table_level: int, table_index: int
@@ -261,11 +266,8 @@ class DataHubMockDataSource(Source):
             fan_out, hops, fan_out_after_first
         )

-        logger.info(
-            f"About to create {tables_to_be_created} tables for lineage testing"
-        )
+        logger.info(f"About to create {tables_to_be_created} datasets mock data")

-        current_progress = 0
         for i in range(hops + 1):
             tables_at_level = tables_at_levels[i]

@@ -286,12 +288,6 @@ class DataHubMockDataSource(Source):
                 tables_at_levels=tables_at_levels,
             )

-            current_progress += 1
-            if current_progress % 1000 == 0:
-                logger.info(
-                    f"Progress: {current_progress}/{tables_to_be_created} tables processed"
-                )
-
     def _generate_lineage_for_table(
         self,
         table_name: str,
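As a rough illustration of the subtype selection the mock-data hunk now performs, the sketch below stubs the SubTypePattern and DatasetSubTypes enums as plain strings, since both live inside DataHub; only the branching logic mirrors the diff.

```python
# Sketch of the subtype-pattern mapping; enum values stubbed as strings.
from typing import Dict

TABLE, VIEW = "Table", "View"


def pick_subtype(pattern: str, table_level: int, table_index: int,
                 level_subtypes: Dict[int, str]) -> str:
    if pattern == "alternating":
        return TABLE if table_index % 2 == 0 else VIEW
    if pattern == "level_based":
        return level_subtypes.get(table_level, TABLE)
    if pattern == "all_view":
        return VIEW
    return TABLE  # "all_table" and any unknown pattern default to Table


assert pick_subtype("alternating", table_level=0, table_index=3, level_subtypes={}) == VIEW
assert pick_subtype("level_based", table_level=1, table_index=0, level_subtypes={1: VIEW}) == VIEW
```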
datahub/ingestion/source/salesforce.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.common.subtypes import
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -532,11 +535,11 @@ class SalesforceApi:
 @capability(
     capability_name=SourceCapability.DATA_PROFILING,
     description="Only table level profiling is supported via `profiling.enabled` config field",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
 )
 @capability(
     capability_name=SourceCapability.DELETION_DETECTION,
-    description="
-    supported=False,
+    description="Enabled by default via stateful ingestion",
 )
 @capability(
     capability_name=SourceCapability.SCHEMA_METADATA,
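A hedged sketch of how the @capability decorator's new subtype_modifier argument might be used on a source class: the keyword arguments mirror the salesforce.py hunk above, while the class itself and the abbreviated descriptions are invented.

```python
# Hypothetical source class; decorator keyword arguments follow the diff above.
from datahub.ingestion.api.decorators import capability
from datahub.ingestion.api.source import Source, SourceCapability
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier


@capability(
    capability_name=SourceCapability.DATA_PROFILING,
    description="Only table level profiling is supported",
    subtype_modifier=[SourceCapabilityModifier.TABLE],
)
@capability(
    capability_name=SourceCapability.DELETION_DETECTION,
    description="Enabled by default via stateful ingestion",
)
class MyExampleSource(Source):
    # Abstract methods omitted; this sketch only demonstrates the decorators.
    ...
```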
datahub/ingestion/source/slack/slack.py

@@ -23,6 +23,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -493,7 +494,7 @@ class SlackSource(StatefulIngestionSourceBase):
             mcp=MetadataChangeProposalWrapper(
                 entityUrn=urn_channel,
                 aspect=SubTypesClass(
-                    typeNames=[
+                    typeNames=[DatasetSubTypes.SLACK_CHANNEL],
                 ),
             ),
         )