acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -1,3 +1,4 @@
+import logging
 import re
 from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional, Tuple
@@ -9,6 +10,81 @@ from datahub.ingestion.source.kafka_connect.common import (
     KafkaConnectLineage,
 )

+logger = logging.getLogger(__name__)
+
+
+class RegexRouterTransform:
+    """Helper class to handle RegexRouter transformations for topic/table names."""
+
+    def __init__(self, config: Dict[str, str]) -> None:
+        self.transforms = self._parse_transforms(config)
+
+    def _parse_transforms(self, config: Dict[str, str]) -> List[Dict[str, str]]:
+        """Parse transforms configuration from connector config."""
+        transforms_list: List[Dict[str, str]] = []
+
+        # Get the transforms parameter
+        transforms_param: str = config.get("transforms", "")
+        if not transforms_param:
+            return transforms_list
+
+        # Parse individual transforms
+        transform_names: List[str] = [
+            name.strip() for name in transforms_param.split(",")
+        ]
+
+        for transform_name in transform_names:
+            if not transform_name:
+                continue
+            transform_config: Dict[str, str] = {}
+            transform_prefix: str = f"transforms.{transform_name}."
+
+            # Extract transform configuration
+            for key, value in config.items():
+                if key.startswith(transform_prefix):
+                    config_key: str = key[len(transform_prefix) :]
+                    transform_config[config_key] = value
+
+            # Only process RegexRouter transforms
+            if (
+                transform_config.get("type")
+                == "org.apache.kafka.connect.transforms.RegexRouter"
+            ):
+                transform_config["name"] = transform_name
+                transforms_list.append(transform_config)
+
+        return transforms_list
+
+    def apply_transforms(self, topic_name: str) -> str:
+        """Apply RegexRouter transforms to the topic name using Java regex."""
+        result: str = topic_name
+
+        for transform in self.transforms:
+            regex_pattern: Optional[str] = transform.get("regex")
+            replacement: str = transform.get("replacement", "")
+
+            if regex_pattern:
+                try:
+                    # Use Java Pattern and Matcher for exact Kafka Connect compatibility
+                    from java.util.regex import Pattern
+
+                    pattern = Pattern.compile(regex_pattern)
+                    matcher = pattern.matcher(result)
+
+                    if matcher.find():
+                        # Reset matcher to beginning for replaceFirst
+                        matcher.reset()
+                        result = matcher.replaceFirst(replacement)
+                        logger.debug(
+                            f"Applied transform {transform['name']}: {topic_name} -> {result}"
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Invalid regex pattern in transform {transform['name']}: {e}"
+                    )
+
+        return str(result)
+

 @dataclass
 class ConfluentS3SinkConnector(BaseConnector):
@@ -18,28 +94,35 @@ class ConfluentS3SinkConnector(BaseConnector):
         bucket: str
         topics_dir: str
         topics: Iterable[str]
+        regex_router: RegexRouterTransform

    def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
-        bucket = connector_manifest.config.get("s3.bucket.name")
+        bucket: Optional[str] = connector_manifest.config.get("s3.bucket.name")
         if not bucket:
             raise ValueError(
                 "Could not find 's3.bucket.name' in connector configuration"
             )

         # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
-        topics_dir = connector_manifest.config.get("topics.dir", "topics")
+        topics_dir: str = connector_manifest.config.get("topics.dir", "topics")
+
+        # Create RegexRouterTransform instance
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )

         return self.S3SinkParser(
             target_platform="s3",
             bucket=bucket,
             topics_dir=topics_dir,
             topics=connector_manifest.topic_names,
+            regex_router=regex_router,
         )

     def extract_flow_property_bag(self) -> Dict[str, str]:
         # Mask/Remove properties that may reveal credentials
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k
@@ -54,11 +137,17 @@ class ConfluentS3SinkConnector(BaseConnector):

     def extract_lineages(self) -> List[KafkaConnectLineage]:
         try:
-            parser = self._get_parser(
+            parser: ConfluentS3SinkConnector.S3SinkParser = self._get_parser(
+                self.connector_manifest
+            )

             lineages: List[KafkaConnectLineage] = list()
             for topic in parser.topics:
-
+                # Apply RegexRouter transformations using the RegexRouterTransform class
+                transformed_topic: str = parser.regex_router.apply_transforms(topic)
+                target_dataset: str = (
+                    f"{parser.bucket}/{parser.topics_dir}/{transformed_topic}"
+                )

                 lineages.append(
                     KafkaConnectLineage(
@@ -86,6 +175,7 @@ class SnowflakeSinkConnector(BaseConnector):
         database_name: str
         schema_name: str
         topics_to_tables: Dict[str, str]
+        regex_router: RegexRouterTransform

     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
@@ -93,7 +183,7 @@ class SnowflakeSinkConnector(BaseConnector):
         Refer below link for more info
         https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
         """
-        table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
+        table_name: str = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
         if re.match("^[^a-zA-Z_].*", table_name):
             table_name = "_" + table_name
         # Connector may append original topic's hash code as suffix for conflict resolution
@@ -106,8 +196,13 @@ class SnowflakeSinkConnector(BaseConnector):
         self,
         connector_manifest: ConnectorManifest,
     ) -> SnowflakeParser:
-        database_name = connector_manifest.config["snowflake.database.name"]
-        schema_name = connector_manifest.config["snowflake.schema.name"]
+        database_name: str = connector_manifest.config["snowflake.database.name"]
+        schema_name: str = connector_manifest.config["snowflake.schema.name"]
+
+        # Create RegexRouterTransform instance
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )

         # Fetch user provided topic to table map
         provided_topics_to_tables: Dict[str, str] = {}
@@ -121,24 +216,30 @@ class SnowflakeSinkConnector(BaseConnector):
         topics_to_tables: Dict[str, str] = {}
         # Extract lineage for only those topics whose data ingestion started
         for topic in connector_manifest.topic_names:
+            # Apply transforms first to get the transformed topic name
+            transformed_topic: str = regex_router.apply_transforms(topic)
+
             if topic in provided_topics_to_tables:
                 # If user provided which table to get mapped with this topic
                 topics_to_tables[topic] = provided_topics_to_tables[topic]
             else:
-                #
-                topics_to_tables[topic] = self.get_table_name_from_topic_name(
+                # Use the transformed topic name to generate table name
+                topics_to_tables[topic] = self.get_table_name_from_topic_name(
+                    transformed_topic
+                )

         return self.SnowflakeParser(
             database_name=database_name,
             schema_name=schema_name,
             topics_to_tables=topics_to_tables,
+            regex_router=regex_router,
         )

     def extract_flow_property_bag(self) -> Dict[str, str]:
         # For all snowflake sink connector properties, refer below link
         # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
         # remove private keys, secrets from properties
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k
@@ -153,10 +254,12 @@ class SnowflakeSinkConnector(BaseConnector):

     def extract_lineages(self) -> List[KafkaConnectLineage]:
         lineages: List[KafkaConnectLineage] = list()
-        parser = self.get_parser(
+        parser: SnowflakeSinkConnector.SnowflakeParser = self.get_parser(
+            self.connector_manifest
+        )

         for topic, table in parser.topics_to_tables.items():
-            target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
+            target_dataset: str = f"{parser.database_name}.{parser.schema_name}.{table}"
             lineages.append(
                 KafkaConnectLineage(
                     source_dataset=topic,
@@ -176,7 +279,8 @@ class BigQuerySinkConnector(BaseConnector):
         project: str
         target_platform: str
         sanitizeTopics: bool
-        transforms:
+        transforms: List[Dict[str, str]]
+        regex_router: RegexRouterTransform
         topicsToTables: Optional[str] = None
         datasets: Optional[str] = None
         defaultDataset: Optional[str] = None
@@ -186,16 +290,18 @@ class BigQuerySinkConnector(BaseConnector):
         self,
         connector_manifest: ConnectorManifest,
     ) -> BQParser:
-        project = connector_manifest.config["project"]
-        sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
-
+        project: str = connector_manifest.config["project"]
+        sanitizeTopics: str = connector_manifest.config.get("sanitizeTopics") or "false"
+
+        # Parse ALL transforms (original BigQuery logic)
+        transform_names: List[str] = (
             self.connector_manifest.config.get("transforms", "").split(",")
             if self.connector_manifest.config.get("transforms")
             else []
         )
-        transforms = []
+        transforms: List[Dict[str, str]] = []
         for name in transform_names:
-            transform = {"name": name}
+            transform: Dict[str, str] = {"name": name}
             transforms.append(transform)
             for key in self.connector_manifest.config:
                 if key.startswith(f"transforms.{name}."):
@@ -203,8 +309,13 @@ class BigQuerySinkConnector(BaseConnector):
                        self.connector_manifest.config[key]
                    )

+        # Create RegexRouterTransform instance for RegexRouter-specific handling
+        regex_router: RegexRouterTransform = RegexRouterTransform(
+            connector_manifest.config
+        )
+
         if "defaultDataset" in connector_manifest.config:
-            defaultDataset = connector_manifest.config["defaultDataset"]
+            defaultDataset: str = connector_manifest.config["defaultDataset"]
             return self.BQParser(
                 project=project,
                 defaultDataset=defaultDataset,
@@ -212,11 +323,14 @@ class BigQuerySinkConnector(BaseConnector):
                 sanitizeTopics=sanitizeTopics.lower() == "true",
                 version="v2",
                 transforms=transforms,
+                regex_router=regex_router,
             )
         else:
             # version 1.6.x and similar configs supported
-            datasets = connector_manifest.config["datasets"]
-            topicsToTables = connector_manifest.config.get(
+            datasets: str = connector_manifest.config["datasets"]
+            topicsToTables: Optional[str] = connector_manifest.config.get(
+                "topicsToTables"
+            )

             return self.BQParser(
                 project=project,
@@ -225,10 +339,11 @@ class BigQuerySinkConnector(BaseConnector):
                 target_platform="bigquery",
                 sanitizeTopics=sanitizeTopics.lower() == "true",
                 transforms=transforms,
+                regex_router=regex_router,
             )

     def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
-        entries = property.split(",")
+        entries: List[str] = property.split(",")
         for entry in entries:
             key, val = entry.rsplit("=")
             yield (key.strip(), val.strip())
@@ -243,7 +358,7 @@ class BigQuerySinkConnector(BaseConnector):
                 return dataset
         return None

-    def sanitize_table_name(self, table_name):
+    def sanitize_table_name(self, table_name: str) -> str:
         table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
         if re.match("^[^a-zA-Z_].*", table_name):
             table_name = "_" + table_name
@@ -254,8 +369,8 @@ class BigQuerySinkConnector(BaseConnector):
         self, topic: str, parser: BQParser
     ) -> Optional[str]:
         if parser.version == "v2":
-            dataset = parser.defaultDataset
-            parts = topic.split(":")
+            dataset: Optional[str] = parser.defaultDataset
+            parts: List[str] = topic.split(":")
             if len(parts) == 2:
                 dataset = parts[0]
                 table = parts[1]
@@ -283,21 +398,9 @@ class BigQuerySinkConnector(BaseConnector):
             table = self.sanitize_table_name(table)
         return f"{dataset}.{table}"

-    def apply_transformations(
-        self, topic: str, transforms: List[Dict[str, str]]
-    ) -> str:
-        for transform in transforms:
-            if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
-                regex = transform["regex"]
-                replacement = transform["replacement"]
-                pattern = re.compile(regex)
-                if pattern.match(topic):
-                    topic = pattern.sub(replacement, topic, count=1)
-        return topic
-
     def extract_flow_property_bag(self) -> Dict[str, str]:
         # Mask/Remove properties that may reveal credentials
-        flow_property_bag = {
+        flow_property_bag: Dict[str, str] = {
             k: v
             for k, v in self.connector_manifest.config.items()
             if k not in ["keyfile"]
@@ -307,27 +410,33 @@ class BigQuerySinkConnector(BaseConnector):

     def extract_lineages(self) -> List[KafkaConnectLineage]:
         lineages: List[KafkaConnectLineage] = list()
-        parser = self.get_parser(
+        parser: BigQuerySinkConnector.BQParser = self.get_parser(
+            self.connector_manifest
+        )
         if not parser:
             return lineages
-        target_platform = parser.target_platform
-        project = parser.project
-        transforms = parser.transforms
+        target_platform: str = parser.target_platform
+        project: str = parser.project

         for topic in self.connector_manifest.topic_names:
-
-
+            # Apply RegexRouter transformations using the RegexRouterTransform class
+            transformed_topic: str = parser.regex_router.apply_transforms(topic)
+
+            # Use the transformed topic to determine dataset/table
+            dataset_table: Optional[str] = self.get_dataset_table_for_topic(
+                transformed_topic, parser
+            )
             if dataset_table is None:
                 self.report.warning(
                     "Could not find target dataset for topic, please check your connector configuration"
                     f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
-            target_dataset = f"{project}.{dataset_table}"
+            target_dataset: str = f"{project}.{dataset_table}"

             lineages.append(
                 KafkaConnectLineage(
-                    source_dataset=
+                    source_dataset=topic,  # Keep original topic as source
                     source_platform=KAFKA,
                     target_dataset=target_dataset,
                     target_platform=target_platform,
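The RegexRouterTransform helper added above mirrors Kafka Connect's org.apache.kafka.connect.transforms.RegexRouter when the S3, Snowflake, and BigQuery sink connectors compute target dataset names. The diff delegates matching to java.util.regex for parity with the connector runtime; the sketch below is only an approximation of the same idea using Python's re module (a real RegexRouter config uses Java's "$1" group syntax, while this Python approximation uses "\1"), and the connector config values shown are hypothetical.

import re
from typing import Dict, List

def parse_regex_router_transforms(config: Dict[str, str]) -> List[Dict[str, str]]:
    # Collect only RegexRouter entries from the comma-separated "transforms" list.
    transforms: List[Dict[str, str]] = []
    for name in (n.strip() for n in config.get("transforms", "").split(",") if n.strip()):
        prefix = f"transforms.{name}."
        t = {k[len(prefix):]: v for k, v in config.items() if k.startswith(prefix)}
        if t.get("type") == "org.apache.kafka.connect.transforms.RegexRouter":
            t["name"] = name
            transforms.append(t)
    return transforms

def apply_transforms(topic: str, transforms: List[Dict[str, str]]) -> str:
    # Approximate RegexRouter: rewrite the first match of each configured regex.
    for t in transforms:
        if t.get("regex"):
            topic = re.sub(t["regex"], t.get("replacement", ""), topic, count=1)
    return topic

# Hypothetical connector config: route topic "db.orders.v1" to the name "orders".
config = {
    "transforms": "route",
    "transforms.route.type": "org.apache.kafka.connect.transforms.RegexRouter",
    "transforms.route.regex": r".*\.(\w+)\.v\d+",
    "transforms.route.replacement": r"\1",  # an actual Kafka Connect config would use "$1"
}
print(apply_transforms("db.orders.v1", parse_regex_router_transforms(config)))  # -> orders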
datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
     get_platform_from_sqlalchemy_uri,
 )

+logger = logging.getLogger(__name__)
+

 @dataclass
 class ConfluentJDBCSourceConnector(BaseConnector):
@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
             db_connection_url=connector_manifest.config.get("connection.uri"),
             source_platform="mongodb",
             database_name=connector_manifest.config.get("database"),
-            topic_prefix=connector_manifest.config.get("
+            topic_prefix=connector_manifest.config.get("topic.prefix"),
             transforms=(
                 connector_manifest.config["transforms"].split(",")
                 if "transforms" in connector_manifest.config
@@ -406,7 +408,11 @@ class MongoSourceConnector(BaseConnector):
         lineages: List[KafkaConnectLineage] = list()
         parser = self.get_parser(self.connector_manifest)
         source_platform = parser.source_platform
-
+        topic_prefix = parser.topic_prefix or ""
+
+        # Escape topic_prefix to handle cases where it contains dots
+        # Some users configure topic.prefix like "my.mongodb" which breaks the regex
+        topic_naming_pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"

         if not self.connector_manifest.topic_names:
             return lineages
@@ -429,6 +435,26 @@ class MongoSourceConnector(BaseConnector):

 @dataclass
 class DebeziumSourceConnector(BaseConnector):
+    # Debezium topic naming patterns by connector type
+    # - MySQL: {topic.prefix}.{database}.{table}
+    # - PostgreSQL: {topic.prefix}.{schema}.{table}
+    # - SQL Server: {topic.prefix}.{database}.{schema}.{table}
+    # - Oracle: {topic.prefix}.{schema}.{table}
+    # - DB2: {topic.prefix}.{schema}.{table}
+    # - MongoDB: {topic.prefix}.{database}.{collection}
+    # - Vitess: {topic.prefix}.{keyspace}.{table}
+
+    # Note SQL Server allows for "database.names" (multiple databases) config,
+    # and so database is in the topic naming pattern.
+    # However, others have "database.dbname" which is a single database name. For these connectors,
+    # additional databases would require a different connector instance
+
+    # Connectors with 2-level container in pattern (database + schema)
+    # Others have either database XOR schema, but not both
+    DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
+        "io.debezium.connector.sqlserver.SqlServerConnector",
+    }
+
     @dataclass
     class DebeziumParser:
         source_platform: str
@@ -514,16 +540,45 @@ class DebeziumSourceConnector(BaseConnector):
         source_platform = parser.source_platform
         server_name = parser.server_name
         database_name = parser.database_name
-
+        # Escape server_name to handle cases where topic.prefix contains dots
+        # Some users configure topic.prefix like "my.server" which breaks the regex
+        server_name = server_name or ""
+        # Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
+        topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

         if not self.connector_manifest.topic_names:
             return lineages

+        # Handle connectors with 2-level container (database + schema) in topic pattern
+        connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
+        maybe_duplicated_database_name = (
+            connector_class
+            in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
+        )
+
         for topic in self.connector_manifest.topic_names:
             found = re.search(re.compile(topic_naming_pattern), topic)
+            logger.debug(
+                f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
+            )

             if found:
-
+                # Extract the table part after server_name
+                table_part = found.group(2)
+
+                if (
+                    maybe_duplicated_database_name
+                    and database_name
+                    and table_part.startswith(f"{database_name}.")
+                ):
+                    table_part = table_part[len(database_name) + 1 :]
+
+                logger.debug(
+                    f"Extracted table part: '{table_part}' from topic '{topic}'"
+                )
+                # Apply database name to create final dataset name
+                table_name = get_dataset_name(database_name, table_part)
+                logger.debug(f"Final table name: '{table_name}'")

                 lineage = KafkaConnectLineage(
                     source_dataset=table_name,
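Both the MongoDB and Debezium changes above build the topic-name regex with re.escape so that a topic.prefix containing dots is matched literally rather than treated as regex metacharacters. A minimal illustration with a hypothetical PostgreSQL-style topic name:

import re

server_name = "my.server"          # topic.prefix that contains dots
topic = "my.server.public.orders"  # {topic.prefix}.{schema}.{table}

# Escaped prefix, as in the new code: dots in the prefix match only literal dots.
# Without re.escape, "." in the prefix would match any character and could over-match.
pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

found = re.search(pattern, topic)
if found:
    # group(2) is the schema.table part that get_dataset_name() turns into a dataset name.
    print(found.group(2))  # -> public.orders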
datahub/ingestion/source/looker/looker_source.py

@@ -126,6 +126,7 @@ logger = logging.getLogger(__name__)
     SourceCapability.USAGE_STATS,
     "Enabled by default, configured using `extract_usage_history`",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
datahub/ingestion/source/mlflow.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
-from datahub.ingestion.source.common.subtypes import
+from datahub.ingestion.source.common.subtypes import (
+    MLAssetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -138,6 +141,13 @@ class MLflowRegisteredModelStageInfo:
     SourceCapability.DESCRIPTIONS,
     "Extract descriptions for MLflow Registered Models and Model Versions",
 )
+@capability(
+    SourceCapability.CONTAINERS,
+    "Extract ML experiments",
+    subtype_modifier=[
+        SourceCapabilityModifier.MLFLOW_EXPERIMENT,
+    ],
+)
 @capability(SourceCapability.TAGS, "Extract tags for MLflow Registered Model Stages")
 class MLflowSource(StatefulIngestionSourceBase):
     platform = "mlflow"