acryl-datahub 1.0.0.4rc4__py3-none-any.whl → 1.0.0.4rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.4rc4.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/METADATA +2630 -2630
- {acryl_datahub-1.0.0.4rc4.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/RECORD +12 -11
- datahub/_version.py +1 -1
- datahub/ingestion/source/kafka/kafka.py +11 -67
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/sql/hive.py +15 -0
- datahub/ingestion/source/superset.py +163 -9
- datahub/sdk/lineage_client.py +134 -7
- {acryl_datahub-1.0.0.4rc4.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.4rc4.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.4rc4.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.4rc4.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/top_level.txt +0 -0

{acryl_datahub-1.0.0.4rc4.dist-info → acryl_datahub-1.0.0.4rc5.dist-info}/RECORD CHANGED

@@ -1,7 +1,7 @@
-acryl_datahub-1.0.0.
+acryl_datahub-1.0.0.4rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=
+datahub/_version.py,sha256=C9cJOHt7RVINEp5u9HEU9BQHiZS7hSsr2uIJMS3Pdjw,323
 datahub/entrypoints.py,sha256=AQN5MzCe6q3LKI4SS6WmwN56kgjF6AC1ld7yELWVP2w,8953
 datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -218,7 +218,7 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
 datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
 datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
 datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
-datahub/ingestion/source/superset.py,sha256=
+datahub/ingestion/source/superset.py,sha256=acxKU8XkaCNvhcv0CwU27_dYTdV5iR45BPcc83SR_T0,48380
 datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
 datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -343,7 +343,8 @@ datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
 datahub/ingestion/source/identity/azure_ad.py,sha256=9Hrvm4CSfc02yjnPUsCYSY4Qw9fXPnDFWLexab0mcpc,28559
 datahub/ingestion/source/identity/okta.py,sha256=jC21myJuMRTaPgj0OD9heaC-mz8ECjqpy2hSJwlUSwM,31943
 datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/kafka/kafka.py,sha256=
+datahub/ingestion/source/kafka/kafka.py,sha256=HMoe1P0QE9JlcX6MNEALTgz7LsmG-HUXVuWnk3jkRo8,22900
+datahub/ingestion/source/kafka/kafka_config.py,sha256=ijUB8PS5p-o3uLCHkAxAJAIM88s47rVaAUYXmi_lR4M,4406
 datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
 datahub/ingestion/source/kafka_connect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/kafka_connect/common.py,sha256=lH64n1v_rJamWGfidBeuQJj8W1_IvOBpXQLR2YZaEvQ,7057
@@ -475,7 +476,7 @@ datahub/ingestion/source/sql/clickhouse.py,sha256=9Fvaic9FZufRKdhVz2EcPUnEt5cA9V
 datahub/ingestion/source/sql/cockroachdb.py,sha256=XaD7eae34plU9ISRC6PzYX9q6RdT2qkzjH6CpTOgkx4,1443
 datahub/ingestion/source/sql/druid.py,sha256=_tzgTa5jhPUXk6WCmS7p10feCwJm6yUFcOgMZA-OcE8,2922
 datahub/ingestion/source/sql/hana.py,sha256=0PIvcX0Rz59NyR7Ag5Bv1MBV_UbJwxl9UAopo_xe_CA,1342
-datahub/ingestion/source/sql/hive.py,sha256=
+datahub/ingestion/source/sql/hive.py,sha256=E5ZuGHoJmLQDMpUQFXPUc69Zbjv9QxGqtocFu_S4hbw,31590
 datahub/ingestion/source/sql/hive_metastore.py,sha256=qpX9eCRm-zq3DKC49MaZP9vzGot9QIDfaaeFgXGbOuM,36283
 datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
 datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
@@ -910,7 +911,7 @@ datahub/sdk/container.py,sha256=yw_vw9Jl1wOYNwMHxQHLz5ZvVQVDWWHi9CWBR3hOCd8,7547
 datahub/sdk/dataset.py,sha256=7PlmqKDROJvsh1CtlhG8owOhLdelHVbSSg5hA5Kbwp0,29150
 datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
 datahub/sdk/entity_client.py,sha256=1AC9J7-jv3rD-MFEPz2PnFrT8nFkj_WO0M-4nyVOtQk,5319
-datahub/sdk/lineage_client.py,sha256=
+datahub/sdk/lineage_client.py,sha256=NJu9SbTNMWplFsiT5WWgqJ4ypD76y7Sm6I3btZ78rdE,13368
 datahub/sdk/main_client.py,sha256=agOPt93N2uYLuHdiDSJyk2xXZtZiYHvEbJC1VN5PCyo,4355
 datahub/sdk/mlmodel.py,sha256=amS-hHg5tT7zAqEHG17kSA60Q7td2DFtO-W2rEfb2rY,10206
 datahub/sdk/mlmodelgroup.py,sha256=_7IkqkLVeyqYVEUHTVePSDLQyESsnwht5ca1lcMODAg,7842
@@ -1053,8 +1054,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
+acryl_datahub-1.0.0.4rc5.dist-info/METADATA,sha256=B-WcxIPWm4TjOoKAuGZzHhvkLHMLkjg4SfUMPMhYVhc,179949
+acryl_datahub-1.0.0.4rc5.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+acryl_datahub-1.0.0.4rc5.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+acryl_datahub-1.0.0.4rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0.4rc5.dist-info/RECORD,,
datahub/_version.py CHANGED

datahub/ingestion/source/kafka/kafka.py CHANGED

@@ -7,7 +7,6 @@ from typing import Any, Dict, Iterable, List, Optional, Type, cast
 import avro.schema
 import confluent_kafka
 import confluent_kafka.admin
-import pydantic
 from confluent_kafka.admin import (
     AdminClient,
     ConfigEntry,
@@ -16,13 +15,8 @@ from confluent_kafka.admin import (
 )
 from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient

-from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
-from datahub.configuration.source_common import (
-    DatasetSourceConfigMixin,
-    LowerCaseDatasetUrnConfigMixin,
-)
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
@@ -50,16 +44,15 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.kafka.kafka_config import KafkaSourceConfig
 from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
     KafkaSchemaRegistryBase,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
-    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
-    StatefulIngestionConfigBase,
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import Status
@@ -90,64 +83,6 @@ class KafkaTopicConfigKeys(StrEnum):
     UNCLEAN_LEADER_ELECTION_CONFIG = "unclean.leader.election.enable"


-class KafkaSourceConfig(
-    StatefulIngestionConfigBase,
-    DatasetSourceConfigMixin,
-    LowerCaseDatasetUrnConfigMixin,
-):
-    connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
-
-    topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
-    domain: Dict[str, AllowDenyPattern] = pydantic.Field(
-        default={},
-        description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
-    )
-    topic_subject_map: Dict[str, str] = pydantic.Field(
-        default={},
-        description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
-    )
-    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
-    schema_registry_class: str = pydantic.Field(
-        default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
-        description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
-    )
-    schema_tags_field: str = pydantic.Field(
-        default="tags",
-        description="The field name in the schema metadata that contains the tags to be added to the dataset.",
-    )
-    enable_meta_mapping: bool = pydantic.Field(
-        default=True,
-        description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
-    )
-    meta_mapping: Dict = pydantic.Field(
-        default={},
-        description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
-    )
-    field_meta_mapping: Dict = pydantic.Field(
-        default={},
-        description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
-    )
-    strip_user_ids_from_email: bool = pydantic.Field(
-        default=False,
-        description="Whether or not to strip email id while adding owners using meta mappings.",
-    )
-    tag_prefix: str = pydantic.Field(
-        default="", description="Prefix added to tags during ingestion."
-    )
-    ignore_warnings_on_schema_type: bool = pydantic.Field(
-        default=False,
-        description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
-    )
-    disable_topic_record_naming_strategy: bool = pydantic.Field(
-        default=False,
-        description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
-    )
-    ingest_schemas_as_entities: bool = pydantic.Field(
-        default=False,
-        description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
-    )
-
-
 def get_kafka_consumer(
     connection: KafkaConsumerConnectionConfig,
 ) -> confluent_kafka.Consumer:
@@ -430,6 +365,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

         # 4. Set dataset's description, tags, ownership, etc, if topic schema type is avro
         description: Optional[str] = None
+        external_url: Optional[str] = None
         if (
             schema_metadata is not None
             and isinstance(schema_metadata.platformSchema, KafkaSchemaClass)
@@ -481,8 +417,16 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
                 mce_builder.make_global_tag_aspect_with_tag_list(all_tags)
             )

+        if self.source_config.external_url_base:
+            # Remove trailing slash from base URL if present
+            base_url = self.source_config.external_url_base.rstrip("/")
+            external_url = f"{base_url}/{dataset_name}"
+
         dataset_properties = DatasetPropertiesClass(
-            name=dataset_name,
+            name=dataset_name,
+            customProperties=custom_props,
+            description=description,
+            externalUrl=external_url,
         )
         dataset_snapshot.aspects.append(dataset_properties)

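The kafka.py change above relocates `KafkaSourceConfig` to `kafka_config.py` and threads the new `external_url_base` option into the `externalUrl` of each topic's `DatasetPropertiesClass`. A minimal, self-contained sketch of the URL behavior (the Aiven-style base URL is a made-up value, and `build_external_url` is a hypothetical helper, not a function in the source):

# Sketch of the externalUrl behavior added above; values are illustrative.
from typing import Optional


def build_external_url(external_url_base: Optional[str], dataset_name: str) -> Optional[str]:
    if not external_url_base:
        return None
    # Trailing slash is stripped from the base before the topic name is appended.
    return f"{external_url_base.rstrip('/')}/{dataset_name}"


assert build_external_url("https://console.aiven.io/topics/", "orders") == (
    "https://console.aiven.io/topics/orders"
)
assert build_external_url(None, "orders") is None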

datahub/ingestion/source/kafka/kafka_config.py ADDED

@@ -0,0 +1,78 @@
+from typing import Dict, Optional
+
+from pydantic import Field
+
+from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+    LowerCaseDatasetUrnConfigMixin,
+)
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)
+
+
+class KafkaSourceConfig(
+    StatefulIngestionConfigBase,
+    DatasetSourceConfigMixin,
+    LowerCaseDatasetUrnConfigMixin,
+):
+    connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
+
+    topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
+    domain: Dict[str, AllowDenyPattern] = Field(
+        default={},
+        description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
+    )
+    topic_subject_map: Dict[str, str] = Field(
+        default={},
+        description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
+    )
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+    schema_registry_class: str = Field(
+        default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
+        description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
+    )
+    schema_tags_field: str = Field(
+        default="tags",
+        description="The field name in the schema metadata that contains the tags to be added to the dataset.",
+    )
+    enable_meta_mapping: bool = Field(
+        default=True,
+        description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
+    )
+    meta_mapping: Dict = Field(
+        default={},
+        description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
+    )
+    field_meta_mapping: Dict = Field(
+        default={},
+        description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
+    )
+    strip_user_ids_from_email: bool = Field(
+        default=False,
+        description="Whether or not to strip email id while adding owners using meta mappings.",
+    )
+    tag_prefix: str = Field(
+        default="", description="Prefix added to tags during ingestion."
+    )
+    ignore_warnings_on_schema_type: bool = Field(
+        default=False,
+        description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
+    )
+    disable_topic_record_naming_strategy: bool = Field(
+        default=False,
+        description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
+    )
+    ingest_schemas_as_entities: bool = Field(
+        default=False,
+        description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+    )
+    external_url_base: Optional[str] = Field(
+        default=None,
+        description="Base URL for external platform (e.g. Aiven) where topics can be viewed. The topic name will be appended to this base URL.",
+    )
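Since the relocated `KafkaSourceConfig` is a pydantic model, a recipe's `source.config` mapping parses straight into it. A hypothetical sketch, assuming the pydantic v1-style `parse_obj` constructor and the `bootstrap` field of `KafkaConsumerConnectionConfig`; the broker address and base URL are invented:

# Hypothetical recipe config; broker address and base URL are invented.
from datahub.ingestion.source.kafka.kafka_config import KafkaSourceConfig

config = KafkaSourceConfig.parse_obj(
    {
        "connection": {"bootstrap": "localhost:9092"},
        "topic_patterns": {"allow": ["^orders.*"], "deny": ["^_.*"]},
        "external_url_base": "https://console.aiven.io/topics",
    }
)
assert config.external_url_base == "https://console.aiven.io/topics"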
@@ -867,3 +867,18 @@ class HiveSource(TwoTierSQLAlchemySource):
|
|
|
867
867
|
return partition_column.get("column_names")
|
|
868
868
|
|
|
869
869
|
return []
|
|
870
|
+
|
|
871
|
+
def get_table_properties(
|
|
872
|
+
self, inspector: Inspector, schema: str, table: str
|
|
873
|
+
) -> Tuple[Optional[str], Dict[str, str], Optional[str]]:
|
|
874
|
+
(description, properties, location) = super().get_table_properties(
|
|
875
|
+
inspector, schema, table
|
|
876
|
+
)
|
|
877
|
+
|
|
878
|
+
new_properties = {}
|
|
879
|
+
for key, value in properties.items():
|
|
880
|
+
if key and key[-1] == ":":
|
|
881
|
+
new_properties[key[:-1]] = value
|
|
882
|
+
else:
|
|
883
|
+
new_properties[key] = value
|
|
884
|
+
return (description, new_properties, location)
|
|
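The `get_table_properties` override above changes only the property keys: Hive can report keys with a trailing colon (for example `numFiles:`), and the loop strips it before the properties are emitted. A standalone sketch of that normalization, with illustrative sample keys:

# Sample properties are illustrative of Hive's trailing-colon keys.
properties = {"numFiles:": "2", "totalSize:": "1024", "comment": "fact table"}

new_properties = {}
for key, value in properties.items():
    # Strip a single trailing ":" so e.g. "numFiles:" becomes "numFiles".
    if key and key[-1] == ":":
        new_properties[key[:-1]] = value
    else:
        new_properties[key] = value

assert new_properties == {"numFiles": "2", "totalSize": "1024", "comment": "fact table"}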

datahub/ingestion/source/superset.py CHANGED

@@ -166,6 +166,7 @@ class SupersetDataset(BaseModel):
 class SupersetConfig(
     StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
 ):
+    # TODO: Add support for missing dataPlatformInstance/containers
     # See the Superset /security/login endpoint for details
     # https://superset.apache.org/docs/rest-api
     connect_uri: str = Field(
@@ -177,7 +178,7 @@ class SupersetConfig(
     )
     domain: Dict[str, AllowDenyPattern] = Field(
         default=dict(),
-        description="
+        description="Regex patterns for tables to filter to assign domain_key. ",
     )
     dataset_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
@@ -191,6 +192,10 @@ class SupersetConfig(
         AllowDenyPattern.allow_all(),
         description="Patterns for selecting dashboard names that are to be included",
     )
+    database_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for databases to filter in ingestion.",
+    )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
     # Configuration for stateful ingestion
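With the new `database_pattern`, a recipe can scope Superset ingestion by database alongside the existing dataset, chart, and dashboard patterns. A hypothetical config sketch, assuming the pydantic v1-style `parse_obj` constructor; connection details and patterns are invented:

# Hypothetical recipe config; connection details and patterns are invented.
from datahub.ingestion.source.superset import SupersetConfig

config = SupersetConfig.parse_obj(
    {
        "connect_uri": "http://localhost:8088",
        "username": "admin",
        "password": "admin",
        "database_pattern": {"allow": ["analytics_.*"], "deny": ["analytics_staging"]},
    }
)
assert config.database_pattern.allowed("analytics_prod")
assert not config.database_pattern.allowed("analytics_staging")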
@@ -297,6 +302,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         self.session = self.login()
         self.owner_info = self.parse_owner_info()
+        self.filtered_dataset_to_database: Dict[int, str] = {}
+        self.filtered_chart_to_database: Dict[int, str] = {}
+        self.processed_charts: Dict[int, Tuple[Optional[str], bool]] = {}

     def login(self) -> requests.Session:
         login_response = requests.post(
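The three dicts added to the constructor act as per-run caches: `filtered_dataset_to_database` and `filtered_chart_to_database` record which filtered database a dataset or chart resolved to, and `processed_charts` memoizes a `(database_name, is_filtered)` verdict per chart, so the dashboard pass below can reuse chart results instead of re-querying the dataset API.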
@@ -519,12 +527,57 @@ class SupersetSource(StatefulIngestionSourceBase):
                     f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                 )
                 return
+
+            if self.config.database_pattern != AllowDenyPattern.allow_all():
+                raw_position_data = dashboard_data.get("position_json", "{}")
+                position_data = (
+                    json.loads(raw_position_data)
+                    if raw_position_data is not None
+                    else {}
+                )
+
+                chart_ids = []
+                for key, value in position_data.items():
+                    if not key.startswith("CHART-"):
+                        continue
+                    chart_id = value.get("meta", {}).get("chartId")
+                    if chart_id:
+                        chart_ids.append(chart_id)
+
+                for chart_id in chart_ids:
+                    if chart_id in self.processed_charts:
+                        database_name, is_filtered = self.processed_charts[chart_id]
+                        if is_filtered:
+                            self.report.warning(
+                                message="Dashboard contains charts using datasets from a filtered database. Set the dashboard pattern to deny ingestion.",
+                                context=str(
+                                    dict(
+                                        dashboard_id=dashboard_id,
+                                        dashboard_title=dashboard_title,
+                                        chart_id=chart_id,
+                                        database_name=database_name,
+                                    )
+                                ),
+                                title="Incomplete Ingestion",
+                            )
+
             dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+
         except Exception as e:
             self.report.warning(
-
+                message="Failed to construct dashboard snapshot. This dashboard will not be ingested.",
+                context=str(
+                    dict(
+                        dashboard_id=dashboard_id,
+                        dashboard_title=dashboard_title,
+                        error=str(e),
+                    )
+                ),
+                title="Dashboard Construction Failed",
+                exc=e,
             )
             return
+
         mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
         yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
         yield from self._get_domain_wu(
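The dashboard pass above discovers its charts by scanning Superset's `position_json` layout blob for `CHART-*` nodes and reading each `meta.chartId`. A standalone sketch with a made-up payload:

import json

# Made-up position_json payload in the shape the hunk expects.
position_json = json.dumps(
    {
        "ROOT_ID": {"type": "ROOT"},
        "CHART-abc123": {"meta": {"chartId": 42}},
        "CHART-def456": {"meta": {"chartId": 7}},
    }
)

position_data = json.loads(position_json) if position_json is not None else {}
chart_ids = [
    value.get("meta", {}).get("chartId")
    for key, value in position_data.items()
    if key.startswith("CHART-") and value.get("meta", {}).get("chartId")
]
assert sorted(chart_ids) == [7, 42]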
@@ -780,14 +833,80 @@ class SupersetSource(StatefulIngestionSourceBase):

     def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
         chart_name = ""
+        database_name = None
         try:
-            chart_id =
+            chart_id = chart_data.get("id")
             chart_name = chart_data.get("slice_name", "")
             if not self.config.chart_pattern.allowed(chart_name):
                 self.report.report_dropped(
                     f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
                 )
                 return
+
+            # TODO: Make helper methods for database_pattern
+            if self.config.database_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+
+                if datasource_id:
+                    if datasource_id in self.filtered_dataset_to_database:
+                        database_name = self.filtered_dataset_to_database[datasource_id]
+                        self.filtered_chart_to_database[chart_id] = database_name
+
+                        is_filtered = not self.config.database_pattern.allowed(
+                            database_name
+                        )
+                        self.processed_charts[chart_id] = (database_name, is_filtered)
+
+                        if is_filtered:
+                            self.report.warning(
+                                message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+                                context=str(
+                                    dict(
+                                        chart_id=chart_id,
+                                        chart_name=chart_name,
+                                        database_name=database_name,
+                                    )
+                                ),
+                                title="Incomplete Ingestion",
+                            )
+
+                    else:
+                        dataset_response = self.get_dataset_info(datasource_id)
+                        database_name = (
+                            dataset_response.get("result", {})
+                            .get("database", {})
+                            .get("database_name")
+                        )
+
+                        if database_name:
+                            is_filtered = not self.config.database_pattern.allowed(
+                                database_name
+                            )
+                            if is_filtered:
+                                self.filtered_chart_to_database[chart_id] = (
+                                    database_name
+                                )
+                                self.filtered_dataset_to_database[datasource_id] = (
+                                    database_name
+                                )
+                            self.processed_charts[chart_id] = (
+                                database_name,
+                                is_filtered,
+                            )
+
+                            if is_filtered:
+                                self.report.warning(
+                                    message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+                                    context=str(
+                                        dict(
+                                            chart_id=chart_id,
+                                            chart_name=chart_name,
+                                            database_name=database_name,
+                                        )
+                                    ),
+                                    title="Incomplete Ingestion",
+                                )
+
             if self.config.dataset_pattern != AllowDenyPattern.allow_all():
                 datasource_id = chart_data.get("datasource_id")
                 if datasource_id:
@@ -799,12 +918,28 @@ class SupersetSource(StatefulIngestionSourceBase):
                         dataset_name
                     ):
                         self.report.warning(
-
+                            message="Chart uses a dataset that was filtered by dataset pattern. Update your dataset pattern to include this dataset.",
+                            context=str(
+                                dict(
+                                    chart_id=chart_id,
+                                    chart_name=chart_name,
+                                    dataset_name=dataset_name,
+                                )
+                            ),
+                            title="Incomplete Ingestion",
                         )
+            if chart_id not in self.processed_charts:
+                self.processed_charts[chart_id] = (database_name, False)
+
             yield from self.construct_chart_from_chart_data(chart_data)
         except Exception as e:
             self.report.warning(
-
+                message="Failed to construct chart snapshot. This chart will not be ingested.",
+                context=str(
+                    dict(chart_id=chart_id, chart_name=chart_name, error=str(e))
+                ),
+                title="Chart Construction Failed",
+                exc=e,
             )
             return

@@ -1048,12 +1183,30 @@ class SupersetSource(StatefulIngestionSourceBase):
     def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
         dataset_name = ""
         try:
+            dataset_id = dataset_data.get("id")
             dataset_name = dataset_data.get("table_name", "")
             if not self.config.dataset_pattern.allowed(dataset_name):
                 self.report.report_dropped(
                     f"Dataset '{dataset_name}' filtered by dataset_pattern"
                 )
                 return
+            if self.config.database_pattern != AllowDenyPattern.allow_all():
+                dataset_response = self.get_dataset_info(dataset_id)
+                database_name = (
+                    dataset_response.get("result", {})
+                    .get("database", {})
+                    .get("database_name")
+                )
+
+                if database_name and not self.config.database_pattern.allowed(
+                    database_name
+                ):
+                    self.filtered_dataset_to_database[dataset_id] = database_name
+                    self.report.report_dropped(
+                        f"Dataset '{dataset_name}' filtered by database_pattern with database '{database_name}'"
+                    )
+                    return
+
             dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
             mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
         except Exception as e:
@@ -1079,12 +1232,13 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-
-        yield from self.emit_dashboard_mces()
-        if self.config.ingest_charts:
-            yield from self.emit_chart_mces()
+        # TODO: Possibly change ingestion order to minimize API calls
         if self.config.ingest_datasets:
             yield from self.emit_dataset_mces()
+        if self.config.ingest_charts:
+            yield from self.emit_chart_mces()
+        if self.config.ingest_dashboards:
+            yield from self.emit_dashboard_mces()

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [