acryl-datahub 1.0.0.4rc3__py3-none-any.whl → 1.0.0.4rc5__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,7 +1,7 @@
- acryl_datahub-1.0.0.4rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.4rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=s9GagN9LnstD0j2tgPma0WgwFBeEF2rYkuJB2EEtcJY,323
+ datahub/_version.py,sha256=C9cJOHt7RVINEp5u9HEU9BQHiZS7hSsr2uIJMS3Pdjw,323
  datahub/entrypoints.py,sha256=AQN5MzCe6q3LKI4SS6WmwN56kgjF6AC1ld7yELWVP2w,8953
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -207,7 +207,7 @@ datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suab
  datahub/ingestion/source/ldap.py,sha256=PKoA5pVjuIxFfW1TcbYNIWSm7-C7shK2FDn7Zo5mrVM,18705
  datahub/ingestion/source/metabase.py,sha256=j8DRV2GvisezidL1JZ5HJLF_hdFdtvaoyDoEdEyh0Ks,32603
  datahub/ingestion/source/mlflow.py,sha256=fh7izN9jlSwbpGIrEyJktlmwFZR5vNG9z9L5VQ31k_4,33141
- datahub/ingestion/source/mode.py,sha256=uOm6Hk-3ybAYkL7qRKWbrp9P78KjrpOXUGCOgyoPg8g,66309
+ datahub/ingestion/source/mode.py,sha256=Hs_qi-ntsuW_DIJ2wvMsFjir6P_QkRo4mq5Ldk7c1iU,66709
  datahub/ingestion/source/mongodb.py,sha256=2C2Cxn8DXL53IbNiywIuKt8UT_EMcPg9f8su-OPSNGU,21237
  datahub/ingestion/source/nifi.py,sha256=D1gBXxdpLuUQ0eurwofIR_SGg1rHGhwk3qxsWI1PT9c,56882
  datahub/ingestion/source/openapi.py,sha256=zx976zstg6M2KoTz_iKKgU9VETDeX2rnw6BofiHXbDc,18669
@@ -218,7 +218,7 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
  datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
  datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
- datahub/ingestion/source/superset.py,sha256=bMfvm9HgUoS3T7BjHsDrrOodc8iBRrJRQYv2D66bABo,41194
+ datahub/ingestion/source/superset.py,sha256=acxKU8XkaCNvhcv0CwU27_dYTdV5iR45BPcc83SR_T0,48380
  datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
  datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -343,7 +343,8 @@ datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
  datahub/ingestion/source/identity/azure_ad.py,sha256=9Hrvm4CSfc02yjnPUsCYSY4Qw9fXPnDFWLexab0mcpc,28559
  datahub/ingestion/source/identity/okta.py,sha256=jC21myJuMRTaPgj0OD9heaC-mz8ECjqpy2hSJwlUSwM,31943
  datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/kafka/kafka.py,sha256=Bo9l8KbCle8ZUOyJMOo2HsJcT63EM_R7bCrPS-FhWT4,26512
+ datahub/ingestion/source/kafka/kafka.py,sha256=HMoe1P0QE9JlcX6MNEALTgz7LsmG-HUXVuWnk3jkRo8,22900
+ datahub/ingestion/source/kafka/kafka_config.py,sha256=ijUB8PS5p-o3uLCHkAxAJAIM88s47rVaAUYXmi_lR4M,4406
  datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/kafka_connect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/kafka_connect/common.py,sha256=lH64n1v_rJamWGfidBeuQJj8W1_IvOBpXQLR2YZaEvQ,7057
@@ -475,7 +476,7 @@ datahub/ingestion/source/sql/clickhouse.py,sha256=9Fvaic9FZufRKdhVz2EcPUnEt5cA9V
  datahub/ingestion/source/sql/cockroachdb.py,sha256=XaD7eae34plU9ISRC6PzYX9q6RdT2qkzjH6CpTOgkx4,1443
  datahub/ingestion/source/sql/druid.py,sha256=_tzgTa5jhPUXk6WCmS7p10feCwJm6yUFcOgMZA-OcE8,2922
  datahub/ingestion/source/sql/hana.py,sha256=0PIvcX0Rz59NyR7Ag5Bv1MBV_UbJwxl9UAopo_xe_CA,1342
- datahub/ingestion/source/sql/hive.py,sha256=voQl6QjHUXPdx7LPONuHiFavW9nRKMjHZx7o3vJQG7A,31034
+ datahub/ingestion/source/sql/hive.py,sha256=E5ZuGHoJmLQDMpUQFXPUc69Zbjv9QxGqtocFu_S4hbw,31590
  datahub/ingestion/source/sql/hive_metastore.py,sha256=qpX9eCRm-zq3DKC49MaZP9vzGot9QIDfaaeFgXGbOuM,36283
  datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
  datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
@@ -910,7 +911,7 @@ datahub/sdk/container.py,sha256=yw_vw9Jl1wOYNwMHxQHLz5ZvVQVDWWHi9CWBR3hOCd8,7547
  datahub/sdk/dataset.py,sha256=7PlmqKDROJvsh1CtlhG8owOhLdelHVbSSg5hA5Kbwp0,29150
  datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
  datahub/sdk/entity_client.py,sha256=1AC9J7-jv3rD-MFEPz2PnFrT8nFkj_WO0M-4nyVOtQk,5319
- datahub/sdk/lineage_client.py,sha256=1JPxQx5pZYQwnzL_VX3KjAbX2QWdvjSKm8S8Bya2IAc,8617
+ datahub/sdk/lineage_client.py,sha256=NJu9SbTNMWplFsiT5WWgqJ4ypD76y7Sm6I3btZ78rdE,13368
  datahub/sdk/main_client.py,sha256=agOPt93N2uYLuHdiDSJyk2xXZtZiYHvEbJC1VN5PCyo,4355
  datahub/sdk/mlmodel.py,sha256=amS-hHg5tT7zAqEHG17kSA60Q7td2DFtO-W2rEfb2rY,10206
  datahub/sdk/mlmodelgroup.py,sha256=_7IkqkLVeyqYVEUHTVePSDLQyESsnwht5ca1lcMODAg,7842
@@ -1053,8 +1054,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0.4rc3.dist-info/METADATA,sha256=QpiKmTUaWMGnd69dnDEHug3N-2SHZdTa1LzKxcIDnf4,179949
- acryl_datahub-1.0.0.4rc3.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
- acryl_datahub-1.0.0.4rc3.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
- acryl_datahub-1.0.0.4rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0.4rc3.dist-info/RECORD,,
+ acryl_datahub-1.0.0.4rc5.dist-info/METADATA,sha256=B-WcxIPWm4TjOoKAuGZzHhvkLHMLkjg4SfUMPMhYVhc,179949
+ acryl_datahub-1.0.0.4rc5.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+ acryl_datahub-1.0.0.4rc5.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+ acryl_datahub-1.0.0.4rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.4rc5.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0.4rc3"
+ __version__ = "1.0.0.4rc5"


  def is_dev_mode() -> bool:
datahub/ingestion/source/kafka/kafka.py CHANGED
@@ -7,7 +7,6 @@ from typing import Any, Dict, Iterable, List, Optional, Type, cast
  import avro.schema
  import confluent_kafka
  import confluent_kafka.admin
- import pydantic
  from confluent_kafka.admin import (
      AdminClient,
      ConfigEntry,
@@ -16,13 +15,8 @@ from confluent_kafka.admin import (
  )
  from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient

- from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.kafka import KafkaConsumerConnectionConfig
  from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
- from datahub.configuration.source_common import (
-     DatasetSourceConfigMixin,
-     LowerCaseDatasetUrnConfigMixin,
- )
  from datahub.emitter import mce_builder
  from datahub.emitter.mce_builder import (
      make_data_platform_urn,
@@ -50,16 +44,15 @@ from datahub.ingestion.api.source import (
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+ from datahub.ingestion.source.kafka.kafka_config import KafkaSourceConfig
  from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
      KafkaSchemaRegistryBase,
  )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalHandler,
      StaleEntityRemovalSourceReport,
-     StatefulStaleMetadataRemovalConfig,
  )
  from datahub.ingestion.source.state.stateful_ingestion_base import (
-     StatefulIngestionConfigBase,
      StatefulIngestionSourceBase,
  )
  from datahub.metadata.com.linkedin.pegasus2avro.common import Status
@@ -90,64 +83,6 @@ class KafkaTopicConfigKeys(StrEnum):
      UNCLEAN_LEADER_ELECTION_CONFIG = "unclean.leader.election.enable"


- class KafkaSourceConfig(
-     StatefulIngestionConfigBase,
-     DatasetSourceConfigMixin,
-     LowerCaseDatasetUrnConfigMixin,
- ):
-     connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
-
-     topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
-     domain: Dict[str, AllowDenyPattern] = pydantic.Field(
-         default={},
-         description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
-     )
-     topic_subject_map: Dict[str, str] = pydantic.Field(
-         default={},
-         description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
-     )
-     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
-     schema_registry_class: str = pydantic.Field(
-         default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
-         description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
-     )
-     schema_tags_field: str = pydantic.Field(
-         default="tags",
-         description="The field name in the schema metadata that contains the tags to be added to the dataset.",
-     )
-     enable_meta_mapping: bool = pydantic.Field(
-         default=True,
-         description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
-     )
-     meta_mapping: Dict = pydantic.Field(
-         default={},
-         description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
-     )
-     field_meta_mapping: Dict = pydantic.Field(
-         default={},
-         description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
-     )
-     strip_user_ids_from_email: bool = pydantic.Field(
-         default=False,
-         description="Whether or not to strip email id while adding owners using meta mappings.",
-     )
-     tag_prefix: str = pydantic.Field(
-         default="", description="Prefix added to tags during ingestion."
-     )
-     ignore_warnings_on_schema_type: bool = pydantic.Field(
-         default=False,
-         description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
-     )
-     disable_topic_record_naming_strategy: bool = pydantic.Field(
-         default=False,
-         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
-     )
-     ingest_schemas_as_entities: bool = pydantic.Field(
-         default=False,
-         description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
-     )
-
-
  def get_kafka_consumer(
      connection: KafkaConsumerConnectionConfig,
  ) -> confluent_kafka.Consumer:
@@ -430,6 +365,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

          # 4. Set dataset's description, tags, ownership, etc, if topic schema type is avro
          description: Optional[str] = None
+         external_url: Optional[str] = None
          if (
              schema_metadata is not None
              and isinstance(schema_metadata.platformSchema, KafkaSchemaClass)
@@ -481,8 +417,16 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
                  mce_builder.make_global_tag_aspect_with_tag_list(all_tags)
              )

+         if self.source_config.external_url_base:
+             # Remove trailing slash from base URL if present
+             base_url = self.source_config.external_url_base.rstrip("/")
+             external_url = f"{base_url}/{dataset_name}"
+
          dataset_properties = DatasetPropertiesClass(
-             name=dataset_name, customProperties=custom_props, description=description
+             name=dataset_name,
+             customProperties=custom_props,
+             description=description,
+             externalUrl=external_url,
          )
          dataset_snapshot.aspects.append(dataset_properties)

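Note: a minimal, self-contained sketch of the externalUrl behaviour added above. The base URL and topic name below are made-up example values; in the source they come from the new external_url_base config option and the ingested topic name.

from typing import Optional


def build_external_url(external_url_base: Optional[str], dataset_name: str) -> Optional[str]:
    # Mirrors the new kafka.py logic: strip a trailing slash, then append the topic name.
    if not external_url_base:
        return None
    return f"{external_url_base.rstrip('/')}/{dataset_name}"


assert build_external_url("https://example-kafka-console/topics/", "my_topic") == (
    "https://example-kafka-console/topics/my_topic"
)
assert build_external_url(None, "my_topic") is None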
datahub/ingestion/source/kafka/kafka_config.py ADDED
@@ -0,0 +1,78 @@
+ from typing import Dict, Optional
+
+ from pydantic import Field
+
+ from datahub.configuration.common import AllowDenyPattern
+ from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+ from datahub.configuration.source_common import (
+     DatasetSourceConfigMixin,
+     LowerCaseDatasetUrnConfigMixin,
+ )
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
+     StatefulStaleMetadataRemovalConfig,
+ )
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
+     StatefulIngestionConfigBase,
+ )
+
+
+ class KafkaSourceConfig(
+     StatefulIngestionConfigBase,
+     DatasetSourceConfigMixin,
+     LowerCaseDatasetUrnConfigMixin,
+ ):
+     connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
+
+     topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
+     domain: Dict[str, AllowDenyPattern] = Field(
+         default={},
+         description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
+     )
+     topic_subject_map: Dict[str, str] = Field(
+         default={},
+         description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
+     )
+     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+     schema_registry_class: str = Field(
+         default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
+         description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
+     )
+     schema_tags_field: str = Field(
+         default="tags",
+         description="The field name in the schema metadata that contains the tags to be added to the dataset.",
+     )
+     enable_meta_mapping: bool = Field(
+         default=True,
+         description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
+     )
+     meta_mapping: Dict = Field(
+         default={},
+         description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
+     )
+     field_meta_mapping: Dict = Field(
+         default={},
+         description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
+     )
+     strip_user_ids_from_email: bool = Field(
+         default=False,
+         description="Whether or not to strip email id while adding owners using meta mappings.",
+     )
+     tag_prefix: str = Field(
+         default="", description="Prefix added to tags during ingestion."
+     )
+     ignore_warnings_on_schema_type: bool = Field(
+         default=False,
+         description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
+     )
+     disable_topic_record_naming_strategy: bool = Field(
+         default=False,
+         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
+     )
+     ingest_schemas_as_entities: bool = Field(
+         default=False,
+         description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+     )
+     external_url_base: Optional[str] = Field(
+         default=None,
+         description="Base URL for external platform (e.g. Aiven) where topics can be viewed. The topic name will be appended to this base URL.",
+     )
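Note: a rough usage sketch for the relocated config class, assuming the declared defaults validate without a live Kafka connection (they are plain pydantic fields) and using a hypothetical console URL:

from datahub.ingestion.source.kafka.kafka_config import KafkaSourceConfig

# Hypothetical value; external_url_base is optional and defaults to None.
config = KafkaSourceConfig(
    external_url_base="https://example-kafka-console/project/demo/topics",
)

# Defaults declared above: allow all topics except those starting with "_".
assert config.topic_patterns.allowed("orders")
assert not config.topic_patterns.allowed("_internal")
assert config.external_url_base is not None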
datahub/ingestion/source/mode.py CHANGED
@@ -1457,11 +1457,18 @@ class ModeSource(StatefulIngestionSourceBase):
              )
              queries = queries_json.get("_embedded", {}).get("queries", {})
          except ModeRequestError as e:
-             self.report.report_failure(
-                 title="Failed to Retrieve Queries",
-                 message="Unable to retrieve queries for report token.",
-                 context=f"Report Token: {report_token}, Error: {str(e)}",
-             )
+             if isinstance(e, HTTPError) and e.response.status_code == 404:
+                 self.report.report_warning(
+                     title="No Queries Found",
+                     message="No queries found for the report token. Maybe the report is deleted...",
+                     context=f"Report Token: {report_token}, Error: {str(e)}",
+                 )
+             else:
+                 self.report.report_failure(
+                     title="Failed to Retrieve Queries",
+                     message="Unable to retrieve queries for report token.",
+                     context=f"Report Token: {report_token}, Error: {str(e)}",
+                 )
          return queries

      @lru_cache(maxsize=None)
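Note: a standalone sketch of the error-handling pattern introduced above, using requests directly. The helper name and URL are hypothetical; in mode.py the same check runs inside the ModeRequestError handler and reports through the source report.

import requests
from requests import HTTPError


def fetch_queries(session: requests.Session, url: str) -> dict:
    # A 404 is treated as a warning (the report may simply have been deleted);
    # any other HTTP error is treated as a failure.
    try:
        response = session.get(url)
        response.raise_for_status()
        return response.json().get("_embedded", {}).get("queries", {})
    except HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            print(f"warning: no queries found at {url}: {e}")
        else:
            print(f"failure: unable to retrieve queries from {url}: {e}")
        return {}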
datahub/ingestion/source/sql/hive.py CHANGED
@@ -867,3 +867,18 @@ class HiveSource(TwoTierSQLAlchemySource):
              return partition_column.get("column_names")

          return []
+
+     def get_table_properties(
+         self, inspector: Inspector, schema: str, table: str
+     ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]:
+         (description, properties, location) = super().get_table_properties(
+             inspector, schema, table
+         )
+
+         new_properties = {}
+         for key, value in properties.items():
+             if key and key[-1] == ":":
+                 new_properties[key[:-1]] = value
+             else:
+                 new_properties[key] = value
+         return (description, new_properties, location)
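Note: the new override normalizes property keys that arrive with a trailing colon. A self-contained sketch of that rule (the sample keys are illustrative):

from typing import Dict


def normalize_property_keys(properties: Dict[str, str]) -> Dict[str, str]:
    # Same rule as HiveSource.get_table_properties above: drop a single trailing ":".
    return {
        (key[:-1] if key and key[-1] == ":" else key): value
        for key, value in properties.items()
    }


assert normalize_property_keys({"comment:": "daily snapshot", "owner": "etl"}) == {
    "comment": "daily snapshot",
    "owner": "etl",
}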
datahub/ingestion/source/superset.py CHANGED
@@ -166,6 +166,7 @@ class SupersetDataset(BaseModel):
  class SupersetConfig(
      StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
  ):
+     # TODO: Add support for missing dataPlatformInstance/containers
      # See the Superset /security/login endpoint for details
      # https://superset.apache.org/docs/rest-api
      connect_uri: str = Field(
@@ -177,7 +178,7 @@ class SupersetConfig(
      )
      domain: Dict[str, AllowDenyPattern] = Field(
          default=dict(),
-         description="regex patterns for tables to filter to assign domain_key. ",
+         description="Regex patterns for tables to filter to assign domain_key. ",
      )
      dataset_pattern: AllowDenyPattern = Field(
          default=AllowDenyPattern.allow_all(),
@@ -191,6 +192,10 @@ class SupersetConfig(
          AllowDenyPattern.allow_all(),
          description="Patterns for selecting dashboard names that are to be included",
      )
+     database_pattern: AllowDenyPattern = Field(
+         default=AllowDenyPattern.allow_all(),
+         description="Regex patterns for databases to filter in ingestion.",
+     )
      username: Optional[str] = Field(default=None, description="Superset username.")
      password: Optional[str] = Field(default=None, description="Superset password.")
      # Configuration for stateful ingestion
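Note: a short sketch of how the new database_pattern is evaluated, using AllowDenyPattern directly; the database names below are made up.

from datahub.configuration.common import AllowDenyPattern

# Hypothetical pattern: ingest everything except databases whose name starts with "staging_".
database_pattern = AllowDenyPattern(deny=["^staging_.*"])

assert database_pattern.allowed("analytics")
assert not database_pattern.allowed("staging_scratch")

# The source only performs the extra per-dataset database lookups when the
# configured pattern differs from the allow-all default.
assert database_pattern != AllowDenyPattern.allow_all()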
@@ -297,6 +302,9 @@ class SupersetSource(StatefulIngestionSourceBase):
          )
          self.session = self.login()
          self.owner_info = self.parse_owner_info()
+         self.filtered_dataset_to_database: Dict[int, str] = {}
+         self.filtered_chart_to_database: Dict[int, str] = {}
+         self.processed_charts: Dict[int, Tuple[Optional[str], bool]] = {}

      def login(self) -> requests.Session:
          login_response = requests.post(
@@ -519,12 +527,57 @@ class SupersetSource(StatefulIngestionSourceBase):
                      f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                  )
                  return
+
+             if self.config.database_pattern != AllowDenyPattern.allow_all():
+                 raw_position_data = dashboard_data.get("position_json", "{}")
+                 position_data = (
+                     json.loads(raw_position_data)
+                     if raw_position_data is not None
+                     else {}
+                 )
+
+                 chart_ids = []
+                 for key, value in position_data.items():
+                     if not key.startswith("CHART-"):
+                         continue
+                     chart_id = value.get("meta", {}).get("chartId")
+                     if chart_id:
+                         chart_ids.append(chart_id)
+
+                 for chart_id in chart_ids:
+                     if chart_id in self.processed_charts:
+                         database_name, is_filtered = self.processed_charts[chart_id]
+                         if is_filtered:
+                             self.report.warning(
+                                 message="Dashboard contains charts using datasets from a filtered database. Set the dashboard pattern to deny ingestion.",
+                                 context=str(
+                                     dict(
+                                         dashboard_id=dashboard_id,
+                                         dashboard_title=dashboard_title,
+                                         chart_id=chart_id,
+                                         database_name=database_name,
+                                     )
+                                 ),
+                                 title="Incomplete Ingestion",
+                             )
+
              dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+
          except Exception as e:
              self.report.warning(
-                 f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+                 message="Failed to construct dashboard snapshot. This dashboard will not be ingested.",
+                 context=str(
+                     dict(
+                         dashboard_id=dashboard_id,
+                         dashboard_title=dashboard_title,
+                         error=str(e),
+                     )
+                 ),
+                 title="Dashboard Construction Failed",
+                 exc=e,
              )
              return
+
          mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
          yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
          yield from self._get_domain_wu(
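Note: the dashboard-side check above walks the dashboard's position_json to find its charts. A minimal standalone sketch of that extraction, with a made-up payload:

import json
from typing import Any, Dict, List


def extract_chart_ids(raw_position_data: Any) -> List[int]:
    # Mirrors the loop above: only "CHART-..." entries carry a chartId in their meta block.
    position_data: Dict[str, Any] = (
        json.loads(raw_position_data) if raw_position_data is not None else {}
    )
    chart_ids = []
    for key, value in position_data.items():
        if not key.startswith("CHART-"):
            continue
        chart_id = value.get("meta", {}).get("chartId")
        if chart_id:
            chart_ids.append(chart_id)
    return chart_ids


sample = json.dumps({"HEADER_ID": {"type": "HEADER"}, "CHART-abc123": {"meta": {"chartId": 42}}})
assert extract_chart_ids(sample) == [42]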
@@ -780,14 +833,80 @@ class SupersetSource(StatefulIngestionSourceBase):

      def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
          chart_name = ""
+         database_name = None
          try:
-             chart_id = str(chart_data.get("id"))
+             chart_id = chart_data.get("id")
              chart_name = chart_data.get("slice_name", "")
              if not self.config.chart_pattern.allowed(chart_name):
                  self.report.report_dropped(
                      f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
                  )
                  return
+
+             # TODO: Make helper methods for database_pattern
+             if self.config.database_pattern != AllowDenyPattern.allow_all():
+                 datasource_id = chart_data.get("datasource_id")
+
+                 if datasource_id:
+                     if datasource_id in self.filtered_dataset_to_database:
+                         database_name = self.filtered_dataset_to_database[datasource_id]
+                         self.filtered_chart_to_database[chart_id] = database_name
+
+                         is_filtered = not self.config.database_pattern.allowed(
+                             database_name
+                         )
+                         self.processed_charts[chart_id] = (database_name, is_filtered)
+
+                         if is_filtered:
+                             self.report.warning(
+                                 message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+                                 context=str(
+                                     dict(
+                                         chart_id=chart_id,
+                                         chart_name=chart_name,
+                                         database_name=database_name,
+                                     )
+                                 ),
+                                 title="Incomplete Ingestion",
+                             )
+
+                     else:
+                         dataset_response = self.get_dataset_info(datasource_id)
+                         database_name = (
+                             dataset_response.get("result", {})
+                             .get("database", {})
+                             .get("database_name")
+                         )
+
+                         if database_name:
+                             is_filtered = not self.config.database_pattern.allowed(
+                                 database_name
+                             )
+                             if is_filtered:
+                                 self.filtered_chart_to_database[chart_id] = (
+                                     database_name
+                                 )
+                                 self.filtered_dataset_to_database[datasource_id] = (
+                                     database_name
+                                 )
+                             self.processed_charts[chart_id] = (
+                                 database_name,
+                                 is_filtered,
+                             )
+
+                             if is_filtered:
+                                 self.report.warning(
+                                     message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+                                     context=str(
+                                         dict(
+                                             chart_id=chart_id,
+                                             chart_name=chart_name,
+                                             database_name=database_name,
+                                         )
+                                     ),
+                                     title="Incomplete Ingestion",
+                                 )
+
              if self.config.dataset_pattern != AllowDenyPattern.allow_all():
                  datasource_id = chart_data.get("datasource_id")
                  if datasource_id:
@@ -799,12 +918,28 @@ class SupersetSource(StatefulIngestionSourceBase):
                          dataset_name
                      ):
                          self.report.warning(
-                             f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+                             message="Chart uses a dataset that was filtered by dataset pattern. Update your dataset pattern to include this dataset.",
+                             context=str(
+                                 dict(
+                                     chart_id=chart_id,
+                                     chart_name=chart_name,
+                                     dataset_name=dataset_name,
+                                 )
+                             ),
+                             title="Incomplete Ingestion",
                          )
+             if chart_id not in self.processed_charts:
+                 self.processed_charts[chart_id] = (database_name, False)
+
              yield from self.construct_chart_from_chart_data(chart_data)
          except Exception as e:
              self.report.warning(
-                 f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
+                 message="Failed to construct chart snapshot. This chart will not be ingested.",
+                 context=str(
+                     dict(chart_id=chart_id, chart_name=chart_name, error=str(e))
+                 ),
+                 title="Chart Construction Failed",
+                 exc=e,
              )
              return

@@ -1048,12 +1183,30 @@ class SupersetSource(StatefulIngestionSourceBase):
      def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
          dataset_name = ""
          try:
+             dataset_id = dataset_data.get("id")
              dataset_name = dataset_data.get("table_name", "")
              if not self.config.dataset_pattern.allowed(dataset_name):
                  self.report.report_dropped(
                      f"Dataset '{dataset_name}' filtered by dataset_pattern"
                  )
                  return
+             if self.config.database_pattern != AllowDenyPattern.allow_all():
+                 dataset_response = self.get_dataset_info(dataset_id)
+                 database_name = (
+                     dataset_response.get("result", {})
+                     .get("database", {})
+                     .get("database_name")
+                 )
+
+                 if database_name and not self.config.database_pattern.allowed(
+                     database_name
+                 ):
+                     self.filtered_dataset_to_database[dataset_id] = database_name
+                     self.report.report_dropped(
+                         f"Dataset '{dataset_name}' filtered by database_pattern with database '{database_name}'"
+                     )
+                     return
+
              dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
              mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
          except Exception as e:
@@ -1079,12 +1232,13 @@ class SupersetSource(StatefulIngestionSourceBase):
          )

      def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-         if self.config.ingest_dashboards:
-             yield from self.emit_dashboard_mces()
-         if self.config.ingest_charts:
-             yield from self.emit_chart_mces()
+         # TODO: Possibly change ingestion order to minimize API calls
          if self.config.ingest_datasets:
              yield from self.emit_dataset_mces()
+         if self.config.ingest_charts:
+             yield from self.emit_chart_mces()
+         if self.config.ingest_dashboards:
+             yield from self.emit_dashboard_mces()

      def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
          return [