acryl-datahub 1.0.0.4rc3__py3-none-any.whl → 1.0.0.4rc5__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,7 +1,7 @@
- acryl_datahub-1.0.0.4rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.4rc5.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=s9GagN9LnstD0j2tgPma0WgwFBeEF2rYkuJB2EEtcJY,323
+ datahub/_version.py,sha256=C9cJOHt7RVINEp5u9HEU9BQHiZS7hSsr2uIJMS3Pdjw,323
  datahub/entrypoints.py,sha256=AQN5MzCe6q3LKI4SS6WmwN56kgjF6AC1ld7yELWVP2w,8953
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -207,7 +207,7 @@ datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suab
  datahub/ingestion/source/ldap.py,sha256=PKoA5pVjuIxFfW1TcbYNIWSm7-C7shK2FDn7Zo5mrVM,18705
  datahub/ingestion/source/metabase.py,sha256=j8DRV2GvisezidL1JZ5HJLF_hdFdtvaoyDoEdEyh0Ks,32603
  datahub/ingestion/source/mlflow.py,sha256=fh7izN9jlSwbpGIrEyJktlmwFZR5vNG9z9L5VQ31k_4,33141
- datahub/ingestion/source/mode.py,sha256=uOm6Hk-3ybAYkL7qRKWbrp9P78KjrpOXUGCOgyoPg8g,66309
+ datahub/ingestion/source/mode.py,sha256=Hs_qi-ntsuW_DIJ2wvMsFjir6P_QkRo4mq5Ldk7c1iU,66709
  datahub/ingestion/source/mongodb.py,sha256=2C2Cxn8DXL53IbNiywIuKt8UT_EMcPg9f8su-OPSNGU,21237
  datahub/ingestion/source/nifi.py,sha256=D1gBXxdpLuUQ0eurwofIR_SGg1rHGhwk3qxsWI1PT9c,56882
  datahub/ingestion/source/openapi.py,sha256=zx976zstg6M2KoTz_iKKgU9VETDeX2rnw6BofiHXbDc,18669
@@ -218,7 +218,7 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
  datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
  datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
- datahub/ingestion/source/superset.py,sha256=bMfvm9HgUoS3T7BjHsDrrOodc8iBRrJRQYv2D66bABo,41194
+ datahub/ingestion/source/superset.py,sha256=acxKU8XkaCNvhcv0CwU27_dYTdV5iR45BPcc83SR_T0,48380
  datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
  datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -343,7 +343,8 @@ datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
  datahub/ingestion/source/identity/azure_ad.py,sha256=9Hrvm4CSfc02yjnPUsCYSY4Qw9fXPnDFWLexab0mcpc,28559
  datahub/ingestion/source/identity/okta.py,sha256=jC21myJuMRTaPgj0OD9heaC-mz8ECjqpy2hSJwlUSwM,31943
  datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/kafka/kafka.py,sha256=Bo9l8KbCle8ZUOyJMOo2HsJcT63EM_R7bCrPS-FhWT4,26512
+ datahub/ingestion/source/kafka/kafka.py,sha256=HMoe1P0QE9JlcX6MNEALTgz7LsmG-HUXVuWnk3jkRo8,22900
+ datahub/ingestion/source/kafka/kafka_config.py,sha256=ijUB8PS5p-o3uLCHkAxAJAIM88s47rVaAUYXmi_lR4M,4406
  datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/kafka_connect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/kafka_connect/common.py,sha256=lH64n1v_rJamWGfidBeuQJj8W1_IvOBpXQLR2YZaEvQ,7057
@@ -475,7 +476,7 @@ datahub/ingestion/source/sql/clickhouse.py,sha256=9Fvaic9FZufRKdhVz2EcPUnEt5cA9V
  datahub/ingestion/source/sql/cockroachdb.py,sha256=XaD7eae34plU9ISRC6PzYX9q6RdT2qkzjH6CpTOgkx4,1443
  datahub/ingestion/source/sql/druid.py,sha256=_tzgTa5jhPUXk6WCmS7p10feCwJm6yUFcOgMZA-OcE8,2922
  datahub/ingestion/source/sql/hana.py,sha256=0PIvcX0Rz59NyR7Ag5Bv1MBV_UbJwxl9UAopo_xe_CA,1342
- datahub/ingestion/source/sql/hive.py,sha256=voQl6QjHUXPdx7LPONuHiFavW9nRKMjHZx7o3vJQG7A,31034
+ datahub/ingestion/source/sql/hive.py,sha256=E5ZuGHoJmLQDMpUQFXPUc69Zbjv9QxGqtocFu_S4hbw,31590
  datahub/ingestion/source/sql/hive_metastore.py,sha256=qpX9eCRm-zq3DKC49MaZP9vzGot9QIDfaaeFgXGbOuM,36283
  datahub/ingestion/source/sql/mariadb.py,sha256=Hm102kmfs_1rd4lsTYhzVMZq5S3B6cyfvpHSzJjqvMw,737
  datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_k94o7g-0,3350
@@ -910,7 +911,7 @@ datahub/sdk/container.py,sha256=yw_vw9Jl1wOYNwMHxQHLz5ZvVQVDWWHi9CWBR3hOCd8,7547
  datahub/sdk/dataset.py,sha256=7PlmqKDROJvsh1CtlhG8owOhLdelHVbSSg5hA5Kbwp0,29150
  datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
  datahub/sdk/entity_client.py,sha256=1AC9J7-jv3rD-MFEPz2PnFrT8nFkj_WO0M-4nyVOtQk,5319
- datahub/sdk/lineage_client.py,sha256=1JPxQx5pZYQwnzL_VX3KjAbX2QWdvjSKm8S8Bya2IAc,8617
+ datahub/sdk/lineage_client.py,sha256=NJu9SbTNMWplFsiT5WWgqJ4ypD76y7Sm6I3btZ78rdE,13368
  datahub/sdk/main_client.py,sha256=agOPt93N2uYLuHdiDSJyk2xXZtZiYHvEbJC1VN5PCyo,4355
  datahub/sdk/mlmodel.py,sha256=amS-hHg5tT7zAqEHG17kSA60Q7td2DFtO-W2rEfb2rY,10206
  datahub/sdk/mlmodelgroup.py,sha256=_7IkqkLVeyqYVEUHTVePSDLQyESsnwht5ca1lcMODAg,7842
@@ -1053,8 +1054,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0.4rc3.dist-info/METADATA,sha256=QpiKmTUaWMGnd69dnDEHug3N-2SHZdTa1LzKxcIDnf4,179949
- acryl_datahub-1.0.0.4rc3.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
- acryl_datahub-1.0.0.4rc3.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
- acryl_datahub-1.0.0.4rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0.4rc3.dist-info/RECORD,,
+ acryl_datahub-1.0.0.4rc5.dist-info/METADATA,sha256=B-WcxIPWm4TjOoKAuGZzHhvkLHMLkjg4SfUMPMhYVhc,179949
+ acryl_datahub-1.0.0.4rc5.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+ acryl_datahub-1.0.0.4rc5.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+ acryl_datahub-1.0.0.4rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.4rc5.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0.4rc3"
+ __version__ = "1.0.0.4rc5"


  def is_dev_mode() -> bool:
datahub/ingestion/source/kafka/kafka.py CHANGED
@@ -7,7 +7,6 @@ from typing import Any, Dict, Iterable, List, Optional, Type, cast
  import avro.schema
  import confluent_kafka
  import confluent_kafka.admin
- import pydantic
  from confluent_kafka.admin import (
      AdminClient,
      ConfigEntry,
@@ -16,13 +15,8 @@ from confluent_kafka.admin import (
  )
  from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient

- from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.kafka import KafkaConsumerConnectionConfig
  from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
- from datahub.configuration.source_common import (
-     DatasetSourceConfigMixin,
-     LowerCaseDatasetUrnConfigMixin,
- )
  from datahub.emitter import mce_builder
  from datahub.emitter.mce_builder import (
      make_data_platform_urn,
@@ -50,16 +44,15 @@ from datahub.ingestion.api.source import (
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+ from datahub.ingestion.source.kafka.kafka_config import KafkaSourceConfig
  from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
      KafkaSchemaRegistryBase,
  )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalHandler,
      StaleEntityRemovalSourceReport,
-     StatefulStaleMetadataRemovalConfig,
  )
  from datahub.ingestion.source.state.stateful_ingestion_base import (
-     StatefulIngestionConfigBase,
      StatefulIngestionSourceBase,
  )
  from datahub.metadata.com.linkedin.pegasus2avro.common import Status
@@ -90,64 +83,6 @@ class KafkaTopicConfigKeys(StrEnum):
      UNCLEAN_LEADER_ELECTION_CONFIG = "unclean.leader.election.enable"


- class KafkaSourceConfig(
-     StatefulIngestionConfigBase,
-     DatasetSourceConfigMixin,
-     LowerCaseDatasetUrnConfigMixin,
- ):
-     connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
-
-     topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
-     domain: Dict[str, AllowDenyPattern] = pydantic.Field(
-         default={},
-         description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
-     )
-     topic_subject_map: Dict[str, str] = pydantic.Field(
-         default={},
-         description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
-     )
-     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
-     schema_registry_class: str = pydantic.Field(
-         default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
-         description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
-     )
-     schema_tags_field: str = pydantic.Field(
-         default="tags",
-         description="The field name in the schema metadata that contains the tags to be added to the dataset.",
-     )
-     enable_meta_mapping: bool = pydantic.Field(
-         default=True,
-         description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
-     )
-     meta_mapping: Dict = pydantic.Field(
-         default={},
-         description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
-     )
-     field_meta_mapping: Dict = pydantic.Field(
-         default={},
-         description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
-     )
-     strip_user_ids_from_email: bool = pydantic.Field(
-         default=False,
-         description="Whether or not to strip email id while adding owners using meta mappings.",
-     )
-     tag_prefix: str = pydantic.Field(
-         default="", description="Prefix added to tags during ingestion."
-     )
-     ignore_warnings_on_schema_type: bool = pydantic.Field(
-         default=False,
-         description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
-     )
-     disable_topic_record_naming_strategy: bool = pydantic.Field(
-         default=False,
-         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
-     )
-     ingest_schemas_as_entities: bool = pydantic.Field(
-         default=False,
-         description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
-     )
-
-
  def get_kafka_consumer(
      connection: KafkaConsumerConnectionConfig,
  ) -> confluent_kafka.Consumer:
@@ -430,6 +365,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):

          # 4. Set dataset's description, tags, ownership, etc, if topic schema type is avro
          description: Optional[str] = None
+         external_url: Optional[str] = None
          if (
              schema_metadata is not None
              and isinstance(schema_metadata.platformSchema, KafkaSchemaClass)
@@ -481,8 +417,16 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
                  mce_builder.make_global_tag_aspect_with_tag_list(all_tags)
              )

+         if self.source_config.external_url_base:
+             # Remove trailing slash from base URL if present
+             base_url = self.source_config.external_url_base.rstrip("/")
+             external_url = f"{base_url}/{dataset_name}"
+
          dataset_properties = DatasetPropertiesClass(
-             name=dataset_name, customProperties=custom_props, description=description
+             name=dataset_name,
+             customProperties=custom_props,
+             description=description,
+             externalUrl=external_url,
          )
          dataset_snapshot.aspects.append(dataset_properties)

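Note: a minimal, self-contained sketch of the externalUrl behaviour added above. The base URL and topic name below are made-up example values; in the source they come from the new external_url_base config option and the ingested topic name.

from typing import Optional


def build_external_url(external_url_base: Optional[str], dataset_name: str) -> Optional[str]:
    # Mirrors the new kafka.py logic: strip a trailing slash, then append the topic name.
    if not external_url_base:
        return None
    return f"{external_url_base.rstrip('/')}/{dataset_name}"


assert build_external_url("https://example-kafka-console/topics/", "my_topic") == (
    "https://example-kafka-console/topics/my_topic"
)
assert build_external_url(None, "my_topic") is None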
datahub/ingestion/source/kafka/kafka_config.py ADDED
@@ -0,0 +1,78 @@
+ from typing import Dict, Optional
+
+ from pydantic import Field
+
+ from datahub.configuration.common import AllowDenyPattern
+ from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+ from datahub.configuration.source_common import (
+     DatasetSourceConfigMixin,
+     LowerCaseDatasetUrnConfigMixin,
+ )
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
+     StatefulStaleMetadataRemovalConfig,
+ )
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
+     StatefulIngestionConfigBase,
+ )
+
+
+ class KafkaSourceConfig(
+     StatefulIngestionConfigBase,
+     DatasetSourceConfigMixin,
+     LowerCaseDatasetUrnConfigMixin,
+ ):
+     connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig()
+
+     topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"])
+     domain: Dict[str, AllowDenyPattern] = Field(
+         default={},
+         description="A map of domain names to allow deny patterns. Domains can be urn-based (`urn:li:domain:13ae4d85-d955-49fc-8474-9004c663a810`) or bare (`13ae4d85-d955-49fc-8474-9004c663a810`).",
+     )
+     topic_subject_map: Dict[str, str] = Field(
+         default={},
+         description="Provides the mapping for the `key` and the `value` schemas of a topic to the corresponding schema registry subject name. Each entry of this map has the form `<topic_name>-key`:`<schema_registry_subject_name_for_key_schema>` and `<topic_name>-value`:`<schema_registry_subject_name_for_value_schema>` for the key and the value schemas associated with the topic, respectively. This parameter is mandatory when the [RecordNameStrategy](https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#how-the-naming-strategies-work) is used as the subject naming strategy in the kafka schema registry. NOTE: When provided, this overrides the default subject name resolution even when the `TopicNameStrategy` or the `TopicRecordNameStrategy` are used.",
+     )
+     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+     schema_registry_class: str = Field(
+         default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry",
+         description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.",
+     )
+     schema_tags_field: str = Field(
+         default="tags",
+         description="The field name in the schema metadata that contains the tags to be added to the dataset.",
+     )
+     enable_meta_mapping: bool = Field(
+         default=True,
+         description="When enabled, applies the mappings that are defined through the meta_mapping directives.",
+     )
+     meta_mapping: Dict = Field(
+         default={},
+         description="mapping rules that will be executed against top-level schema properties. Refer to the section below on meta automated mappings.",
+     )
+     field_meta_mapping: Dict = Field(
+         default={},
+         description="mapping rules that will be executed against field-level schema properties. Refer to the section below on meta automated mappings.",
+     )
+     strip_user_ids_from_email: bool = Field(
+         default=False,
+         description="Whether or not to strip email id while adding owners using meta mappings.",
+     )
+     tag_prefix: str = Field(
+         default="", description="Prefix added to tags during ingestion."
+     )
+     ignore_warnings_on_schema_type: bool = Field(
+         default=False,
+         description="Disables warnings reported for non-AVRO/Protobuf value or key schemas if set.",
+     )
+     disable_topic_record_naming_strategy: bool = Field(
+         default=False,
+         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
+     )
+     ingest_schemas_as_entities: bool = Field(
+         default=False,
+         description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+     )
+     external_url_base: Optional[str] = Field(
+         default=None,
+         description="Base URL for external platform (e.g. Aiven) where topics can be viewed. The topic name will be appended to this base URL.",
+     )
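Note: a rough usage sketch for the relocated config class, assuming the declared defaults validate without a live Kafka connection (they are plain pydantic fields) and using a hypothetical console URL:

from datahub.ingestion.source.kafka.kafka_config import KafkaSourceConfig

# Hypothetical value; external_url_base is optional and defaults to None.
config = KafkaSourceConfig(
    external_url_base="https://example-kafka-console/project/demo/topics",
)

# Defaults declared above: allow all topics except those starting with "_".
assert config.topic_patterns.allowed("orders")
assert not config.topic_patterns.allowed("_internal")
assert config.external_url_base is not None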
datahub/ingestion/source/mode.py CHANGED
@@ -1457,11 +1457,18 @@ class ModeSource(StatefulIngestionSourceBase):
              )
              queries = queries_json.get("_embedded", {}).get("queries", {})
          except ModeRequestError as e:
-             self.report.report_failure(
-                 title="Failed to Retrieve Queries",
-                 message="Unable to retrieve queries for report token.",
-                 context=f"Report Token: {report_token}, Error: {str(e)}",
-             )
+             if isinstance(e, HTTPError) and e.response.status_code == 404:
+                 self.report.report_warning(
+                     title="No Queries Found",
+                     message="No queries found for the report token. Maybe the report is deleted...",
+                     context=f"Report Token: {report_token}, Error: {str(e)}",
+                 )
+             else:
+                 self.report.report_failure(
+                     title="Failed to Retrieve Queries",
+                     message="Unable to retrieve queries for report token.",
+                     context=f"Report Token: {report_token}, Error: {str(e)}",
+                 )
          return queries

      @lru_cache(maxsize=None)
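Note: a standalone sketch of the error-handling pattern introduced above, using requests directly. The helper name and URL are hypothetical; in mode.py the same check runs inside the ModeRequestError handler and reports through the source report.

import requests
from requests import HTTPError


def fetch_queries(session: requests.Session, url: str) -> dict:
    # A 404 is treated as a warning (the report may simply have been deleted);
    # any other HTTP error is treated as a failure.
    try:
        response = session.get(url)
        response.raise_for_status()
        return response.json().get("_embedded", {}).get("queries", {})
    except HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            print(f"warning: no queries found at {url}: {e}")
        else:
            print(f"failure: unable to retrieve queries from {url}: {e}")
        return {}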
datahub/ingestion/source/sql/hive.py CHANGED
@@ -867,3 +867,18 @@ class HiveSource(TwoTierSQLAlchemySource):
              return partition_column.get("column_names")

          return []
+
+     def get_table_properties(
+         self, inspector: Inspector, schema: str, table: str
+     ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]:
+         (description, properties, location) = super().get_table_properties(
+             inspector, schema, table
+         )
+
+         new_properties = {}
+         for key, value in properties.items():
+             if key and key[-1] == ":":
+                 new_properties[key[:-1]] = value
+             else:
+                 new_properties[key] = value
+         return (description, new_properties, location)
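Note: the new override normalizes property keys that arrive with a trailing colon. A self-contained sketch of that rule (the sample keys are illustrative):

from typing import Dict


def normalize_property_keys(properties: Dict[str, str]) -> Dict[str, str]:
    # Same rule as HiveSource.get_table_properties above: drop a single trailing ":".
    return {
        (key[:-1] if key and key[-1] == ":" else key): value
        for key, value in properties.items()
    }


assert normalize_property_keys({"comment:": "daily snapshot", "owner": "etl"}) == {
    "comment": "daily snapshot",
    "owner": "etl",
}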
datahub/ingestion/source/superset.py CHANGED
@@ -166,6 +166,7 @@ class SupersetDataset(BaseModel):
  class SupersetConfig(
      StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
  ):
+     # TODO: Add support for missing dataPlatformInstance/containers
      # See the Superset /security/login endpoint for details
      # https://superset.apache.org/docs/rest-api
      connect_uri: str = Field(
@@ -177,7 +178,7 @@ class SupersetConfig(
      )
      domain: Dict[str, AllowDenyPattern] = Field(
          default=dict(),
-         description="regex patterns for tables to filter to assign domain_key. ",
+         description="Regex patterns for tables to filter to assign domain_key. ",
      )
      dataset_pattern: AllowDenyPattern = Field(
          default=AllowDenyPattern.allow_all(),
@@ -191,6 +192,10 @@ class SupersetConfig(
          AllowDenyPattern.allow_all(),
          description="Patterns for selecting dashboard names that are to be included",
      )
+     database_pattern: AllowDenyPattern = Field(
+         default=AllowDenyPattern.allow_all(),
+         description="Regex patterns for databases to filter in ingestion.",
+     )
      username: Optional[str] = Field(default=None, description="Superset username.")
      password: Optional[str] = Field(default=None, description="Superset password.")
      # Configuration for stateful ingestion
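Note: a short sketch of how the new database_pattern is evaluated, using AllowDenyPattern directly; the database names below are made up.

from datahub.configuration.common import AllowDenyPattern

# Hypothetical pattern: ingest everything except databases whose name starts with "staging_".
database_pattern = AllowDenyPattern(deny=["^staging_.*"])

assert database_pattern.allowed("analytics")
assert not database_pattern.allowed("staging_scratch")

# The source only performs the extra per-dataset database lookups when the
# configured pattern differs from the allow-all default.
assert database_pattern != AllowDenyPattern.allow_all()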
@@ -297,6 +302,9 @@ class SupersetSource(StatefulIngestionSourceBase):
          )
          self.session = self.login()
          self.owner_info = self.parse_owner_info()
+         self.filtered_dataset_to_database: Dict[int, str] = {}
+         self.filtered_chart_to_database: Dict[int, str] = {}
+         self.processed_charts: Dict[int, Tuple[Optional[str], bool]] = {}

      def login(self) -> requests.Session:
          login_response = requests.post(
@@ -519,12 +527,57 @@ class SupersetSource(StatefulIngestionSourceBase):
                      f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
                  )
                  return
+
+             if self.config.database_pattern != AllowDenyPattern.allow_all():
+                 raw_position_data = dashboard_data.get("position_json", "{}")
+                 position_data = (
+                     json.loads(raw_position_data)
+                     if raw_position_data is not None
+                     else {}
+                 )
+
+                 chart_ids = []
+                 for key, value in position_data.items():
+                     if not key.startswith("CHART-"):
+                         continue
+                     chart_id = value.get("meta", {}).get("chartId")
+                     if chart_id:
+                         chart_ids.append(chart_id)
+
+                 for chart_id in chart_ids:
+                     if chart_id in self.processed_charts:
+                         database_name, is_filtered = self.processed_charts[chart_id]
+                         if is_filtered:
+                             self.report.warning(
+                                 message="Dashboard contains charts using datasets from a filtered database. Set the dashboard pattern to deny ingestion.",
+                                 context=str(
+                                     dict(
+                                         dashboard_id=dashboard_id,
+                                         dashboard_title=dashboard_title,
+                                         chart_id=chart_id,
+                                         database_name=database_name,
+                                     )
+                                 ),
+                                 title="Incomplete Ingestion",
+                             )
+
              dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+
          except Exception as e:
              self.report.warning(
-                 f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+                 message="Failed to construct dashboard snapshot. This dashboard will not be ingested.",
+                 context=str(
+                     dict(
+                         dashboard_id=dashboard_id,
+                         dashboard_title=dashboard_title,
+                         error=str(e),
+                     )
+                 ),
+                 title="Dashboard Construction Failed",
+                 exc=e,
              )
              return
+
          mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
          yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
          yield from self._get_domain_wu(
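Note: the dashboard-side check above walks the dashboard's position_json to find its charts. A minimal standalone sketch of that extraction, with a made-up payload:

import json
from typing import Any, Dict, List


def extract_chart_ids(raw_position_data: Any) -> List[int]:
    # Mirrors the loop above: only "CHART-..." entries carry a chartId in their meta block.
    position_data: Dict[str, Any] = (
        json.loads(raw_position_data) if raw_position_data is not None else {}
    )
    chart_ids = []
    for key, value in position_data.items():
        if not key.startswith("CHART-"):
            continue
        chart_id = value.get("meta", {}).get("chartId")
        if chart_id:
            chart_ids.append(chart_id)
    return chart_ids


sample = json.dumps({"HEADER_ID": {"type": "HEADER"}, "CHART-abc123": {"meta": {"chartId": 42}}})
assert extract_chart_ids(sample) == [42]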
@@ -780,14 +833,80 @@ class SupersetSource(StatefulIngestionSourceBase):

      def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
          chart_name = ""
+         database_name = None
          try:
-             chart_id = str(chart_data.get("id"))
+             chart_id = chart_data.get("id")
              chart_name = chart_data.get("slice_name", "")
              if not self.config.chart_pattern.allowed(chart_name):
                  self.report.report_dropped(
                      f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
                  )
                  return
+
+             # TODO: Make helper methods for database_pattern
+             if self.config.database_pattern != AllowDenyPattern.allow_all():
+                 datasource_id = chart_data.get("datasource_id")
+
+                 if datasource_id:
+                     if datasource_id in self.filtered_dataset_to_database:
+                         database_name = self.filtered_dataset_to_database[datasource_id]
+                         self.filtered_chart_to_database[chart_id] = database_name
+
+                         is_filtered = not self.config.database_pattern.allowed(
+                             database_name
+                         )
+                         self.processed_charts[chart_id] = (database_name, is_filtered)
+
+                         if is_filtered:
+                             self.report.warning(
+                                 message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+                                 context=str(
+                                     dict(
+                                         chart_id=chart_id,
+                                         chart_name=chart_name,
+                                         database_name=database_name,
+                                     )
+                                 ),
+                                 title="Incomplete Ingestion",
+                             )
+
+                     else:
+                         dataset_response = self.get_dataset_info(datasource_id)
+                         database_name = (
+                             dataset_response.get("result", {})
+                             .get("database", {})
+                             .get("database_name")
+                         )
+
+                         if database_name:
+                             is_filtered = not self.config.database_pattern.allowed(
+                                 database_name
+                             )
+                             if is_filtered:
+                                 self.filtered_chart_to_database[chart_id] = (
+                                     database_name
+                                 )
+                                 self.filtered_dataset_to_database[datasource_id] = (
+                                     database_name
+                                 )
+                             self.processed_charts[chart_id] = (
+                                 database_name,
+                                 is_filtered,
+                             )
+
+                             if is_filtered:
+                                 self.report.warning(
+                                     message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+                                     context=str(
+                                         dict(
+                                             chart_id=chart_id,
+                                             chart_name=chart_name,
+                                             database_name=database_name,
+                                         )
+                                     ),
+                                     title="Incomplete Ingestion",
+                                 )
+
              if self.config.dataset_pattern != AllowDenyPattern.allow_all():
                  datasource_id = chart_data.get("datasource_id")
                  if datasource_id:
@@ -799,12 +918,28 @@ class SupersetSource(StatefulIngestionSourceBase):
                          dataset_name
                      ):
                          self.report.warning(
-                             f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+                             message="Chart uses a dataset that was filtered by dataset pattern. Update your dataset pattern to include this dataset.",
+                             context=str(
+                                 dict(
+                                     chart_id=chart_id,
+                                     chart_name=chart_name,
+                                     dataset_name=dataset_name,
+                                 )
+                             ),
+                             title="Incomplete Ingestion",
                          )
+             if chart_id not in self.processed_charts:
+                 self.processed_charts[chart_id] = (database_name, False)
+
              yield from self.construct_chart_from_chart_data(chart_data)
          except Exception as e:
              self.report.warning(
-                 f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
+                 message="Failed to construct chart snapshot. This chart will not be ingested.",
+                 context=str(
+                     dict(chart_id=chart_id, chart_name=chart_name, error=str(e))
+                 ),
+                 title="Chart Construction Failed",
+                 exc=e,
              )
              return

@@ -1048,12 +1183,30 @@ class SupersetSource(StatefulIngestionSourceBase):
      def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
          dataset_name = ""
          try:
+             dataset_id = dataset_data.get("id")
              dataset_name = dataset_data.get("table_name", "")
              if not self.config.dataset_pattern.allowed(dataset_name):
                  self.report.report_dropped(
                      f"Dataset '{dataset_name}' filtered by dataset_pattern"
                  )
                  return
+             if self.config.database_pattern != AllowDenyPattern.allow_all():
+                 dataset_response = self.get_dataset_info(dataset_id)
+                 database_name = (
+                     dataset_response.get("result", {})
+                     .get("database", {})
+                     .get("database_name")
+                 )
+
+                 if database_name and not self.config.database_pattern.allowed(
+                     database_name
+                 ):
+                     self.filtered_dataset_to_database[dataset_id] = database_name
+                     self.report.report_dropped(
+                         f"Dataset '{dataset_name}' filtered by database_pattern with database '{database_name}'"
+                     )
+                     return
+
              dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
              mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
          except Exception as e:
@@ -1079,12 +1232,13 @@ class SupersetSource(StatefulIngestionSourceBase):
          )

      def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-         if self.config.ingest_dashboards:
-             yield from self.emit_dashboard_mces()
-         if self.config.ingest_charts:
-             yield from self.emit_chart_mces()
+         # TODO: Possibly change ingestion order to minimize API calls
          if self.config.ingest_datasets:
              yield from self.emit_dataset_mces()
+         if self.config.ingest_charts:
+             yield from self.emit_chart_mces()
+         if self.config.ingest_dashboards:
+             yield from self.emit_dashboard_mces()

      def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
          return [