acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -1,9 +1,10 @@
 import logging
 import time
-from concurrent.futures import ThreadPoolExecutor,
+from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from
+from threading import Lock
+from typing import Dict, Iterable, List, Optional

 from pydantic import Field

@@ -18,12 +19,28 @@ from datahub.utilities.urns._urn_base import Urn

 logger = logging.getLogger(__name__)

+QUERY_QUERY_ENTITY = """
+query listQueries($input: ScrollAcrossEntitiesInput!) {
+  scrollAcrossEntities(input: $input) {
+    nextScrollId
+    count
+    searchResults {
+      entity {
+        ... on QueryEntity {
+          urn
+        }
+      }
+    }
+  }
+}
+"""
+

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
         default=True, description="Whether to do soft deletion cleanup."
     )
-    retention_days:
+    retention_days: int = Field(
         10,
         description="Number of days to retain metadata in DataHub",
     )
@@ -62,25 +79,34 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         default=None,
         description="Query to filter entities",
     )
+
     limit_entities_delete: Optional[int] = Field(
         25000, description="Max number of entities to delete."
     )

-
-
+    futures_max_at_time: int = Field(
+        1000, description="Max number of futures to have at a time."
+    )
+
+    runtime_limit_seconds: int = Field(
+        7200,  # 2 hours by default
         description="Runtime limit in seconds",
     )


 @dataclass
 class SoftDeletedEntitiesReport(SourceReport):
-
-
-
-
-
+    num_queries_found: int = 0
+    num_soft_deleted_entity_processed: int = 0
+    num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_entity_removal_started: int = 0
+    num_hard_deleted: int = 0
+    num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
+    sample_hard_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    runtime_limit_reached: bool = False
+    deletion_limit_reached: bool = False


 class SoftDeletedEntitiesCleanup:
@@ -103,48 +129,55 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
+        self.start_time = 0.0
+        self._report_lock: Lock = Lock()
+        self.last_print_time = 0.0
+
+    def _increment_retained_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age += 1
+
+    def _increment_removal_started_count(self) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_entity_removal_started += 1
+
+    def _update_report(self, urn: str, entity_type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_hard_deleted += 1
+
+            current_count = self.report.num_hard_deleted_by_type.get(entity_type, 0)
+            self.report.num_hard_deleted_by_type[entity_type] = current_count + 1
+            if entity_type not in self.report.sample_hard_deleted_aspects_by_type:
+                self.report.sample_hard_deleted_aspects_by_type[
+                    entity_type
+                ] = LossyList()
+            self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

     def delete_entity(self, urn: str) -> None:
         assert self.ctx.graph

         entity_urn = Urn.from_string(urn)
-        self.report.num_soft_deleted_entity_removed += 1
-        self.report.num_soft_deleted_entity_removed_by_type[entity_urn.entity_type] = (
-            self.report.num_soft_deleted_entity_removed_by_type.get(
-                entity_urn.entity_type, 0
-            )
-            + 1
-        )
-        if (
-            entity_urn.entity_type
-            not in self.report.sample_soft_deleted_removed_aspects_by_type
-        ):
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ] = LossyList()
-            self.report.sample_soft_deleted_removed_aspects_by_type[
-                entity_urn.entity_type
-            ].append(urn)
-
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
             )
             return
-
+        if self._deletion_limit_reached() or self._times_up():
+            return
+        self._increment_removal_started_count()
         self.ctx.graph.delete_entity(urn=urn, hard=True)
         self.ctx.graph.delete_references_to_urn(
             urn=urn,
             dry_run=False,
         )
+        self._update_report(urn, entity_urn.entity_type)

     def delete_soft_deleted_entity(self, urn: str) -> None:
         assert self.ctx.graph

-        if self.config.retention_days is None:
-            logger.info("Retention days is not set, skipping soft delete cleanup")
-            return
-
         retention_time = (
             int(datetime.now(timezone.utc).timestamp())
             - self.config.retention_days * 24 * 60 * 60
@@ -157,15 +190,84 @@ class SoftDeletedEntitiesCleanup:
         ]["created"]["time"] < (retention_time * 1000):
             logger.debug(f"Hard deleting {urn}")
             self.delete_entity(urn)
+        else:
+            self._increment_retained_count()
+
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
+    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+        done, not_done = wait(futures, return_when=FIRST_COMPLETED)
+        futures = {future: urn for future, urn in futures.items() if future in not_done}
+
+        for future in done:
+            self._print_report()
+            if future.exception():
+                self.report.failure(
+                    title="Failed to delete entity",
+                    message="Failed to delete entity",
+                    context=futures[future],
+                    exc=future.exception(),
+                )
+            self.report.num_soft_deleted_entity_processed += 1
+            if (
+                self.report.num_soft_deleted_entity_processed % self.config.batch_size
+                == 0
+            ):
+                if self.config.delay:
+                    logger.debug(
+                        f"Sleeping for {self.config.delay} seconds before further processing batch"
+                    )
+                    time.sleep(self.config.delay)
+        return futures

-    def
-        if not self.config.enabled:
-            return
+    def _get_soft_deleted_queries(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
+        scroll_id: Optional[str] = None
+        while True:
+            try:
+                result = self.ctx.graph.execute_graphql(
+                    QUERY_QUERY_ENTITY,
+                    {
+                        "input": {
+                            "types": ["QUERY"],
+                            "query": "*",
+                            "scrollId": scroll_id if scroll_id else None,
+                            "count": self.config.batch_size,
+                            "orFilters": [
+                                {
+                                    "and": [
+                                        {
+                                            "field": "removed",
+                                            "values": ["true"],
+                                            "condition": "EQUAL",
+                                        }
+                                    ]
+                                }
+                            ],
+                        }
+                    },
+                )
+            except Exception as e:
+                self.report.failure(
+                    f"While trying to get queries with {scroll_id}", exc=e
+                )
+                break
+            scroll_across_entities = result.get("scrollAcrossEntities")
+            if not scroll_across_entities or not scroll_across_entities.get("count"):
+                break
+            scroll_id = scroll_across_entities.get("nextScrollId")
+            self.report.num_queries_found += scroll_across_entities.get("count")
+            for query in scroll_across_entities.get("searchResults"):
+                yield query["entity"]["urn"]
+
+    def _get_urns(self) -> Iterable[str]:
+        assert self.ctx.graph
+        yield from self.ctx.graph.get_urns_by_filter(
             entity_types=self.config.entity_types,
             platform=self.config.platform,
             env=self.config.env,
@@ -173,52 +275,45 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
+        yield from self._get_soft_deleted_queries()
+
+    def _times_up(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and time.time() - self.start_time > self.config.runtime_limit_seconds
+        ):
+            with self._report_lock:
+                self.report.runtime_limit_reached = True
+            return True
+        return False
+
+    def _deletion_limit_reached(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.num_hard_deleted > self.config.limit_entities_delete
+        ):
+            with self._report_lock:
+                self.report.deletion_limit_reached = True
+            return True
+        return False
+
+    def cleanup_soft_deleted_entities(self) -> None:
+        if not self.config.enabled:
+            return
+        self.start_time = time.time()

-        futures =
+        futures: Dict[Future, str] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
-
-
-
-
-
-                    and num_urns_submitted > self.config.limit_entities_delete
-                ):
-                    logger.info(
-                        f"Limit of {self.config.limit_entities_delete} entities reached. Stopping"
-                    )
+            for urn in self._get_urns():
+                self._print_report()
+                while len(futures) >= self.config.futures_max_at_time:
+                    futures = self._process_futures(futures)
+                if self._deletion_limit_reached() or self._times_up():
                     break
-                if (
-                    self.config.runtime_limit_seconds
-                    and time.time() - start_time > self.config.runtime_limit_seconds
-                ):
-                    logger.info(
-                        f"Runtime limit of {self.config.runtime_limit_seconds} seconds reached. Stopping"
-                    )
-                    break
-
                 future = executor.submit(self.delete_soft_deleted_entity, urn)
                 futures[future] = urn

-
-
-
-
-                logger.error(
-                    f"Failed to delete entity {futures[future]}: {future.exception()}"
-                )
-                self.report.failure(
-                    f"Failed to delete entity {futures[future]}",
-                    exc=future.exception(),
-                )
-                deleted_count_retention += 1
-
-                if deleted_count_retention % self.config.batch_size == 0:
-                    logger.info(
-                        f"Processed {deleted_count_retention} soft deleted entity and deleted {self.report.num_soft_deleted_entity_removed} entities so far"
-                    )
-
-                if self.config.delay:
-                    logger.debug(
-                        f"Sleeping for {self.config.delay} seconds before getting next batch"
-                    )
-                    time.sleep(self.config.delay)
+            logger.info(f"Waiting for {len(futures)} futures to complete")
+            while len(futures) > 0:
+                self._print_report()
+                futures = self._process_futures(futures)
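The cleanup config above gains new knobs (futures_max_at_time, runtime_limit_seconds), and retention_days is now a plain int defaulting to 10, so the old "retention days not set" skip path is gone. A minimal sketch of constructing it, assuming the remaining fields (batch_size, delay, max_workers, etc.) keep their defaults:

from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
    SoftDeletedEntitiesCleanupConfig,
)

# Values mirror the defaults shown in the diff; adjust per deployment.
config = SoftDeletedEntitiesCleanupConfig(
    retention_days=10,            # hard-delete soft-deleted entities older than this
    limit_entities_delete=25000,  # overall cap on hard deletions per run
    futures_max_at_time=1000,     # cap on in-flight deletion futures (new)
    runtime_limit_seconds=7200,   # stop submitting work after 2 hours (new)
)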
datahub/ingestion/source/iceberg/iceberg.py

@@ -10,6 +10,7 @@ from pyiceberg.exceptions import (
     NoSuchNamespaceError,
     NoSuchPropertyException,
     NoSuchTableError,
+    ServerError,
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
@@ -145,6 +146,13 @@ class IcebergSource(StatefulIngestionSourceBase):
         self.report.report_no_listed_namespaces(len(namespaces))
         tables_count = 0
         for namespace in namespaces:
+            namespace_repr = ".".join(namespace)
+            if not self.config.namespace_pattern.allowed(namespace_repr):
+                LOGGER.info(
+                    f"Namespace {namespace_repr} is not allowed by config pattern, skipping"
+                )
+                self.report.report_dropped(f"{namespace_repr}.*")
+                continue
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
@@ -181,6 +189,9 @@ class IcebergSource(StatefulIngestionSourceBase):
         if not self.config.table_pattern.allowed(dataset_name):
             # Dataset name is rejected by pattern, report as dropped.
             self.report.report_dropped(dataset_name)
+            LOGGER.debug(
+                f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+            )
             return
         try:
             if not hasattr(thread_local, "local_catalog"):
@@ -219,6 +230,22 @@ class IcebergSource(StatefulIngestionSourceBase):
             LOGGER.warning(
                 f"NoSuchTableError while processing table {dataset_path}, skipping it.",
             )
+        except FileNotFoundError as e:
+            self.report.report_warning(
+                "file-not-found",
+                f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
+            )
+            LOGGER.warning(
+                f"FileNotFoundError while processing table {dataset_path}, skipping it."
+            )
+        except ServerError as e:
+            self.report.report_warning(
+                "iceberg-rest-server-error",
+                f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
+            )
+            LOGGER.warning(
+                f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+            )
         except Exception as e:
             self.report.report_failure("general", f"Failed to create workunit: {e}")
             LOGGER.exception(
@@ -269,7 +296,6 @@ class IcebergSource(StatefulIngestionSourceBase):
             ] = table.current_snapshot().manifest_list
             dataset_properties = DatasetPropertiesClass(
                 name=table.name()[-1],
-                tags=[],
                 description=table.metadata.properties.get("comment", None),
                 customProperties=custom_properties,
             )
datahub/ingestion/source/iceberg/iceberg_common.py

@@ -68,6 +68,10 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for tables to filter in ingestion.",
     )
+    namespace_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for namespaces to filter in ingestion.",
+    )
     user_ownership_property: Optional[str] = Field(
         default="owner",
         description="Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. If property has no value, no owner information will be emitted.",
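The new namespace_pattern field on IcebergSourceConfig is a standard AllowDenyPattern, and iceberg.py (above) joins each namespace tuple with "." before checking it. A small sketch of that decision, with made-up namespace names and patterns:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical patterns: allow everything under "prod", drop scratch namespaces.
namespace_pattern = AllowDenyPattern(allow=["prod\\..*"], deny=[".*_scratch"])

for namespace in [("prod", "sales"), ("dev", "sandbox"), ("prod", "ml_scratch")]:
    namespace_repr = ".".join(namespace)
    if not namespace_pattern.allowed(namespace_repr):
        print(f"dropped {namespace_repr}.*")  # the source reports this via report_dropped
    else:
        print(f"ingesting {namespace_repr}")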
datahub/ingestion/source/kafka_connect/__init__.py

(File without changes)
datahub/ingestion/source/kafka_connect/common.py

@@ -0,0 +1,202 @@
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, Iterable, List, Optional
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+    PlatformInstanceConfigMixin,
+)
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)
+
+logger = logging.getLogger(__name__)
+
+KAFKA = "kafka"
+SOURCE = "source"
+SINK = "sink"
+CONNECTOR_CLASS = "connector.class"
+
+
+class ProvidedConfig(ConfigModel):
+    provider: str
+    path_key: str
+    value: str
+
+
+class GenericConnectorConfig(ConfigModel):
+    connector_name: str
+    source_dataset: str
+    source_platform: str
+
+
+class KafkaConnectSourceConfig(
+    PlatformInstanceConfigMixin,
+    DatasetLineageProviderConfigBase,
+    StatefulIngestionConfigBase,
+):
+    # See the Connect REST Interface for details
+    # https://docs.confluent.io/platform/current/connect/references/restapi.html#
+    connect_uri: str = Field(
+        default="http://localhost:8083/", description="URI to connect to."
+    )
+    username: Optional[str] = Field(default=None, description="Kafka Connect username.")
+    password: Optional[str] = Field(default=None, description="Kafka Connect password.")
+    cluster_name: Optional[str] = Field(
+        default="connect-cluster", description="Cluster to ingest from."
+    )
+    # convert lineage dataset's urns to lowercase
+    convert_lineage_urns_to_lowercase: bool = Field(
+        default=False,
+        description="Whether to convert the urns of ingested lineage dataset to lowercase",
+    )
+    connector_patterns: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="regex patterns for connectors to filter for ingestion.",
+    )
+    provided_configs: Optional[List[ProvidedConfig]] = Field(
+        default=None, description="Provided Configurations"
+    )
+    connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field(
+        default=None,
+        description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`',
+    )
+    platform_instance_map: Optional[Dict[str, str]] = Field(
+        default=None,
+        description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`',
+    )
+    generic_connectors: List[GenericConnectorConfig] = Field(
+        default=[],
+        description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector",
+    )
+
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
+
+@dataclass
+class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
+    connectors_scanned: int = 0
+    filtered: List[str] = field(default_factory=list)
+
+    def report_connector_scanned(self, connector: str) -> None:
+        self.connectors_scanned += 1
+
+    def report_dropped(self, connector: str) -> None:
+        self.filtered.append(connector)
+
+
+@dataclass
+class KafkaConnectLineage:
+    """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob"""
+
+    source_platform: str
+    target_dataset: str
+    target_platform: str
+    job_property_bag: Optional[Dict[str, str]] = None
+    source_dataset: Optional[str] = None
+
+
+@dataclass
+class ConnectorManifest:
+    """Each instance is potential DataFlow"""
+
+    name: str
+    type: str
+    config: Dict
+    tasks: Dict
+    url: Optional[str] = None
+    flow_property_bag: Optional[Dict[str, str]] = None
+    lineages: List[KafkaConnectLineage] = field(default_factory=list)
+    topic_names: Iterable[str] = field(default_factory=list)
+
+
+def remove_prefix(text: str, prefix: str) -> str:
+    if text.startswith(prefix):
+        index = len(prefix)
+        return text[index:]
+    return text
+
+
+def unquote(
+    string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None
+) -> str:
+    """
+    If string starts and ends with a quote, unquote it
+    """
+    trailing_quote = trailing_quote if trailing_quote else leading_quote
+    if string.startswith(leading_quote) and string.endswith(trailing_quote):
+        string = string[1:-1]
+    return string
+
+
+def get_dataset_name(
+    database_name: Optional[str],
+    source_table: str,
+) -> str:
+    if database_name:
+        dataset_name = database_name + "." + source_table
+    else:
+        dataset_name = source_table
+
+    return dataset_name
+
+
+def get_platform_instance(
+    config: KafkaConnectSourceConfig, connector_name: str, platform: str
+) -> Optional[str]:
+    instance_name = None
+    if (
+        config.connect_to_platform_map
+        and config.connect_to_platform_map.get(connector_name)
+        and config.connect_to_platform_map[connector_name].get(platform)
+    ):
+        instance_name = config.connect_to_platform_map[connector_name][platform]
+        if config.platform_instance_map and config.platform_instance_map.get(platform):
+            logger.warning(
+                f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map."
+                "Will prefer connector specific platform instance from connect_to_platform_map."
+            )
+    elif config.platform_instance_map and config.platform_instance_map.get(platform):
+        instance_name = config.platform_instance_map[platform]
+    logger.info(
+        f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}"
+    )
+    return instance_name
+
+
+def transform_connector_config(
+    connector_config: Dict, provided_configs: List[ProvidedConfig]
+) -> None:
+    """This method will update provided configs in connector config values, if any"""
+    lookupsByProvider = {}
+    for pconfig in provided_configs:
+        lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value
+    for k, v in connector_config.items():
+        for key, value in lookupsByProvider.items():
+            if key in v:
+                connector_config[k] = connector_config[k].replace(key, value)
+
+
+# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy.
+def has_three_level_hierarchy(platform: str) -> bool:
+    return platform in ["postgres", "trino", "redshift", "snowflake"]
+
+
+@dataclass
+class BaseConnector:
+    connector_manifest: ConnectorManifest
+    config: KafkaConnectSourceConfig
+    report: KafkaConnectSourceReport
+
+    def extract_lineages(self) -> List[KafkaConnectLineage]:
+        return []
+
+    def extract_flow_property_bag(self) -> Optional[Dict[str, str]]:
+        return None
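The helpers in common.py are shared by the new kafka_connect, sink_connectors, and source_connectors modules. As a quick illustration of transform_connector_config (defined above), which expands ${provider:key} references in connector config values using the provided_configs entries; the config key and values below are made up:

from datahub.ingestion.source.kafka_connect.common import (
    ProvidedConfig,
    transform_connector_config,
)

# Hypothetical connector config using an externalized secret reference.
connector_config = {"connection.url": "${env:DB_URL}/finance"}

transform_connector_config(
    connector_config,
    [ProvidedConfig(provider="env", path_key="DB_URL", value="jdbc:postgresql://db:5432")],
)

print(connector_config["connection.url"])
# -> jdbc:postgresql://db:5432/finance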