acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
Selected diff hunks follow; the registry rendering truncates the content of some removed (-) lines.

datahub/ingestion/source/iceberg/iceberg_common.py

@@ -1,11 +1,15 @@
 import logging
+import threading
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional

 from humanfriendly import format_timespan
 from pydantic import Field, validator
 from pyiceberg.catalog import Catalog, load_catalog
+from pyiceberg.catalog.rest import RestCatalog
+from requests.adapters import HTTPAdapter
 from sortedcontainers import SortedList
+from urllib3.util import Retry

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -25,6 +29,23 @@ from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

 logger = logging.getLogger(__name__)

+DEFAULT_REST_TIMEOUT = 120
+DEFAULT_REST_RETRY_POLICY = {"total": 3, "backoff_factor": 0.1}
+
+
+class TimeoutHTTPAdapter(HTTPAdapter):
+    def __init__(self, *args, **kwargs):
+        if "timeout" in kwargs:
+            self.timeout = kwargs["timeout"]
+            del kwargs["timeout"]
+        super().__init__(*args, **kwargs)
+
+    def send(self, request, **kwargs):
+        timeout = kwargs.get("timeout")
+        if timeout is None and hasattr(self, "timeout"):
+            kwargs["timeout"] = self.timeout
+        return super().send(request, **kwargs)
+

 class IcebergProfilingConfig(ConfigModel):
     enabled: bool = Field(
@@ -145,7 +166,26 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         logger.debug(
             "Initializing the catalog %s with config: %s", catalog_name, catalog_config
         )
-
+        catalog = load_catalog(name=catalog_name, **catalog_config)
+        if isinstance(catalog, RestCatalog):
+            logger.debug(
+                "Recognized REST catalog type being configured, attempting to configure HTTP Adapter for the session"
+            )
+            retry_policy: Dict[str, Any] = DEFAULT_REST_RETRY_POLICY.copy()
+            retry_policy.update(catalog_config.get("connection", {}).get("retry", {}))
+            retries = Retry(**retry_policy)
+            logger.debug(f"Retry policy to be set: {retry_policy}")
+            timeout = catalog_config.get("connection", {}).get(
+                "timeout", DEFAULT_REST_TIMEOUT
+            )
+            logger.debug(f"Timeout to be set: {timeout}")
+            catalog._session.mount(
+                "http://", TimeoutHTTPAdapter(timeout=timeout, max_retries=retries)
+            )
+            catalog._session.mount(
+                "https://", TimeoutHTTPAdapter(timeout=timeout, max_retries=retries)
+            )
+        return catalog


 class TopTableTimings:
@@ -156,18 +196,21 @@ class TopTableTimings:
     def __init__(self, size: int = 10):
         self._size = size
         self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+        self._lock = threading.Lock()

     def add(self, entity: Dict[str, Any]) -> None:
         if self._VALUE_FIELD not in entity:
             return
-        self.
-
-        self.top_entites.
+        with self._lock:
+            self.top_entites.add(entity)
+            if len(self.top_entites) > self._size:
+                self.top_entites.pop()

     def __str__(self) -> str:
-
-
-
+        with self._lock:
+            if len(self.top_entites) == 0:
+                return "no timings reported"
+            return str(list(self.top_entites))


 class TimingClass:
@@ -175,24 +218,31 @@ class TimingClass:

     def __init__(self):
         self.times = SortedList()
+        self._lock = threading.Lock()

     def add_timing(self, t: float) -> None:
-        self.
+        with self._lock:
+            self.times.add(t)

     def __str__(self) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self._lock:
+            if len(self.times) == 0:
+                return "no timings reported"
+            total = sum(self.times)
+            avg = total / len(self.times)
+            return str(
+                {
+                    "average_time": format_timespan(avg, detailed=True, max_units=3),
+                    "min_time": format_timespan(
+                        self.times[0], detailed=True, max_units=3
+                    ),
+                    "max_time": format_timespan(
+                        self.times[-1], detailed=True, max_units=3
+                    ),
+                    # total_time does not provide correct information in case we run in more than 1 thread
+                    "total_time": format_timespan(total, detailed=True, max_units=3),
+                }
+            )


 @dataclass
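The REST-catalog change above amounts to mounting a timeout-aware adapter and a urllib3 retry policy onto the catalog's requests session. Below is a minimal, self-contained sketch of the same pattern against a plain requests.Session (not the packaged code; the timeout and retry values simply mirror the defaults shown above):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry


class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that falls back to a default timeout when the caller passes none."""

    def __init__(self, *args, timeout=120, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)


session = requests.Session()
retries = Retry(total=3, backoff_factor=0.1)  # mirrors DEFAULT_REST_RETRY_POLICY
adapter = TimeoutHTTPAdapter(timeout=120, max_retries=retries)
session.mount("http://", adapter)
session.mount("https://", adapter)

Every request sent through such a session then gets the default timeout unless one is passed explicitly, and transient connection failures are retried with a small backoff.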
datahub/ingestion/source/identity/okta.py

@@ -5,7 +5,7 @@ import urllib
 from collections import defaultdict
 from dataclasses import dataclass, field
 from time import sleep
-from typing import Dict, Iterable, List, Optional, Union
+from typing import Dict, Iterable, List, Optional, Set, Union

 import nest_asyncio
 from okta.client import Client as OktaClient
@@ -14,7 +14,6 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field

-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -56,7 +55,7 @@ logger = logging.getLogger(__name__)
 nest_asyncio.apply()


-class OktaConfig(StatefulIngestionSourceBase, ConfigModel):
+class OktaConfig(StatefulIngestionSourceBase):
     # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
     okta_domain: str = Field(
         description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
@@ -77,6 +76,10 @@ class OktaConfig(StatefulIngestionSourceBase, ConfigModel):
         default=True,
         description="Whether group membership should be ingested into DataHub. ingest_groups must be True if this is True.",
     )
+    ingest_groups_users: bool = Field(
+        default=True,
+        description="Only ingest users belonging to the selected groups. This option is only useful when `ingest_users` is set to False and `ingest_group_membership` to True.",
+    )

     # Optional: Customize the mapping to DataHub Username from an attribute appearing in the Okta User
     # profile. Reference: https://developer.okta.com/docs/reference/api/users/
@@ -344,6 +347,7 @@ class OktaSource(StatefulIngestionSourceBase):
                     aspect=StatusClass(removed=False),
                 ).as_workunit()

+        okta_users: Set[User] = set()
         # Step 2: Populate GroupMembership Aspects for CorpUsers
         datahub_corp_user_urn_to_group_membership: Dict[str, GroupMembershipClass] = (
             defaultdict(lambda: GroupMembershipClass(groups=[]))
@@ -372,6 +376,9 @@ class OktaSource(StatefulIngestionSourceBase):
                         self.report.report_failure("okta_user_mapping", error_str)
                         continue

+                    if self.config.ingest_groups_users:
+                        okta_users.add(okta_user)
+
                     # Update the GroupMembership aspect for this group member.
                     datahub_corp_user_urn_to_group_membership[
                         datahub_corp_user_urn
@@ -379,7 +386,10 @@ class OktaSource(StatefulIngestionSourceBase):

         # Step 3: Produce MetadataWorkUnits for CorpUsers.
         if self.config.ingest_users:
-
+            # we can just throw away collected okta users so far and fetch them all
+            okta_users = set(self._get_okta_users(event_loop))
+
+        if okta_users:
             filtered_okta_users = filter(self._filter_okta_user, okta_users)
             datahub_corp_user_snapshots = self._map_okta_users(filtered_okta_users)
             for user_count, datahub_corp_user_snapshot in enumerate(
@@ -558,9 +568,7 @@ class OktaSource(StatefulIngestionSourceBase):
         if (
             self.config.include_deprovisioned_users is False
             and okta_user.status == UserStatus.DEPROVISIONED
-        )
-            return False
-        elif (
+        ) or (
             self.config.include_suspended_users is False
             and okta_user.status == UserStatus.SUSPENDED
         ):
@@ -658,6 +666,27 @@ class OktaSource(StatefulIngestionSourceBase):
             self.config.okta_profile_to_username_regex,
         )

+    def _map_okta_user_profile_custom_properties(
+        self, profile: UserProfile
+    ) -> Dict[str, str]:
+        # filter out the common fields that are already mapped to the CorpUserInfo aspect and the private ones
+        return {
+            k: str(v)
+            for k, v in profile.__dict__.items()
+            if v
+            and k
+            not in [
+                "displayName",
+                "firstName",
+                "lastName",
+                "email",
+                "title",
+                "countryCode",
+                "department",
+            ]
+            and not k.startswith("_")
+        }
+
     # Converts Okta User Profile into a CorpUserInfo.
     def _map_okta_user_profile(self, profile: UserProfile) -> CorpUserInfoClass:
         # TODO: Extract user's manager if provided.
@@ -675,6 +704,7 @@ class OktaSource(StatefulIngestionSourceBase):
             title=profile.title,
             countryCode=profile.countryCode,
             departmentName=profile.department,
+            customProperties=self._map_okta_user_profile_custom_properties(profile),
         )

     def _make_corp_group_urn(self, name: str) -> str:
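The new _map_okta_user_profile_custom_properties helper keeps every non-empty, non-private Okta profile attribute that is not already mapped onto CorpUserInfo. A rough standalone sketch of that filtering, using a plain object in place of okta.models.UserProfile (the attribute names below are made up):

from types import SimpleNamespace

MAPPED_FIELDS = {
    "displayName", "firstName", "lastName", "email", "title", "countryCode", "department",
}


def custom_properties(profile) -> dict:
    # keep truthy, non-private attributes that CorpUserInfo does not already cover
    return {
        k: str(v)
        for k, v in profile.__dict__.items()
        if v and k not in MAPPED_FIELDS and not k.startswith("_")
    }


profile = SimpleNamespace(
    firstName="Ada",            # already mapped -> dropped
    email="ada@example.com",    # already mapped -> dropped
    employeeNumber="E-1234",    # kept as a custom property
    costCenter=None,            # empty -> dropped
    _internal_flag=True,        # private -> dropped
)
print(custom_properties(profile))  # {'employeeNumber': 'E-1234'}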
datahub/ingestion/source/kafka/kafka.py

@@ -272,7 +272,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             return schema_registry_class.create(config, report)
         except Exception as e:
             logger.debug(e, exc_info=e)
-            raise ImportError(config.schema_registry_class)
+            raise ImportError(config.schema_registry_class) from e

     def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
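The one-line change above switches to explicit exception chaining. A tiny illustration of what `raise ... from e` adds, independent of the Kafka source (the helper and module name below are deliberately made up):

def load_plugin(name: str):
    try:
        __import__(name)
    except Exception as e:
        # chain the original error so its traceback is preserved as __cause__
        raise ImportError(name) from e


try:
    load_plugin("definitely_not_a_real_module")
except ImportError as err:
    print(type(err.__cause__).__name__)  # ModuleNotFoundError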
datahub/ingestion/source/kafka_connect/common.py

@@ -110,7 +110,7 @@ class ConnectorManifest:

     name: str
     type: str
-    config: Dict
+    config: Dict[str, str]
     tasks: Dict
     url: Optional[str] = None
     flow_property_bag: Optional[Dict[str, str]] = None
@@ -141,12 +141,7 @@ def get_dataset_name(
     database_name: Optional[str],
     source_table: str,
 ) -> str:
-    if database_name:
-        dataset_name = database_name + "." + source_table
-    else:
-        dataset_name = source_table
-
-    return dataset_name
+    return database_name + "." + source_table if database_name else source_table


 def get_platform_instance(
datahub/ingestion/source/kafka_connect/kafka_connect.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Iterable, List, Optional, Type
+from typing import Dict, Iterable, List, Optional, Type

 import jpype
 import jpype.imports
@@ -121,7 +121,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.config, self.config.provided_configs
         )
         connector_manifest.url = connector_url
-        connector_manifest.topic_names = self._get_connector_topics(
+        connector_manifest.topic_names = self._get_connector_topics(
+            connector_name=connector_name,
+            config=connector_manifest.config,
+            connector_type=connector_manifest.type,
+        )
         connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or ""

         class_type: Type[BaseConnector] = BaseConnector
@@ -203,7 +207,9 @@ class KafkaConnectSource(StatefulIngestionSourceBase):

         return response.json()

-    def _get_connector_topics(
+    def _get_connector_topics(
+        self, connector_name: str, config: Dict[str, str], connector_type: str
+    ) -> List[str]:
         try:
             response = self.session.get(
                 f"{self.config.connect_uri}/connectors/{connector_name}/topics",
@@ -215,7 +221,21 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             )
             return []

-
+        processed_topics = response.json()[connector_name]["topics"]
+
+        if connector_type == SINK:
+            try:
+                return SinkTopicFilter().filter_stale_topics(processed_topics, config)
+            except Exception as e:
+                self.report.warning(
+                    title="Error parsing sink conector topics configuration",
+                    message="Some stale lineage tasks might show up for connector",
+                    context=connector_name,
+                    exc=e,
+                )
+                return processed_topics
+        else:
+            return processed_topics

     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
@@ -359,3 +379,76 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         return builder.make_dataset_urn_with_platform_instance(
             platform, name, platform_instance, self.config.env
         )
+
+
+class SinkTopicFilter:
+    """Helper class to filter Kafka Connect topics based on configuration."""
+
+    def filter_stale_topics(
+        self,
+        processed_topics: List[str],
+        sink_config: Dict[str, str],
+    ) -> List[str]:
+        """
+        Kafka-connect's /topics API returns the set of topic names the connector has been using
+        since its creation or since the last time its set of active topics was reset. This means-
+        if a topic was ever used by a connector, it will be returned, even if it is no longer used.
+        To remove these stale topics from the list, we double-check the list returned by the API
+        against the sink connector's config.
+        Sink connectors configure exactly one of `topics` or `topics.regex`
+        https://kafka.apache.org/documentation/#sinkconnectorconfigs_topics
+
+        Args:
+            processed_topics: List of topics currently being processed
+            sink_config: Configuration dictionary for the sink connector
+
+        Returns:
+            List of filtered topics that match the configuration
+
+        Raises:
+            ValueError: If sink connector configuration is missing both 'topics' and 'topics.regex' fields
+
+        """
+        # Absence of topics config is a defensive NOOP,
+        # although this should never happen in real world
+        if not self.has_topic_config(sink_config):
+            logger.warning(
+                f"Found sink without topics config {sink_config.get(CONNECTOR_CLASS)}"
+            )
+            return processed_topics
+
+        # Handle explicit topic list
+        if sink_config.get("topics"):
+            return self._filter_by_topic_list(processed_topics, sink_config["topics"])
+        else:
+            # Handle regex pattern
+            return self._filter_by_topic_regex(
+                processed_topics, sink_config["topics.regex"]
+            )
+
+    def has_topic_config(self, sink_config: Dict[str, str]) -> bool:
+        """Check if sink config has either topics or topics.regex."""
+        return bool(sink_config.get("topics") or sink_config.get("topics.regex"))
+
+    def _filter_by_topic_list(
+        self, processed_topics: List[str], topics_config: str
+    ) -> List[str]:
+        """Filter topics based on explicit topic list from config."""
+        config_topics = [
+            topic.strip() for topic in topics_config.split(",") if topic.strip()
+        ]
+        return [topic for topic in processed_topics if topic in config_topics]
+
+    def _filter_by_topic_regex(
+        self, processed_topics: List[str], regex_pattern: str
+    ) -> List[str]:
+        """Filter topics based on regex pattern from config."""
+        from java.util.regex import Pattern
+
+        regex_matcher = Pattern.compile(regex_pattern)
+
+        return [
+            topic
+            for topic in processed_topics
+            if regex_matcher.matcher(topic).matches()
+        ]
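The new SinkTopicFilter trims the topic list reported by Kafka Connect's /topics endpoint down to what the sink's `topics` or `topics.regex` setting still selects. A rough standalone sketch of that logic; note the packaged code matches the regex with java.util.regex via jpype, whereas this sketch substitutes Python's re.fullmatch:

import re
from typing import Dict, List


def filter_stale_topics(reported: List[str], sink_config: Dict[str, str]) -> List[str]:
    if topics := sink_config.get("topics"):
        wanted = {t.strip() for t in topics.split(",") if t.strip()}
        return [t for t in reported if t in wanted]
    if pattern := sink_config.get("topics.regex"):
        return [t for t in reported if re.fullmatch(pattern, t)]
    return reported  # defensive no-op when neither option is configured


reported = ["orders", "orders_old", "payments"]
print(filter_stale_topics(reported, {"topics": "orders, payments"}))  # ['orders', 'payments']
print(filter_stale_topics(reported, {"topics.regex": "orders.*"}))    # ['orders', 'orders_old']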
datahub/ingestion/source/kafka_connect/sink_connectors.py

@@ -175,7 +175,7 @@ class BigQuerySinkConnector(BaseConnector):
     class BQParser:
         project: str
         target_platform: str
-        sanitizeTopics:
+        sanitizeTopics: bool
         transforms: list
         topicsToTables: Optional[str] = None
         datasets: Optional[str] = None
@@ -187,7 +187,7 @@ class BigQuerySinkConnector(BaseConnector):
         connector_manifest: ConnectorManifest,
     ) -> BQParser:
         project = connector_manifest.config["project"]
-        sanitizeTopics = connector_manifest.config.get("sanitizeTopics"
+        sanitizeTopics = connector_manifest.config.get("sanitizeTopics") or "false"
         transform_names = (
             self.connector_manifest.config.get("transforms", "").split(",")
             if self.connector_manifest.config.get("transforms")
datahub/ingestion/source/kafka_connect/source_connectors.py

@@ -107,9 +107,9 @@ class ConfluentJDBCSourceConnector(BaseConnector):
             assert database_name
             db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"

-        topic_prefix = self.connector_manifest.config.get("topic.prefix"
+        topic_prefix = self.connector_manifest.config.get("topic.prefix") or ""

-        query = self.connector_manifest.config.get("query"
+        query = self.connector_manifest.config.get("query") or ""

         transform_names = (
             self.connector_manifest.config.get("transforms", "").split(",")
@@ -447,13 +447,10 @@ class DebeziumSourceConnector(BaseConnector):
     ) -> DebeziumParser:
         connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")

-        if
-
-
-
-            database_name=None,
-        )
-        elif connector_class == "MySqlConnector":
+        if (
+            connector_class == "io.debezium.connector.mysql.MySqlConnector"
+            or connector_class == "MySqlConnector"
+        ):
             parser = self.DebeziumParser(
                 source_platform="mysql",
                 server_name=self.get_server_name(connector_manifest),
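Several of the changed lines above swap `config.get(key, default)` for `config.get(key) or default`. The difference matters when a key is present but maps to None or another falsy value; a quick illustration with a made-up connector config:

config = {"topic.prefix": None}

print(repr(config.get("topic.prefix", "")))    # None -- the default only applies when the key is missing
print(repr(config.get("topic.prefix") or ""))  # ''   -- also normalizes None and other falsy values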
datahub/ingestion/source/looker/looker_common.py

@@ -923,7 +923,7 @@ class LookerExplore:
             tags=cast(List, dict.get("tags")) if dict.get("tags") is not None else [],
         )

-    @classmethod
+    @classmethod
     def from_api(  # noqa: C901
         cls,
         model: str,
@@ -931,7 +931,7 @@ class LookerExplore:
         client: LookerAPI,
         reporter: SourceReport,
         source_config: LookerDashboardSourceConfig,
-    ) -> Optional["LookerExplore"]:
+    ) -> Optional["LookerExplore"]:
         try:
             explore = client.lookml_model_explore(model, explore_name)
             views: Set[str] = set()
@@ -1183,7 +1183,7 @@ class LookerExplore:
             base_url = remove_port_from_url(base_url)
         return f"{base_url}/embed/explore/{self.model_name}/{self.name}"

-    def _to_metadata_events(
+    def _to_metadata_events(
         self,
         config: LookerCommonConfig,
         reporter: SourceReport,
@@ -1673,10 +1673,11 @@ class LookerUserRegistry:
             primary_key="",
         )

-        # Extract user email mappings
+        # Extract user email mappings.
+        # Sort it to ensure the order is deterministic.
         user_email_cache = {
             user_id: user.email
-            for user_id, user in self._user_cache.items()
+            for user_id, user in sorted(self._user_cache.items())
             if user.email
         }

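The LookerUserRegistry hunk sorts the cache items before building the email mapping; plain dicts iterate in insertion order, so without the sort the emitted mapping could vary between runs that fetched users in a different order. A small illustration with made-up user ids:

run_a = {"user_2": "b@example.com", "user_1": "a@example.com"}
run_b = {"user_1": "a@example.com", "user_2": "b@example.com"}

# same contents, but insertion order differs between the two runs
print(list(run_a.items()) == list(run_b.items()))      # False
print(sorted(run_a.items()) == sorted(run_b.items()))  # True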
datahub/ingestion/source/looker/looker_file_loader.py

@@ -33,14 +33,14 @@ class LookerViewFileLoader:
         base_projects_folder: Dict[str, pathlib.Path],
         reporter: LookMLSourceReport,
         source_config: LookMLSourceConfig,
-        manifest_constants: Dict[str, LookerConstant] =
+        manifest_constants: Optional[Dict[str, LookerConstant]] = None,
     ) -> None:
         self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
         self._root_project_name = root_project_name
         self._base_projects_folder = base_projects_folder
         self.reporter = reporter
         self.source_config = source_config
-        self.manifest_constants = manifest_constants
+        self.manifest_constants = manifest_constants or {}

     def _load_viewfile(
         self, project_name: str, path: str, reporter: LookMLSourceReport
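This hunk, and the looker_lib_wrapper, looker_template_language, and lookml_source hunks below, all replace a mutable default argument with `Optional[...] = None` plus an `or {}` (or equivalent) fallback inside the function. A short sketch of the pitfall that pattern avoids:

from typing import Dict, Optional


def risky(constants: Dict[str, str] = {}) -> Dict[str, str]:
    constants["seen"] = "yes"  # mutates the single dict shared by every call
    return constants


def safe(constants: Optional[Dict[str, str]] = None) -> Dict[str, str]:
    constants = constants or {}  # fresh dict each time nothing is passed in
    constants["seen"] = "yes"
    return constants


print(risky() is risky())  # True  -- both calls returned the same shared default
print(safe() is safe())    # False -- each call got its own dict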
datahub/ingestion/source/looker/looker_lib_wrapper.py

@@ -205,8 +205,9 @@ class LookerAPI:
     def folder_ancestors(
         self,
         folder_id: str,
-        fields: Union[str, List[str]] =
+        fields: Optional[Union[str, List[str]]] = None,
     ) -> Sequence[Folder]:
+        fields = fields or ["id", "name", "parent_id"]
         self.client_stats.folder_calls += 1
         try:
             return self.client.folder_ancestors(
datahub/ingestion/source/looker/looker_source.py

@@ -383,7 +383,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

         self.reachable_explores[(model, explore)].append(via)

-    def _get_looker_dashboard_element(
+    def _get_looker_dashboard_element(
         self, element: DashboardElement
     ) -> Optional[LookerDashboardElement]:
         # Dashboard elements can use raw usage_queries against explores
datahub/ingestion/source/looker/looker_template_language.py

@@ -464,9 +464,10 @@ def process_lookml_template_language(
     source_config: LookMLSourceConfig,
     view_lkml_file_dict: dict,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> None:
+    manifest_constants = manifest_constants or {}
     if "views" not in view_lkml_file_dict:
         return

@@ -507,9 +508,10 @@ def load_and_preprocess_file(
     path: Union[str, pathlib.Path],
     source_config: LookMLSourceConfig,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> dict:
+    manifest_constants = manifest_constants or {}
     parsed = load_lkml(path)

     process_lookml_template_language(
datahub/ingestion/source/looker/lookml_source.py

@@ -501,7 +501,7 @@ class LookMLSource(StatefulIngestionSourceBase):
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
-            )
+            ) from None

     def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
         manifest_file = folder / "manifest.lkml"
@@ -1006,8 +1006,9 @@ class LookMLSource(StatefulIngestionSourceBase):
     def report_skipped_unreachable_views(
         self,
         viewfile_loader: LookerViewFileLoader,
-        processed_view_map: Dict[str, Set[str]] =
+        processed_view_map: Optional[Dict[str, Set[str]]] = None,
     ) -> None:
+        processed_view_map = processed_view_map or {}
         view_files: Dict[str, List[pathlib.Path]] = {}
         for project, folder_path in self.base_projects_folder.items():
             folder = pathlib.Path(folder_path)