acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.



Files changed (156)
  1. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
  2. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
  3. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/cli/check_cli.py +65 -11
  7. datahub/cli/cli_utils.py +63 -0
  8. datahub/cli/container_cli.py +5 -0
  9. datahub/cli/delete_cli.py +3 -4
  10. datahub/cli/docker_check.py +107 -12
  11. datahub/cli/docker_cli.py +149 -227
  12. datahub/cli/exists_cli.py +0 -2
  13. datahub/cli/get_cli.py +0 -2
  14. datahub/cli/iceberg_cli.py +5 -0
  15. datahub/cli/ingest_cli.py +3 -15
  16. datahub/cli/migrate.py +2 -0
  17. datahub/cli/put_cli.py +1 -4
  18. datahub/cli/quickstart_versioning.py +50 -7
  19. datahub/cli/specific/assertions_cli.py +0 -4
  20. datahub/cli/specific/datacontract_cli.py +0 -3
  21. datahub/cli/specific/dataproduct_cli.py +0 -11
  22. datahub/cli/specific/dataset_cli.py +1 -8
  23. datahub/cli/specific/forms_cli.py +0 -4
  24. datahub/cli/specific/group_cli.py +0 -2
  25. datahub/cli/specific/structuredproperties_cli.py +1 -4
  26. datahub/cli/specific/user_cli.py +0 -2
  27. datahub/cli/state_cli.py +0 -2
  28. datahub/cli/timeline_cli.py +0 -2
  29. datahub/emitter/rest_emitter.py +41 -8
  30. datahub/entrypoints.py +4 -3
  31. datahub/ingestion/api/decorators.py +15 -3
  32. datahub/ingestion/api/report.py +332 -3
  33. datahub/ingestion/api/sink.py +3 -0
  34. datahub/ingestion/api/source.py +47 -45
  35. datahub/ingestion/autogenerated/__init__.py +0 -0
  36. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  37. datahub/ingestion/autogenerated/lineage.json +401 -0
  38. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  39. datahub/ingestion/extractor/schema_util.py +13 -4
  40. datahub/ingestion/graph/client.py +73 -30
  41. datahub/ingestion/run/pipeline.py +54 -2
  42. datahub/ingestion/sink/datahub_rest.py +12 -0
  43. datahub/ingestion/source/abs/source.py +1 -1
  44. datahub/ingestion/source/aws/glue.py +1 -1
  45. datahub/ingestion/source/azure/azure_common.py +2 -2
  46. datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
  47. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  48. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  49. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  50. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  51. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  52. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  53. datahub/ingestion/source/common/subtypes.py +45 -0
  54. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  55. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  56. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  57. datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
  58. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  59. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  60. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  61. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  62. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  63. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  64. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  65. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  66. datahub/ingestion/source/ge_data_profiler.py +76 -28
  67. datahub/ingestion/source/hex/api.py +26 -1
  68. datahub/ingestion/source/identity/azure_ad.py +1 -1
  69. datahub/ingestion/source/identity/okta.py +1 -14
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  72. datahub/ingestion/source/mlflow.py +11 -1
  73. datahub/ingestion/source/mock_data/__init__.py +0 -0
  74. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  75. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  76. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  77. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  78. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  79. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  80. datahub/ingestion/source/preset.py +2 -2
  81. datahub/ingestion/source/redshift/redshift.py +17 -0
  82. datahub/ingestion/source/redshift/usage.py +4 -3
  83. datahub/ingestion/source/s3/report.py +4 -2
  84. datahub/ingestion/source/s3/source.py +367 -115
  85. datahub/ingestion/source/salesforce.py +6 -3
  86. datahub/ingestion/source/sigma/sigma.py +6 -1
  87. datahub/ingestion/source/slack/slack.py +2 -1
  88. datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
  89. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  90. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  91. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
  92. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  93. datahub/ingestion/source/sql/athena.py +119 -12
  94. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  95. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  96. datahub/ingestion/source/sql/mssql/source.py +24 -15
  97. datahub/ingestion/source/sql/oracle.py +1 -1
  98. datahub/ingestion/source/sql/sql_common.py +11 -0
  99. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  100. datahub/ingestion/source/sql/teradata.py +997 -235
  101. datahub/ingestion/source/sql/vertica.py +10 -6
  102. datahub/ingestion/source/sql_queries.py +2 -2
  103. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  104. datahub/ingestion/source/superset.py +57 -2
  105. datahub/ingestion/source/tableau/tableau.py +57 -37
  106. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  107. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  108. datahub/ingestion/source/unity/proxy.py +4 -3
  109. datahub/ingestion/source/unity/source.py +56 -30
  110. datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
  111. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  112. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  113. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  114. datahub/metadata/_internal_schema_classes.py +1253 -536
  115. datahub/metadata/_urns/urn_defs.py +1797 -1685
  116. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  117. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  118. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  119. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  120. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  121. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  122. datahub/metadata/schema.avsc +16614 -16538
  123. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  124. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  125. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  126. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  127. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  128. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  129. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  130. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  131. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  132. datahub/metadata/schemas/DatasetKey.avsc +4 -1
  133. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  134. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  135. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  136. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  137. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  138. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  139. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  140. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  141. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  142. datahub/sdk/datajob.py +39 -15
  143. datahub/sdk/lineage_client.py +2 -0
  144. datahub/sdk/main_client.py +14 -2
  145. datahub/sdk/search_client.py +4 -3
  146. datahub/specific/dataproduct.py +4 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  148. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  149. datahub/telemetry/telemetry.py +17 -11
  150. datahub/upgrade/upgrade.py +46 -13
  151. datahub/utilities/server_config_util.py +8 -0
  152. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  153. datahub/utilities/stats_collections.py +4 -0
  154. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
  155. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
  156. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0

datahub/ingestion/graph/client.py

@@ -22,6 +22,7 @@ from typing import (
     Union,
 )
 
+import progressbar
 from avro.schema import RecordSchema
 from pydantic import BaseModel
 from requests.models import HTTPError
@@ -504,7 +505,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "limit": limit,
             "filter": filter,
         }
-        end_point = f"{self.config.server}/aspects?action=getTimeseriesAspectValues"
+        end_point = f"{self._gms_server}/aspects?action=getTimeseriesAspectValues"
         resp: Dict = self._post_generic(end_point, query_body)
 
         values: Optional[List] = resp.get("value", {}).get("values")
@@ -524,7 +525,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_entity_raw(
         self, entity_urn: str, aspects: Optional[List[str]] = None
     ) -> Dict:
-        endpoint: str = f"{self.config.server}/entitiesV2/{Urn.url_encode(entity_urn)}"
+        endpoint: str = f"{self._gms_server}/entitiesV2/{Urn.url_encode(entity_urn)}"
         if aspects is not None:
             assert aspects, "if provided, aspects must be a non-empty list"
             endpoint = f"{endpoint}?aspects=List(" + ",".join(aspects) + ")"
@@ -654,15 +655,15 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
     @property
     def _search_endpoint(self):
-        return f"{self.config.server}/entities?action=search"
+        return f"{self._gms_server}/entities?action=search"
 
     @property
     def _relationships_endpoint(self):
-        return f"{self.config.server}/openapi/relationships/v1/"
+        return f"{self._gms_server}/openapi/relationships/v1/"
 
     @property
     def _aspect_count_endpoint(self):
-        return f"{self.config.server}/aspects?action=getCount"
+        return f"{self._gms_server}/aspects?action=getCount"
 
     def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
         """Retrieve a domain urn based on its name. Returns None if there is no match found"""
@@ -1209,7 +1210,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         operation_name: Optional[str] = None,
         format_exception: bool = True,
     ) -> Dict:
-        url = f"{self.config.server}/api/graphql"
+        url = f"{self._gms_server}/api/graphql"
 
         body: Dict = {
             "query": query,
@@ -1434,40 +1435,82 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         related_aspects = response.get("relatedAspects", [])
         return reference_count, related_aspects
 
+    def get_kafka_consumer_offsets(
+        self,
+    ) -> dict:
+        """
+        Get Kafka consumer offsets from the DataHub API.
+
+        Args:
+            graph (DataHubGraph): The DataHub graph client
+
+        """
+        urls = {
+            "mcp": f"{self.config.server}/openapi/operations/kafka/mcp/consumer/offsets",
+            "mcl": f"{self.config.server}/openapi/operations/kafka/mcl/consumer/offsets",
+            "mcl-timeseries": f"{self.config.server}/openapi/operations/kafka/mcl-timeseries/consumer/offsets",
+        }
+
+        params = {"skipCache": "true", "detailed": "true"}
+        results = {}
+        for key, url in urls.items():
+            response = self._get_generic(url=url, params=params)
+            results[key] = response
+            if "errors" in response:
+                logger.error(f"Error: {response['errors']}")
+        return results
+
+    def _restore_index_call(self, payload_obj: dict) -> None:
+        result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        logger.debug(f"Restore indices result: {result}")
+
     def restore_indices(
         self,
-        urn_pattern: str,
+        urn_pattern: Optional[str] = None,
         aspect: Optional[str] = None,
         start: Optional[int] = None,
         batch_size: Optional[int] = None,
-    ) -> str:
+        file: Optional[str] = None,
+    ) -> None:
         """Restore the indices for a given urn or urn-like pattern.
 
         Args:
-            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs. If not provided, will restore indices from the file.
             aspect: Optional aspect string to restore indices for a specific aspect.
-            start: Optional integer to decide which row number of sql store to restore from. Default: 0.
-            batch_size: Optional integer to decide how many rows to restore. Default: 10.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0. Ignored in case file is provided.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10. Ignored in case file is provided.
+            file: Optional file path to a file containing URNs to restore indices for.
 
         Returns:
             A string containing the result of the restore indices operation. This format is subject to change.
         """
-        if "%" in urn_pattern:
-            payload_obj: dict = {"urnLike": urn_pattern}
+        payload_obj = {}
+        if file is not None:
+            with open(file) as f:
+                for urn in progressbar.progressbar(f.readlines()):
+                    urn = urn.strip()
+                    if "%" in urn:
+                        payload_obj["urnLike"] = urn
+                    else:
+                        payload_obj["urn"] = urn
+                    if aspect is not None:
+                        payload_obj["aspect"] = aspect
+                    self._restore_index_call(payload_obj)
         else:
-            payload_obj = {"urn": urn_pattern}
-        if aspect is not None:
-            payload_obj["aspect"] = aspect
-        if start is not None:
-            payload_obj["start"] = start
-        if batch_size is not None:
-            payload_obj["batchSize"] = batch_size
-        raw_result = self._post_generic(
-            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
-        )
-        result = raw_result["value"]
-        logger.debug(f"Restore indices result: {result}")
-        return result
+            if urn_pattern is not None:
+                if "%" in urn_pattern:
+                    payload_obj["urnLike"] = urn_pattern
+                else:
+                    payload_obj["urn"] = urn_pattern
+                if aspect is not None:
+                    payload_obj["aspect"] = aspect
+                if start is not None:
+                    payload_obj["start"] = start
+                if batch_size is not None:
+                    payload_obj["batchSize"] = batch_size
+                self._restore_index_call(payload_obj)
 
     @functools.lru_cache
     def _make_schema_resolver(
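
For reference, a minimal sketch of how the reworked helpers above might be called. The server URL, file name, and URN pattern are placeholders, and the client construction is illustrative rather than taken from this release:

# Illustrative usage of the DataHubGraph helpers changed above.
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Restore indices for URNs matching a SQL-style wildcard pattern.
graph.restore_indices(urn_pattern="urn:li:dataset:%", aspect="status", batch_size=100)

# Or drive the restore from a file with one URN (or pattern) per line;
# start/batch_size are ignored in this mode and progress is reported via progressbar.
graph.restore_indices(file="urns.txt")

# Inspect Kafka consumer offsets for the mcp / mcl / mcl-timeseries consumers.
offsets = graph.get_kafka_consumer_offsets()
print(offsets["mcp"])

Note that restore_indices now returns None and issues one restore call per URN when a file is supplied.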
@@ -1533,7 +1576,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         env: str = DEFAULT_ENV,
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
-        default_dialect: Optional[str] = None,
+        override_dialect: Optional[str] = None,
     ) -> "SqlParsingResult":
         from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
 
@@ -1547,7 +1590,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
         )
 
     def create_tag(self, tag_name: str) -> str:
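
Callers that passed default_dialect need to switch to override_dialect. A hedged sketch of an adapted call site follows, reusing the graph client from the earlier sketch; the enclosing method name is not shown in these hunks, so parse_sql_lineage and the SQL/argument values are assumptions for illustration only:

# Assumed call site: only the keyword argument is renamed; forcing a specific
# SQL dialect behaves as before.
result = graph.parse_sql_lineage(
    "SELECT id, amount FROM analytics.orders",
    platform="snowflake",
    default_db="ANALYTICS",
    override_dialect="snowflake",  # previously: default_dialect="snowflake"
)
print(result.in_tables, result.out_tables)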
@@ -1774,7 +1817,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Accept": "application/json",
             "Content-Type": "application/json",
         }
-        url = f"{self.config.server}/openapi/v2/entity/batch/{entity_name}"
+        url = f"{self._gms_server}/openapi/v2/entity/batch/{entity_name}"
         response = self._session.post(url, data=json.dumps(payload), headers=headers)
         response.raise_for_status()
 
@@ -1831,7 +1874,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Content-Type": "application/json",
         }
 
-        url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
+        url = f"{self._gms_server}/openapi/v3/entity/{entity_name}/batchGet"
         if with_system_metadata:
             url += "?systemMetadata=true"
 

datahub/ingestion/run/pipeline.py

@@ -44,6 +44,10 @@ from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
+from datahub.upgrade.upgrade import (
+    is_server_default_cli_ahead,
+    retrieve_version_stats,
+)
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
@@ -171,7 +175,10 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()
 
-        with contextlib.ExitStack() as exit_stack, contextlib.ExitStack() as inner_exit_stack:
+        with (
+            contextlib.ExitStack() as exit_stack,
+            contextlib.ExitStack() as inner_exit_stack,
+        ):
             self.graph: Optional[DataHubGraph] = None
             with _add_init_error_context("connect to DataHub"):
                 if self.config.datahub_api:
@@ -340,6 +347,44 @@ class Pipeline:
             except Exception as e:
                 logger.warning("Reporting failed on start", exc_info=e)
 
+    def _warn_old_cli_version(self) -> None:
+        """
+        Check if the server default CLI version is ahead of the CLI version being used.
+        If so, add a warning to the report.
+        """
+
+        try:
+            version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
+        except RuntimeError as e:
+            # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
+            if "no current event loop" in str(e):
+                logger.debug("Skipping version check - no event loop available")
+                return
+            raise
+
+        if not version_stats or not self.graph:
+            return
+
+        if is_server_default_cli_ahead(version_stats):
+            server_default_version = (
+                version_stats.server.current_server_default_cli_version.version
+                if version_stats.server.current_server_default_cli_version
+                else None
+            )
+            current_version = version_stats.client.current.version
+
+            logger.debug(f"""
+                client_version: {current_version}
+                server_default_version: {server_default_version}
+                server_default_cli_ahead: True
+            """)
+
+            self.source.get_report().warning(
+                title="Server default CLI version is ahead of CLI version",
+                message="Please upgrade the CLI version being used",
+                context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
+            )
+
     def _notify_reporters_on_ingestion_completion(self) -> None:
         for reporter in self.reporters:
             try:
@@ -396,6 +441,7 @@ class Pipeline:
             return False
 
     def run(self) -> None:
+        self._warn_old_cli_version()
         with self.exit_stack, self.inner_exit_stack:
             if self.config.flags.generate_memory_profiles:
                 import memray
@@ -502,7 +548,7 @@ class Pipeline:
             self._handle_uncaught_pipeline_exception(exc)
         finally:
             clear_global_warnings()
-
+            self.sink.flush()
         self._notify_reporters_on_ingestion_completion()
 
     def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
@@ -578,11 +624,17 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )
 
         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []

datahub/ingestion/sink/datahub_rest.py

@@ -5,6 +5,7 @@ import functools
 import logging
 import os
 import threading
+import time
 import uuid
 from enum import auto
 from typing import List, Optional, Tuple, Union
@@ -346,6 +347,17 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             RecordEnvelope(item, metadata={}), NoopWriteCallback()
         )
 
+    def flush(self) -> None:
+        """Wait for all pending records to be written."""
+        i = 0
+        while self.report.pending_requests > 0:
+            time.sleep(0.1)
+            i += 1
+            if i % 1000 == 0:
+                logger.info(
+                    f"Waiting for {self.report.pending_requests} records to be written"
+                )
+
     def close(self):
         with self.report.main_thread_blocking_timer:
             self.executor.shutdown()
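
Because the REST sink hands records to a background executor, the new flush() lets callers block until pending writes are acknowledged, which is what the pipeline now does before notifying reporters. A self-contained sketch of that pattern (server URL and URN are placeholders; in a real pipeline the sink is built from the recipe rather than constructed by hand):

# Hedged sketch: emit one aspect through the REST sink, flush, then close.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
from datahub.ingestion.api.sink import NoopWriteCallback
from datahub.ingestion.sink.datahub_rest import DatahubRestSink, DatahubRestSinkConfig
from datahub.metadata.schema_classes import StatusClass

sink = DatahubRestSink(
    ctx=PipelineContext(run_id="flush-demo"),
    config=DatahubRestSinkConfig(server="http://localhost:8080"),
)
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
    aspect=StatusClass(removed=False),
)
sink.write_record_async(RecordEnvelope(mcp, metadata={}), NoopWriteCallback())
sink.flush()  # poll until report.pending_requests drops to 0
sink.close()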

datahub/ingestion/source/abs/source.py

@@ -533,7 +533,7 @@ class ABSSource(StatefulIngestionSourceBase):
         )
         path_spec.sample_files = False
         for obj in container_client.list_blobs(
-            prefix=f"{prefix}", results_per_page=PAGE_SIZE
+            name_starts_with=f"{prefix}", results_per_page=PAGE_SIZE
         ):
             abs_path = self.create_abs_path(obj.name)
             logger.debug(f"Path: {abs_path}")

datahub/ingestion/source/aws/glue.py

@@ -269,7 +269,7 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default when stateful ingestion is turned on.",
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(

datahub/ingestion/source/azure/azure_common.py

@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
     def get_blob_service_client(self):
         return BlobServiceClient(
             account_url=f"https://{self.account_name}.blob.core.windows.net",
-            credential=f"{self.get_credentials()}",
+            credential=self.get_credentials(),
         )
 
     def get_data_lake_service_client(self) -> DataLakeServiceClient:
         return DataLakeServiceClient(
             account_url=f"https://{self.account_name}.dfs.core.windows.net",
-            credential=f"{self.get_credentials()}",
+            credential=self.get_credentials(),
         )
 
     def get_credentials(

datahub/ingestion/source/bigquery_v2/bigquery.py

@@ -4,6 +4,7 @@ import logging
 import os
 from typing import Iterable, List, Optional
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -44,6 +45,7 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
     BigQueryQueriesExtractorConfig,
 )
 from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
@@ -77,7 +79,14 @@ def cleanup(config: BigQueryV2Config) -> None:
     supported=False,
 )
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.BIGQUERY_PROJECT,
+        SourceCapabilityModifier.BIGQUERY_DATASET,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -242,7 +251,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ).workunit_processor,
         ]
 
+    def _warn_deprecated_configs(self):
+        if (
+            self.config.match_fully_qualified_names is not None
+            and not self.config.match_fully_qualified_names
+            and self.config.schema_pattern is not None
+            and self.config.schema_pattern != AllowDenyPattern.allow_all()
+        ):
+            self.report.report_warning(
+                message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
+                "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
+                "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
+                context="Config option deprecation warning",
+                title="Config option deprecation warning",
+            )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self._warn_deprecated_configs()
         projects = get_projects(
             self.bq_schema_extractor.schema_api,
             self.report,
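
For recipes that trip this new warning, the fix is to opt into fully qualified matching. A hedged config sketch follows; only the two options named in the warning are taken from the diff, and the project and dataset names are placeholders:

# Assumed BigQuery source recipe fragment, expressed as a Python dict.
bigquery_source_config = {
    "project_ids": ["my-project"],  # placeholder
    "match_fully_qualified_names": True,
    # With fully qualified matching, patterns apply to "<project>.<dataset>".
    "schema_pattern": {"allow": ["^my-project\\.analytics_.*$"]},
}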
@@ -271,28 +296,29 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-            with self.report.new_stage(
-                f"*: {QUERIES_EXTRACTION}"
-            ), BigQueryQueriesExtractor(
-                connection=self.config.get_bigquery_client(),
-                schema_api=self.bq_schema_extractor.schema_api,
-                config=BigQueryQueriesExtractorConfig(
-                    window=self.config,
-                    user_email_pattern=self.config.usage.user_email_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_statistics,
-                    include_operations=self.config.usage.include_operational_stats,
-                    include_queries=self.config.include_queries,
-                    include_query_usage_statistics=self.config.include_query_usage_statistics,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    region_qualifiers=self.config.region_qualifiers,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=self.sql_parser_schema_resolver,
-                discovered_tables=self.bq_schema_extractor.table_refs,
-            ) as queries_extractor:
+            with (
+                self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
+                BigQueryQueriesExtractor(
+                    connection=self.config.get_bigquery_client(),
+                    schema_api=self.bq_schema_extractor.schema_api,
+                    config=BigQueryQueriesExtractorConfig(
+                        window=self.config,
+                        user_email_pattern=self.config.usage.user_email_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_statistics,
+                        include_operations=self.config.usage.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        region_qualifiers=self.config.region_qualifiers,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=self.sql_parser_schema_resolver,
+                    discovered_tables=self.bq_schema_extractor.table_refs,
+                ) as queries_extractor,
+            ):
                 self.report.queries_extractor = queries_extractor.report
                 yield from queries_extractor.get_workunits_internal()
             else:

datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -342,7 +342,7 @@ class BigQueryV2Config(
     )
 
     use_queries_v2: bool = Field(
-        default=False,
+        default=True,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
     include_queries: bool = Field(
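
Because the default flips from False to True here, pipelines that want the previous extraction path have to opt out explicitly; a minimal sketch (project id is a placeholder):

# Restore the pre-1.1.0.5 behavior by disabling the v2 queries extractor.
bigquery_source_config = {
    "project_ids": ["my-project"],  # placeholder
    "use_queries_v2": False,
}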

datahub/ingestion/source/bigquery_v2/bigquery_queries.py

@@ -94,3 +94,4 @@ class BigQueryQueriesSource(Source):
     def close(self) -> None:
         self.queries_extractor.close()
         self.connection.close()
+        super().close()

datahub/ingestion/source/bigquery_v2/profiler.py

@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition is None and bq_table.partition_info:
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as partitioned table is empty or partition id or type was invalid",
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None

datahub/ingestion/source/bigquery_v2/queries.py

@@ -45,12 +45,12 @@ SELECT
   tos.OPTION_VALUE as comment,
   t.is_insertable_into,
   t.ddl,
-  ts.row_count,
+  ts.row_count as row_count,
   ts.size_bytes as bytes,
   p.num_partitions,
   p.max_partition_id,
-  p.active_billable_bytes,
-  p.long_term_billable_bytes,
+  p.active_billable_bytes as active_billable_bytes,
+  IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
   REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
   REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
 

datahub/ingestion/source/cassandra/cassandra.py

@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):

datahub/ingestion/source/cassandra/cassandra_profiling.py

@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(
-                f"{keyspace_name}: {PROFILING}"
-            ), ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
+            with (
+                self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,

datahub/ingestion/source/common/subtypes.py

@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum
 
+logger = logging.getLogger(__name__)
+
 
 class DatasetSubTypes(StrEnum):
     # Generic SubTypes
@@ -26,6 +31,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
@@ -52,6 +59,8 @@ class BIContainerSubTypes(StrEnum):
     LOOKER_FOLDER = "Folder"
     LOOKML_PROJECT = "LookML Project"
     LOOKML_MODEL = "LookML Model"
+    TABLEAU_SITE = "Site"
+    TABLEAU_PROJECT = "Project"
     TABLEAU_WORKBOOK = "Workbook"
     POWERBI_DATASET = "Semantic Model"
     POWERBI_DATASET_TABLE = "Table"
@@ -74,6 +83,9 @@ class JobContainerSubTypes(StrEnum):
 
 
 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"
 
@@ -116,3 +128,36 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_PIPELINE = "Pipeline Job"
     VERTEX_PIPELINE_TASK = "Pipeline Task"
     VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.debug(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f'    {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()
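
The generated SourceCapabilityModifier enum simply mirrors the members of the subtype enums above, so it can be referenced by name in @capability(..., subtype_modifier=[...]) decorators such as the BigQuery one earlier in this diff. A small sketch; the BIGQUERY_DATASET value is an assumption based on DatasetContainerSubTypes (not shown in this diff), while TABLEAU_PROJECT comes straight from the hunk above:

# Members are merged from DatasetSubTypes, the *ContainerSubTypes enums,
# BIAssetSubTypes, and MLAssetSubTypes, first definition wins on name clashes.
from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier

print(SourceCapabilityModifier.BIGQUERY_DATASET.value)  # assumed: "Dataset"
print(SourceCapabilityModifier.TABLEAU_PROJECT.value)   # "Project" (added in this release)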