acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
@@ -95,22 +95,22 @@ class SigmaAPI:
         return get_response
 
     def get_workspace(self, workspace_id: str) -> Optional[Workspace]:
+        if workspace_id in self.workspaces:
+            return self.workspaces[workspace_id]
+
         logger.debug(f"Fetching workspace metadata with id '{workspace_id}'")
         try:
-            if workspace_id in self.workspaces:
-                return self.workspaces[workspace_id]
-            else:
-                response = self._get_api_call(
-                    f"{self.config.api_url}/workspaces/{workspace_id}"
-                )
-                if response.status_code == 403:
-                    logger.debug(f"Workspace {workspace_id} not accessible.")
-                    self.report.non_accessible_workspaces_count += 1
-                    return None
-                response.raise_for_status()
-                workspace = Workspace.parse_obj(response.json())
-                self.workspaces[workspace.workspaceId] = workspace
-                return workspace
+            response = self._get_api_call(
+                f"{self.config.api_url}/workspaces/{workspace_id}"
+            )
+            if response.status_code == 403:
+                logger.debug(f"Workspace {workspace_id} not accessible.")
+                self.report.non_accessible_workspaces_count += 1
+                return None
+            response.raise_for_status()
+            workspace = Workspace.parse_obj(response.json())
+            self.workspaces[workspace.workspaceId] = workspace
+            return workspace
         except Exception as e:
             self._log_http_error(
                 message=f"Unable to fetch workspace '{workspace_id}'. Exception: {e}"
@@ -187,7 +187,9 @@ class SigmaAPI:
     @functools.lru_cache
     def _get_files_metadata(self, file_type: str) -> Dict[str, File]:
         logger.debug(f"Fetching file metadata with type {file_type}.")
-        file_url = url = f"{self.config.api_url}/files?typeFilters={file_type}"
+        file_url = url = (
+            f"{self.config.api_url}/files?permissionFilter=view&typeFilters={file_type}"
+        )
         try:
             files_metadata: Dict[str, File] = {}
             while True:
@@ -225,31 +227,50 @@ class SigmaAPI:
                 for dataset_dict in response_dict[Constant.ENTRIES]:
                     dataset = SigmaDataset.parse_obj(dataset_dict)
 
-                    if dataset.datasetId in dataset_files_metadata:
-                        dataset.path = dataset_files_metadata[dataset.datasetId].path
-                        dataset.badge = dataset_files_metadata[dataset.datasetId].badge
-
-                        workspace_id = dataset_files_metadata[
-                            dataset.datasetId
-                        ].workspaceId
-                        if workspace_id:
-                            dataset.workspaceId = workspace_id
-                            workspace = self.get_workspace(dataset.workspaceId)
-                            if workspace:
-                                if self.config.workspace_pattern.allowed(
-                                    workspace.name
-                                ):
-                                    datasets.append(dataset)
-                        elif self.config.ingest_shared_entities:
-                            # If no workspace for dataset we can consider it as shared entity
-                            self.report.shared_entities_count += 1
-                            datasets.append(dataset)
+                    if dataset.datasetId not in dataset_files_metadata:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) (missing file metadata)"
+                        )
+                        continue
+
+                    dataset.workspaceId = dataset_files_metadata[
+                        dataset.datasetId
+                    ].workspaceId
+
+                    dataset.path = dataset_files_metadata[dataset.datasetId].path
+                    dataset.badge = dataset_files_metadata[dataset.datasetId].badge
+
+                    workspace = None
+                    if dataset.workspaceId:
+                        workspace = self.get_workspace(dataset.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.datasets.processed(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                            datasets.append(dataset)
+                        else:
+                            self.report.datasets.dropped(
+                                f"{dataset.name} ({dataset.datasetId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for dataset we can consider it as shared entity
+                        self.report.datasets_without_workspace += 1
+                        self.report.datasets.processed(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
+                        datasets.append(dataset)
+                    else:
+                        self.report.datasets.dropped(
+                            f"{dataset.name} ({dataset.datasetId}) in workspace id {dataset.workspaceId or 'unknown'}"
+                        )
 
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{dataset_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            self.report.number_of_datasets = len(datasets)
+
             return datasets
         except Exception as e:
             self._log_http_error(
@@ -381,34 +402,54 @@ class SigmaAPI:
                 for workbook_dict in response_dict[Constant.ENTRIES]:
                     workbook = Workbook.parse_obj(workbook_dict)
 
-                    if workbook.workbookId in workbook_files_metadata:
-                        workbook.badge = workbook_files_metadata[
-                            workbook.workbookId
-                        ].badge
-
-                        workspace_id = workbook_files_metadata[
-                            workbook.workbookId
-                        ].workspaceId
-                        if workspace_id:
-                            workbook.workspaceId = workspace_id
-                            workspace = self.get_workspace(workbook.workspaceId)
-                            if workspace:
-                                if self.config.workspace_pattern.allowed(
-                                    workspace.name
-                                ):
-                                    workbook.pages = self.get_workbook_pages(workbook)
-                                    workbooks.append(workbook)
-                        elif self.config.ingest_shared_entities:
-                            # If no workspace for workbook we can consider it as shared entity
-                            self.report.shared_entities_count += 1
-                            workbook.pages = self.get_workbook_pages(workbook)
-                            workbooks.append(workbook)
+                    if workbook.workbookId not in workbook_files_metadata:
+                        # Due to a bug in the Sigma API, it seems like the /files endpoint does not
+                        # return file metadata when the user has access via admin permissions. In
+                        # those cases, the user associated with the token needs to be manually added
+                        # to the workspace.
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) (missing file metadata; path: {workbook.path}; likely need to manually add user to workspace)"
+                        )
+                        continue
+
+                    workbook.workspaceId = workbook_files_metadata[
+                        workbook.workbookId
+                    ].workspaceId
+
+                    workbook.badge = workbook_files_metadata[workbook.workbookId].badge
+
+                    workspace = None
+                    if workbook.workspaceId:
+                        workspace = self.get_workspace(workbook.workspaceId)
+
+                    if workspace:
+                        if self.config.workspace_pattern.allowed(workspace.name):
+                            self.report.workbooks.processed(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                            workbook.pages = self.get_workbook_pages(workbook)
+                            workbooks.append(workbook)
+                        else:
+                            self.report.workbooks.dropped(
+                                f"{workbook.name} ({workbook.workbookId}) in {workspace.name}"
+                            )
+                    elif self.config.ingest_shared_entities:
+                        # If no workspace for workbook we can consider it as shared entity
+                        self.report.workbooks_without_workspace += 1
+                        self.report.workbooks.processed(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
+                        workbook.pages = self.get_workbook_pages(workbook)
+                        workbooks.append(workbook)
+                    else:
+                        self.report.workbooks.dropped(
+                            f"{workbook.name} ({workbook.workbookId}) in workspace id {workbook.workspaceId or 'unknown'}"
+                        )
 
                 if response_dict[Constant.NEXTPAGE]:
                     url = f"{workbook_url}?page={response_dict[Constant.NEXTPAGE]}"
                 else:
                     break
-            self.report.number_of_workbooks = len(workbooks)
             return workbooks
         except Exception as e:
             self._log_http_error(
@@ -1,6 +1,5 @@
 import json
 import logging
-import textwrap
 from dataclasses import dataclass
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
@@ -613,6 +612,10 @@ class SlackSource(StatefulIngestionSourceBase):
             ),
         )
 
+    @retry(
+        wait=wait_exponential(multiplier=2, min=4, max=60),
+        before_sleep=before_sleep_log(logger, logging.ERROR, True),
+    )
     def get_user_to_be_updated(
         self,
     ) -> Iterable[Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
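
The retry, wait_exponential, and before_sleep_log names added above match the tenacity library's API (the removed get_user_to_be_updated_oss method below carried the same decorator). A minimal standalone sketch of the same exponential-backoff pattern, with a hypothetical flaky_call and an added stop condition that the decorator in the diff does not have:

import logging

from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential

logger = logging.getLogger(__name__)

_attempts = {"n": 0}

@retry(
    # Exponential backoff between attempts, clamped to the [4s, 60s] window.
    wait=wait_exponential(multiplier=2, min=4, max=60),
    # Log (with traceback) before each sleep, mirroring the decorator above.
    before_sleep=before_sleep_log(logger, logging.ERROR, True),
    # Added for the example; the decorator in the diff retries indefinitely.
    stop=stop_after_attempt(5),
)
def flaky_call() -> str:
    _attempts["n"] += 1
    if _attempts["n"] < 3:
        raise ConnectionError("transient failure")
    return "ok"

if __name__ == "__main__":
    print(flaky_call())  # succeeds on the third attempt, after two backoff sleeps
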
@@ -634,56 +637,5 @@ class SlackSource(StatefulIngestionSourceBase):
                 if user_obj.email is not None:
                     yield (user_obj, editable_properties)
 
-    @retry(
-        wait=wait_exponential(multiplier=2, min=4, max=60),
-        before_sleep=before_sleep_log(logger, logging.ERROR, True),
-    )
-    def get_user_to_be_updated_oss(self) -> Iterable[CorpUser]:
-        graphql_query = textwrap.dedent(
-            """
-            query listUsers($input: ListUsersInput!) {
-                listUsers(input: $input) {
-                    total
-                    users {
-                        urn
-                        editableProperties {
-                            email
-                            slack
-                        }
-                    }
-                }
-            }
-            """
-        )
-        start = 0
-        count = 10
-        total = count
-
-        assert self.ctx.graph is not None
-
-        while start < total:
-            variables = {"input": {"start": start, "count": count}}
-            response = self.ctx.graph.execute_graphql(
-                query=graphql_query, variables=variables
-            )
-            list_users = response.get("listUsers", {})
-            total = list_users.get("total", 0)
-            users = list_users.get("users", [])
-            for user in users:
-                user_obj = CorpUser()
-                editable_properties = user.get("editableProperties", {})
-                user_obj.urn = user.get("urn")
-                if user_obj.urn is None:
-                    continue
-                if editable_properties is not None:
-                    user_obj.email = editable_properties.get("email")
-                if user_obj.email is None:
-                    urn_id = Urn.from_string(user_obj.urn).get_entity_id_as_string()
-                    if "@" in urn_id:
-                        user_obj.email = urn_id
-                if user_obj.email is not None:
-                    yield user_obj
-            start += count
-
     def get_report(self) -> SourceReport:
         return self.report
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Set
 
 import pydantic
-from pydantic import Field, SecretStr, root_validator, validator
+from pydantic import Field, root_validator, validator
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.pattern_utils import UUID_REGEX
@@ -301,6 +301,7 @@ class SnowflakeV2Config(
        default=AllowDenyPattern.allow_all(),
        description=(
            "List of regex patterns for structured properties to include in ingestion."
+           " Applied to tags with form `<database>.<schema>.<tag_name>`."
            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
        ),
    )
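
As the updated description notes, the pattern is matched against the fully qualified `<database>.<schema>.<tag_name>` string. A quick hedged illustration with DataHub's AllowDenyPattern helper; the database, schema, and tag names below are made up:

from datahub.configuration.common import AllowDenyPattern

# Keep only tags defined under ANALYTICS.PUBLIC, drop everything else.
pattern = AllowDenyPattern(allow=[r"ANALYTICS\.PUBLIC\..*"], deny=[])

print(pattern.allowed("ANALYTICS.PUBLIC.PII"))       # True
print(pattern.allowed("RAW.STAGING.INTERNAL_ONLY"))  # False
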
@@ -384,17 +385,6 @@ class SnowflakeV2Config(
 
         return values
 
-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        return SnowflakeConnectionConfig.get_sql_alchemy_url(
-            self, database=database, username=username, password=password, role=role
-        )
-
     @validator("shares")
     def validate_shares(
         cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
@@ -1,4 +1,5 @@
 import logging
+import threading
 from typing import Any, Dict, Optional
 
 import pydantic
@@ -27,7 +28,7 @@ from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthIdentityProvider,
 )
 from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator
-from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.utilities.config_clean import (
     remove_protocol,
     remove_suffix,
@@ -192,23 +193,11 @@ class SnowflakeConnectionConfig(ConfigModel):
                 "but should be set when using use_certificate false for oauth_config"
             )
 
-    def get_sql_alchemy_url(
-        self,
-        database: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[pydantic.SecretStr] = None,
-        role: Optional[str] = None,
-    ) -> str:
-        if username is None:
-            username = self.username
-        if password is None:
-            password = self.password
-        if role is None:
-            role = self.role
+    def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
         return make_sqlalchemy_uri(
             self.scheme,
-            username,
-            password.get_secret_value() if password else None,
+            self.username,
+            self.password.get_secret_value() if self.password else None,
             self.account_id,
             f'"{database}"' if database is not None else database,
             uri_opts={
@@ -217,7 +206,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 for (key, value) in {
                     "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type),
                     "warehouse": self.warehouse,
-                    "role": role,
+                    "role": self.role,
                     "application": _APPLICATION_NAME,
                 }.items()
                 if value
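
For reference, the kind of URL that get_sql_alchemy_url assembles can be sketched with SQLAlchemy's own URL builder. The account, user, and option values below are placeholders, and this is an illustration of the resulting URI shape rather than DataHub's actual make_sqlalchemy_uri helper:

from sqlalchemy.engine import URL

# Placeholder values; a real config would pull these from SnowflakeConnectionConfig.
url = URL.create(
    drivername="snowflake",
    username="INGEST_USER",
    password="s3cret",  # example only
    host="my_account_id",
    database="MY_DB",
    query={
        "authenticator": "SNOWFLAKE",
        "warehouse": "COMPUTE_WH",
        "role": "INGEST_ROLE",
        "application": "acryl_datahub",
    },
)
print(url.render_as_string(hide_password=False))
# e.g. snowflake://INGEST_USER:s3cret@my_account_id/MY_DB?application=acryl_datahub&...
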
@@ -402,13 +391,30 @@ class SnowflakeConnection(Closeable):
     def __init__(self, connection: NativeSnowflakeConnection):
         self._connection = connection
 
+        self._query_num_lock = threading.Lock()
+        self._query_num = 1
+
     def native_connection(self) -> NativeSnowflakeConnection:
         return self._connection
 
+    def get_query_no(self) -> int:
+        with self._query_num_lock:
+            no = self._query_num
+            self._query_num += 1
+            return no
+
     def query(self, query: str) -> Any:
         try:
-            logger.info(f"Query: {query}", stacklevel=2)
+            # We often run multiple queries in parallel across multiple threads,
+            # so we need to number them to help with log readability.
+            query_num = self.get_query_no()
+            logger.info(f"Query #{query_num}: {query}", stacklevel=2)
             resp = self._connection.cursor(DictCursor).execute(query)
+            if resp is not None and resp.rowcount is not None:
+                logger.info(
+                    f"Query #{query_num} got {resp.rowcount} row(s) back from Snowflake",
+                    stacklevel=2,
+                )
             return resp
 
         except Exception as e:
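
The lock-guarded counter added above is a standard way to hand out unique sequence numbers across threads. A minimal standalone sketch of the same idea; the class and method names are illustrative, not part of the connector:

import threading
from concurrent.futures import ThreadPoolExecutor

class QueryNumberer:
    """Hands out monotonically increasing ids, safe to call from many threads."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._next = 1

    def next_no(self) -> int:
        with self._lock:
            no = self._next
            self._next += 1
            return no

numberer = QueryNumberer()
with ThreadPoolExecutor(max_workers=8) as pool:
    ids = list(pool.map(lambda _: numberer.next_no(), range(100)))

assert len(set(ids)) == 100  # no duplicates even under concurrency
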
@@ -135,12 +135,7 @@ class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
     ) -> "DatahubGEProfiler":
         assert db_name
 
-        url = self.config.get_sql_alchemy_url(
-            database=db_name,
-            username=self.config.username,
-            password=self.config.password,
-            role=self.config.role,
-        )
+        url = self.config.get_sql_alchemy_url(database=db_name)
 
         logger.debug(f"sql_alchemy_url={url}")
 
@@ -515,7 +515,10 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 # job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
                 # here
                 query_id=get_query_fingerprint(
-                    res["query_text"], self.identifiers.platform, fast=True
+                    res["query_text"],
+                    self.identifiers.platform,
+                    fast=True,
+                    secondary_id=res["query_secondary_fingerprint"],
                 ),
                 query_text=res["query_text"],
                 upstreams=upstreams,
@@ -654,7 +657,17 @@ WITH
 fingerprinted_queries as (
     SELECT *,
         -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END as query_secondary_fingerprint
     FROM
         snowflake.account_usage.query_history
     WHERE
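
The REGEXP_SUBSTR calls above pull fields out of the JSON blob that Hex appends after a '-- Hex query metadata:' comment marker. The same extraction can be sketched in Python for clarity; the sample query text and field values are made up, and this is an illustration, not DataHub's parser:

import re
from typing import Optional

HEX_MARKER = "-- Hex query metadata:"

def extract_hex_field(query_text: str, key: str) -> Optional[str]:
    """Return the string value of `key` from the Hex metadata comment, if present."""
    if HEX_MARKER not in query_text:
        return None
    match = re.search(rf'"{key}"\s*:\s*"([^"]+)"', query_text)
    return match.group(1) if match else None

sample = 'SELECT 1;\n-- Hex query metadata: {"project_id": "abc-123", "context": "SCHEDULED_RUN"}'
print(extract_hex_field(sample, "project_id"))  # abc-123
print(extract_hex_field(sample, "context"))     # SCHEDULED_RUN
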
@@ -670,11 +683,11 @@ fingerprinted_queries as (
             {time_bucket_size},
             CONVERT_TIMEZONE('UTC', start_time)
         ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
     FROM
         fingerprinted_queries
     QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
 )
 , raw_access_history AS (
     SELECT
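
Adding query_secondary_fingerprint to both window partitions means deduplication now happens per (time bucket, fingerprint, secondary fingerprint), keeping only the most recent row of each group. The same keep-latest-per-key logic, sketched in plain Python with made-up rows:

from typing import Dict, List, Tuple

rows: List[dict] = [
    {"bucket": "2025-01-01T00", "fingerprint": "f1", "secondary": None, "start_time": 1},
    {"bucket": "2025-01-01T00", "fingerprint": "f1", "secondary": None, "start_time": 5},
    {"bucket": "2025-01-01T00", "fingerprint": "f1", "secondary": "hex-abc", "start_time": 3},
]

latest: Dict[Tuple, dict] = {}
counts: Dict[Tuple, int] = {}
for row in rows:
    key = (row["bucket"], row["fingerprint"], row["secondary"])
    counts[key] = counts.get(key, 0) + 1
    # Keep the most recent row per key, mirroring ROW_NUMBER() ... ORDER BY start_time DESC = 1.
    if key not in latest or row["start_time"] > latest[key]["start_time"]:
        latest[key] = row

# Two distinct groups survive: the plain query and the Hex-tagged variant.
assert len(latest) == 2
assert counts[("2025-01-01T00", "f1", None)] == 2
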
@@ -714,6 +727,7 @@ fingerprinted_queries as (
     q.bucket_start_time,
     q.query_id,
     q.query_fingerprint,
+    q.query_secondary_fingerprint,
     q.query_count,
     q.session_id AS "SESSION_ID",
     q.start_time AS "QUERY_START_TIME",
@@ -71,14 +71,6 @@ class SnowflakeQuery:
     def current_warehouse() -> str:
         return "select CURRENT_WAREHOUSE()"
 
-    @staticmethod
-    def current_database() -> str:
-        return "select CURRENT_DATABASE()"
-
-    @staticmethod
-    def current_schema() -> str:
-        return "select CURRENT_SCHEMA()"
-
     @staticmethod
     def show_databases() -> str:
         return "show databases"
@@ -107,8 +99,8 @@ class SnowflakeQuery:
         order by database_name"""
 
     @staticmethod
-    def schemas_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def schemas_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
         SELECT schema_name AS "SCHEMA_NAME",
             created AS "CREATED",
@@ -119,8 +111,8 @@ class SnowflakeQuery:
         order by schema_name"""
 
     @staticmethod
-    def tables_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def tables_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
        SELECT table_catalog AS "TABLE_CATALOG",
            table_schema AS "TABLE_SCHEMA",
@@ -142,8 +134,8 @@ class SnowflakeQuery:
         order by table_schema, table_name"""
 
     @staticmethod
-    def tables_for_schema(schema_name: str, db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def tables_for_schema(schema_name: str, db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
        SELECT table_catalog AS "TABLE_CATALOG",
            table_schema AS "TABLE_SCHEMA",
@@ -165,8 +157,8 @@ class SnowflakeQuery:
         order by table_schema, table_name"""
 
     @staticmethod
-    def procedures_for_database(db_name: Optional[str]) -> str:
-        db_clause = f'"{db_name}".' if db_name is not None else ""
+    def procedures_for_database(db_name: str) -> str:
+        db_clause = f'"{db_name}".'
         return f"""
        SELECT procedure_catalog AS "PROCEDURE_CATALOG",
            procedure_schema AS "PROCEDURE_SCHEMA",
@@ -382,26 +374,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         ORDER BY query_start_time DESC
         ;"""
 
-    @staticmethod
-    def view_dependencies() -> str:
-        return """
-        SELECT
-            concat(
-                referenced_database, '.', referenced_schema,
-                '.', referenced_object_name
-            ) AS "VIEW_UPSTREAM",
-            referenced_object_domain as "REFERENCED_OBJECT_DOMAIN",
-            concat(
-                referencing_database, '.', referencing_schema,
-                '.', referencing_object_name
-            ) AS "DOWNSTREAM_VIEW",
-            referencing_object_domain AS "REFERENCING_OBJECT_DOMAIN"
-        FROM
-            snowflake.account_usage.object_dependencies
-        WHERE
-            referencing_object_domain in ('VIEW', 'MATERIALIZED VIEW')
-        """
-
     # Note on use of `upstreams_deny_pattern` to ignore temporary tables:
     # Snowflake access history may include temporary tables in DIRECT_OBJECTS_ACCESSED and
     # OBJECTS_MODIFIED->columns->directSources. We do not need these temporary tables and filter these in the query.
@@ -425,32 +397,6 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
             upstreams_deny_pattern,
         )
 
-    @staticmethod
-    def view_dependencies_v2() -> str:
-        return """
-        SELECT
-            ARRAY_UNIQUE_AGG(
-                OBJECT_CONSTRUCT(
-                    'upstream_object_name', concat(
-                        referenced_database, '.', referenced_schema,
-                        '.', referenced_object_name
-                    ),
-                    'upstream_object_domain', referenced_object_domain
-                )
-            ) as "UPSTREAM_TABLES",
-            concat(
-                referencing_database, '.', referencing_schema,
-                '.', referencing_object_name
-            ) AS "DOWNSTREAM_TABLE_NAME",
-            ANY_VALUE(referencing_object_domain) AS "DOWNSTREAM_TABLE_DOMAIN"
-        FROM
-            snowflake.account_usage.object_dependencies
-        WHERE
-            referencing_object_domain in ('VIEW', 'MATERIALIZED VIEW')
-        GROUP BY
-            DOWNSTREAM_TABLE_NAME
-        """
-
     @staticmethod
     def show_external_tables() -> str:
         return "show external tables in account"
@@ -1000,4 +946,4 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
         from_clause = (
             f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
         )
-        return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""
+        return f"""SHOW STREAMS IN DATABASE "{db_name}" LIMIT {limit} {from_clause};"""
@@ -23,6 +23,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.structured import (
     StructuredPropertyDefinition,
 )
+from datahub.metadata.schema_classes import ChangeTypeClass
 from datahub.metadata.urns import (
     ContainerUrn,
     DatasetUrn,
@@ -81,7 +82,7 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
     def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
         for tag in self.data_dictionary.get_all_tags():
             if not self.config.structured_property_pattern.allowed(
-                tag.tag_identifier()
+                tag._id_prefix_as_str()
             ):
                 continue
             if self.config.extract_tags_as_structured_properties:
@@ -111,6 +112,8 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
             yield MetadataChangeProposalWrapper(
                 entityUrn=urn,
                 aspect=aspect,
+                changeType=ChangeTypeClass.CREATE,
+                headers={"If-None-Match": "*"},
             ).as_workunit()
 
     def _get_tags_on_object_with_propagation(
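
The combination of changeType CREATE and an If-None-Match: * header above asks the server to write the aspect only when it does not already exist. A hedged sketch of emitting such a proposal with the SDK; the GMS endpoint, dataset URN, and properties are placeholders, and exact conditional-write behaviour depends on the DataHub server version:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import ChangeTypeClass, DatasetPropertiesClass

# Placeholder endpoint; requires a reachable DataHub GMS instance.
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,MY_DB.MY_SCHEMA.MY_TABLE,PROD)",
    aspect=DatasetPropertiesClass(description="created only if absent"),
    changeType=ChangeTypeClass.CREATE,
    headers={"If-None-Match": "*"},  # ask the server to skip the write if the aspect exists
)
emitter.emit(mcp)
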
@@ -35,13 +35,14 @@ from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
 )
-from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
     gen_database_key,
 )
+from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
@@ -145,7 +145,11 @@ class ClickHouseConfig(
     )
     include_materialized_views: Optional[bool] = Field(default=True, description="")
 
-    def get_sql_alchemy_url(self, current_db=None):
+    def get_sql_alchemy_url(
+        self,
+        uri_opts: Optional[Dict[str, Any]] = None,
+        current_db: Optional[str] = None,
+    ) -> str:
         url = make_url(
             super().get_sql_alchemy_url(uri_opts=self.uri_opts, current_db=current_db)
         )
@@ -1,4 +1,6 @@
 # This import verifies that the dependencies are available.
+from typing import Any, Dict, Optional
+
 import pydruid  # noqa: F401
 from pydantic.fields import Field
 from pydruid.db.sqlalchemy import DruidDialect
@@ -38,8 +40,11 @@ class DruidConfig(BasicSQLAlchemyConfig):
         description="regex patterns for schemas to filter in ingestion.",
     )
 
-    def get_sql_alchemy_url(self):
-        return f"{super().get_sql_alchemy_url()}/druid/v2/sql/"
+    def get_sql_alchemy_url(
+        self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
+    ) -> str:
+        base_url = super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
+        return f"{base_url}/druid/v2/sql/"
 
     """
     The pydruid library already formats the table name correctly, so we do not