acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/METADATA +2480 -2480
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/RECORD +54 -54
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/forms/forms.py +34 -35
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/delete_cli.py +1 -1
- datahub/cli/docker_cli.py +2 -2
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +44 -52
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +3 -1
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
- datahub/metadata/_urns/urn_defs.py +1786 -1786
- datahub/metadata/schema.avsc +17364 -16988
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/testing/check_imports.py +1 -1
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_core.py
CHANGED

@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import logging
 import re
@@ -12,16 +13,15 @@ from pydantic import BaseModel, Field, validator

 from datahub.configuration.git import GitReference
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -40,19 +40,28 @@ from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult
 logger = logging.getLogger(__name__)


+@dataclasses.dataclass
+class DBTCoreReport(DBTSourceReport):
+    catalog_info: Optional[dict] = None
+    manifest_info: Optional[dict] = None
+
+
 class DBTCoreConfig(DBTCommonConfig):
     manifest_path: str = Field(
-        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json
-        "
+        description="Path to dbt manifest JSON. See https://docs.getdbt.com/reference/artifacts/manifest-json. "
+        "This can be a local file or a URI."
     )
-    catalog_path: str = Field(
-
-        "
+    catalog_path: Optional[str] = Field(
+        None,
+        description="Path to dbt catalog JSON. See https://docs.getdbt.com/reference/artifacts/catalog-json. "
+        "This file is optional, but highly recommended. Without it, some metadata like column info will be incomplete or missing. "
+        "This can be a local file or a URI.",
     )
     sources_path: Optional[str] = Field(
         default=None,
-        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json.
-        "specified, last-modified fields will not be populated.
+        description="Path to dbt sources JSON. See https://docs.getdbt.com/reference/artifacts/sources-json. "
+        "If not specified, last-modified fields will not be populated. "
+        "This can be a local file or a URI.",
     )
     run_results_paths: List[str] = Field(
         default=[],
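For context (not part of the published diff): with catalog_path now Optional, a dbt-core config parses without a catalog file. A minimal sketch, assuming only the fields shown in the hunk above plus DBTCommonConfig's required target_platform; the paths and platform values are illustrative.

from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig

config = DBTCoreConfig.parse_obj(
    {
        "manifest_path": "target/manifest.json",
        # "catalog_path" omitted: allowed after this change, though column-level
        # metadata will be incomplete or missing without a catalog file.
        "target_platform": "postgres",
    }
)
assert config.catalog_path is None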
@@ -161,7 +170,7 @@ def get_columns(

 def extract_dbt_entities(
     all_manifest_entities: Dict[str, Dict[str, Any]],
-    all_catalog_entities: Dict[str, Dict[str, Any]],
+    all_catalog_entities: Optional[Dict[str, Dict[str, Any]]],
     sources_results: List[Dict[str, Any]],
     manifest_adapter: str,
     use_identifiers: bool,
@@ -186,15 +195,6 @@ def extract_dbt_entities(
     ):
         name = manifest_node["alias"]

-        # initialize comment to "" for consistency with descriptions
-        # (since dbt null/undefined descriptions as "")
-        comment = ""
-
-        if key in all_catalog_entities and all_catalog_entities[key]["metadata"].get(
-            "comment"
-        ):
-            comment = all_catalog_entities[key]["metadata"]["comment"]
-
         materialization = None
         if "materialized" in manifest_node.get("config", {}):
             # It's a model
@@ -204,8 +204,9 @@ def extract_dbt_entities(
         if "depends_on" in manifest_node and "nodes" in manifest_node["depends_on"]:
             upstream_nodes = manifest_node["depends_on"]["nodes"]

-        catalog_node = all_catalog_entities.get(key)
-
+        catalog_node = (
+            all_catalog_entities.get(key) if all_catalog_entities is not None else None
+        )
         missing_from_catalog = catalog_node is None
         catalog_type = None
@@ -214,16 +215,23 @@ def extract_dbt_entities(
                 # Test and ephemeral nodes will never show up in the catalog.
                 missing_from_catalog = False
             else:
-                if not only_include_if_in_catalog:
+                if all_catalog_entities is not None and not only_include_if_in_catalog:
+                    # If the catalog file is missing, we have already generated a general message.
                     report.warning(
                         title="Node missing from catalog",
                         message="Found a node in the manifest file but not in the catalog. "
                         "This usually means the catalog file was not generated by `dbt docs generate` and so is incomplete. "
-                        "Some metadata,
+                        "Some metadata, particularly schema information, will be impacted.",
                         context=key,
                     )
         else:
-            catalog_type =
+            catalog_type = catalog_node["metadata"]["type"]
+
+        # initialize comment to "" for consistency with descriptions
+        # (since dbt null/undefined descriptions as "")
+        comment = ""
+        if catalog_node is not None and catalog_node.get("metadata", {}).get("comment"):
+            comment = catalog_node["metadata"]["comment"]

         query_tag_props = manifest_node.get("query_tag", {})
@@ -231,12 +239,15 @@ def extract_dbt_entities(

         owner = meta.get("owner")
         if owner is None:
-            owner = manifest_node.get("config", {}).get("meta", {}).get("owner")
+            owner = (manifest_node.get("config", {}).get("meta") or {}).get("owner")
+
+        if not meta:
+            # On older versions of dbt, the meta field was nested under config
+            # for some node types.
+            meta = manifest_node.get("config", {}).get("meta") or {}

         tags = manifest_node.get("tags", [])
         tags = [tag_prefix + tag for tag in tags]
-        if not meta:
-            meta = manifest_node.get("config", {}).get("meta", {})

         max_loaded_at_str = sources_by_id.get(key, {}).get("max_loaded_at")
         max_loaded_at = None
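Aside (not from the diff): the switch from .get("meta", {}) to (.get("meta") or {}) matters because a manifest can carry an explicit "meta": null, and a dict .get default only applies when the key is absent, not when its value is None. A minimal illustration:

node = {"config": {"meta": None}}
node.get("config", {}).get("meta", {})       # -> None: the {} default is not used
(node.get("config", {}).get("meta") or {})   # -> {}: safe to chain .get("owner")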
@@ -453,15 +464,18 @@ def load_run_results(
 @platform_name("dbt")
 @config_class(DBTCoreConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 class DBTCoreSource(DBTSourceBase, TestableSource):
     config: DBTCoreConfig
+    report: DBTCoreReport
+
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
+        self.report = DBTCoreReport()

     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCoreConfig.parse_obj(config_dict)
-        return cls(config, ctx
+        return cls(config, ctx)
@@ -471,9 +485,10 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             DBTCoreSource.load_file_as_json(
                 source_config.manifest_path, source_config.aws_connection
             )
-            DBTCoreSource.load_file_as_json(
-                source_config.catalog_path, source_config.aws_connection
-            )
+            if source_config.catalog_path is not None:
+                DBTCoreSource.load_file_as_json(
+                    source_config.catalog_path, source_config.aws_connection
+                )
             test_report.basic_connectivity = CapabilityReport(capable=True)
         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(
@@ -511,11 +526,31 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         dbt_manifest_json = self.load_file_as_json(
             self.config.manifest_path, self.config.aws_connection
         )
-
-        dbt_catalog_json = self.load_file_as_json(
-            self.config.catalog_path, self.config.aws_connection
+        dbt_manifest_metadata = dbt_manifest_json["metadata"]
+        self.report.manifest_info = dict(
+            generated_at=dbt_manifest_metadata.get("generated_at", "unknown"),
+            dbt_version=dbt_manifest_metadata.get("dbt_version", "unknown"),
+            project_name=dbt_manifest_metadata.get("project_name", "unknown"),
         )

+        dbt_catalog_json = None
+        dbt_catalog_metadata = None
+        if self.config.catalog_path is not None:
+            dbt_catalog_json = self.load_file_as_json(
+                self.config.catalog_path, self.config.aws_connection
+            )
+            dbt_catalog_metadata = dbt_catalog_json.get("metadata", {})
+            self.report.catalog_info = dict(
+                generated_at=dbt_catalog_metadata.get("generated_at", "unknown"),
+                dbt_version=dbt_catalog_metadata.get("dbt_version", "unknown"),
+                project_name=dbt_catalog_metadata.get("project_name", "unknown"),
+            )
+        else:
+            self.report.warning(
+                title="No catalog file configured",
+                message="Some metadata, particularly schema information, will be missing.",
+            )
+
         if self.config.sources_path is not None:
             dbt_sources_json = self.load_file_as_json(
                 self.config.sources_path, self.config.aws_connection
@@ -528,18 +563,23 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         manifest_version = dbt_manifest_json["metadata"].get("dbt_version")
         manifest_adapter = dbt_manifest_json["metadata"].get("adapter_type")

-        catalog_schema = dbt_catalog_json["metadata"].get("dbt_schema_version")
-        catalog_version = dbt_catalog_json["metadata"].get("dbt_version")
+        catalog_schema = None
+        catalog_version = None
+        if dbt_catalog_metadata is not None:
+            catalog_schema = dbt_catalog_metadata.get("dbt_schema_version")
+            catalog_version = dbt_catalog_metadata.get("dbt_version")

         manifest_nodes = dbt_manifest_json["nodes"]
         manifest_sources = dbt_manifest_json["sources"]

         all_manifest_entities = {**manifest_nodes, **manifest_sources}

-        catalog_nodes = dbt_catalog_json["nodes"]
-        catalog_sources = dbt_catalog_json["sources"]
+        all_catalog_entities = None
+        if dbt_catalog_json is not None:
+            catalog_nodes = dbt_catalog_json["nodes"]
+            catalog_sources = dbt_catalog_json["sources"]

-        all_catalog_entities = {**catalog_nodes, **catalog_sources}
+            all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
             all_manifest_entities=all_manifest_entities,
@@ -590,7 +630,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
             )
         except Exception as e:
             self.report.info(
-                title="
+                title="dbt Catalog Version",
                 message="Failed to determine the catalog version",
                 exc=e,
             )
datahub/ingestion/source/feast.py
CHANGED

@@ -135,10 +135,10 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

-    - Entities as [`MLPrimaryKey`](https://
-    - Fields as [`MLFeature`](https://
-    - Feature views and on-demand feature views as [`MLFeatureTable`](https://
-    - Batch and stream source details as [`Dataset`](https://
+    - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
+    - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
+    - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
+    - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
     - Column types associated with each entity and feature
     """
datahub/ingestion/source/iceberg/iceberg_common.py
CHANGED

@@ -40,11 +40,11 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)

-    def send(self, request, **kwargs):
+    def send(self, request, *args, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None and hasattr(self, "timeout"):
             kwargs["timeout"] = self.timeout
-        return super().send(request, **kwargs)
+        return super().send(request, *args, **kwargs)


 class IcebergProfilingConfig(ConfigModel):
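For context (not part of the published diff): requests' HTTPAdapter.send takes several parameters after request (stream, timeout, verify, cert, proxies), so an override that only accepts **kwargs breaks if any of them are ever passed positionally; forwarding *args keeps the subclass compatible. A self-contained sketch of the same pattern, using a hypothetical DefaultTimeoutAdapter name and default value:

import requests
from requests.adapters import HTTPAdapter

class DefaultTimeoutAdapter(HTTPAdapter):
    """Applies a default timeout whenever the caller does not supply one."""

    def __init__(self, *args, timeout: float = 10.0, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, *args, **kwargs):
        # Fill in the timeout only when it was not explicitly provided.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, *args, **kwargs)

session = requests.Session()
session.mount("https://", DefaultTimeoutAdapter(timeout=30.0))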
datahub/ingestion/source/ldap.py
CHANGED

datahub/ingestion/source/looker/looker_lib_wrapper.py
CHANGED

@@ -113,7 +113,7 @@ class LookerAPI:
             )
         except SDKError as e:
             raise ConfigurationError(
-
+                "Failed to connect/authenticate with looker - check your configuration"
             ) from e

         self.client_stats = LookerAPIStats()
datahub/ingestion/source/looker/lookml_source.py
CHANGED

@@ -497,7 +497,13 @@ class LookMLSource(StatefulIngestionSourceBase):
                     f"Failed to find a project name for model {model_name}"
                 )
             return model.project_name
-        except SDKError:
+        except SDKError as e:
+            self.reporter.failure(
+                title="Failed to find a project name for model",
+                message="Consider configuring a static project name in your config file",
+                context=str(dict(model_name=model_name)),
+                exc=e,
+            )
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
datahub/ingestion/source/mode.py
CHANGED

@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
 import pydantic
@@ -203,6 +203,10 @@ class HTTPError429(HTTPError):
     pass


+class HTTPError504(HTTPError):
+    pass
+
+
 ModeRequestError = (HTTPError, JSONDecodeError)
@@ -217,6 +221,9 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_query_template_render: int = 0
     num_query_template_render_failures: int = 0
     num_query_template_render_success: int = 0
+    num_requests_exceeding_rate_limit: int = 0
+    num_requests_retried_on_timeout: int = 0
+    num_spaces_retrieved: int = 0

     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
@@ -456,9 +463,23 @@ class ModeSource(StatefulIngestionSourceBase):
         # Datasets
         datasets = []
         for imported_dataset_name in report_info.get("imported_datasets", {}):
-            mode_dataset = self._get_request_json(
-                f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
-            )
+            try:
+                mode_dataset = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
+                )
+            except HTTPError as http_error:
+                status_code = http_error.response.status_code
+                if status_code == 404:
+                    self.report.report_warning(
+                        title="Report Not Found",
+                        message="Referenced report for reusable dataset was not found.",
+                        context=f"Report: {report_info.get('id')}, "
+                        f"Imported Dataset Report: {imported_dataset_name.get('token')}",
+                    )
+                    continue
+                else:
+                    raise http_error

             dataset_urn = builder.make_dataset_urn_with_platform_instance(
                 self.platform,
                 str(mode_dataset.get("id")),
@@ -562,29 +583,34 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            )
+            for spaces_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
+            ):
+                logger.debug(
+                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                )
+                self.report.num_spaces_retrieved += len(spaces_page)
+                for s in spaces_page:
+                    logger.debug(f"Space: {s.get('name')}")
+                    space_name = s.get("name", "")
+                    # Using both restricted and default_access_level because
+                    # there is a current bug with restricted returning False everytime
+                    # which has been reported to Mode team
+                    if self.config.exclude_restricted and (
+                        s.get("restricted")
+                        or s.get("default_access_level") == "restricted"
+                    ):
+                        logging.debug(
+                            f"Skipping space {space_name} due to exclude restricted"
+                        )
+                        continue
+                    if not self.config.space_pattern.allowed(space_name):
+                        self.report.report_dropped_space(space_name)
+                        logging.debug(
+                            f"Skipping space {space_name} due to space pattern"
+                        )
+                        continue
+                    space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
@@ -1475,13 +1501,28 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return charts

+    def _get_paged_request_json(
+        self, url: str, key: str, per_page: int
+    ) -> Iterator[List[Dict]]:
+        page: int = 1
+        while True:
+            page_url = f"{url}&per_page={per_page}&page={page}"
+            response = self._get_request_json(page_url)
+            data: List[Dict] = response.get("_embedded", {}).get(key, [])
+            if not data:
+                break
+            yield data
+            page += 1
+
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
                 multiplier=self.config.api_options.retry_backoff_multiplier,
                 max=self.config.api_options.max_retry_interval,
             ),
-            retry=retry_if_exception_type(HTTPError429),
+            retry=retry_if_exception_type(
+                (HTTPError429, HTTPError504, ConnectionError)
+            ),
             stop=stop_after_attempt(self.config.api_options.max_attempts),
         )
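A usage sketch (illustrative, mirroring the spaces hunk above): the generator unwraps Mode's HAL-style "_embedded" envelope one page at a time and stops at the first empty page. Note that it appends &per_page=...&page=... directly, so it assumes the URL already carries a query string, as ?filter=all does above.

for page in self._get_paged_request_json(
    f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
):
    for space in page:
        logger.debug(f"Space token={space.get('token')} name={space.get('name')}")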
@@ -1502,11 +1543,16 @@ class ModeSource(StatefulIngestionSourceBase):
         except HTTPError as http_error:
             error_response = http_error.response
             if error_response.status_code == 429:
+                self.report.num_requests_exceeding_rate_limit += 1
                 # respect Retry-After
                 sleep_time = error_response.headers.get("retry-after")
                 if sleep_time is not None:
                     time.sleep(float(sleep_time))
                 raise HTTPError429 from None
+            elif error_response.status_code == 504:
+                self.report.num_requests_retried_on_timeout += 1
+                time.sleep(0.1)
+                raise HTTPError504 from None

             logger.debug(
                 f"Error response ({error_response.status_code}): {error_response.text}"