acryl-datahub 1.0.0.3rc9__py3-none-any.whl → 1.0.0.3rc10__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, exactly as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (54)
  1. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/METADATA +2480 -2480
  2. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/RECORD +54 -54
  3. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datajob/dataflow.py +3 -3
  7. datahub/api/entities/forms/forms.py +34 -35
  8. datahub/api/graphql/assertion.py +1 -1
  9. datahub/api/graphql/operation.py +4 -4
  10. datahub/cli/delete_cli.py +1 -1
  11. datahub/cli/docker_cli.py +2 -2
  12. datahub/configuration/source_common.py +1 -1
  13. datahub/emitter/request_helper.py +116 -3
  14. datahub/emitter/rest_emitter.py +44 -52
  15. datahub/ingestion/api/source.py +2 -5
  16. datahub/ingestion/glossary/classification_mixin.py +4 -2
  17. datahub/ingestion/graph/client.py +3 -1
  18. datahub/ingestion/graph/config.py +1 -0
  19. datahub/ingestion/graph/filters.py +1 -1
  20. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  21. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  22. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  23. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  24. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  25. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  26. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  27. datahub/ingestion/source/feast.py +4 -4
  28. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  29. datahub/ingestion/source/ldap.py +1 -1
  30. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  31. datahub/ingestion/source/looker/lookml_source.py +7 -1
  32. datahub/ingestion/source/mode.py +74 -28
  33. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  34. datahub/ingestion/source/powerbi/config.py +1 -1
  35. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  36. datahub/ingestion/source/redshift/usage.py +10 -9
  37. datahub/ingestion/source/sql/clickhouse.py +5 -1
  38. datahub/ingestion/source/sql/druid.py +7 -2
  39. datahub/ingestion/source/sql/oracle.py +6 -2
  40. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  41. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  42. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  43. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +490 -490
  44. datahub/metadata/_urns/urn_defs.py +1786 -1786
  45. datahub/metadata/schema.avsc +17364 -16988
  46. datahub/metadata/schema_classes.py +3 -3
  47. datahub/metadata/schemas/__init__.py +3 -3
  48. datahub/testing/check_imports.py +1 -1
  49. datahub/utilities/logging_manager.py +8 -1
  50. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  51. datahub/utilities/urn_encoder.py +1 -1
  52. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/entry_points.txt +0 -0
  53. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/licenses/LICENSE +0 -0
  54. {acryl_datahub-1.0.0.3rc9.dist-info → acryl_datahub-1.0.0.3rc10.dist-info}/top_level.txt +0 -0
datahub/emitter/request_helper.py
@@ -1,14 +1,31 @@
+import json
 import shlex
-from typing import List, Optional, Union
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
 
 import requests
 from requests.auth import HTTPBasicAuth
 
+from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
+    MetadataChangeProposal,
+)
+from datahub.metadata.schema_classes import ChangeTypeClass
+
+
+def _decode_bytes(value: Union[str, bytes]) -> str:
+    """Decode bytes to string, if necessary."""
+    if isinstance(value, bytes):
+        return value.decode()
+    return value
+
 
 def _format_header(name: str, value: Union[str, bytes]) -> str:
     if name == "Authorization":
         return f"{name!s}: <redacted>"
-    return f"{name!s}: {value!s}"
+    return f"{name!s}: {_decode_bytes(value)}"
 
 
 def make_curl_command(
@@ -21,7 +38,9 @@ def make_curl_command(
 
     if session.auth:
         if isinstance(session.auth, HTTPBasicAuth):
-            fragments.extend(["-u", f"{session.auth.username}:<redacted>"])
+            fragments.extend(
+                ["-u", f"{_decode_bytes(session.auth.username)}:<redacted>"]
+            )
         else:
             # For other auth types, they should be handled via headers
             fragments.extend(["-H", "<unknown auth type>"])
@@ -31,3 +50,97 @@
 
     fragments.append(url)
     return shlex.join(fragments)
+
+
+@dataclass
+class OpenApiRequest:
+    """Represents an OpenAPI request for entity operations."""
+
+    method: str
+    url: str
+    payload: List[Dict[str, Any]]
+
+    @classmethod
+    def from_mcp(
+        cls,
+        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
+        gms_server: str,
+        async_flag: Optional[bool] = None,
+        async_default: bool = False,
+    ) -> Optional["OpenApiRequest"]:
+        """Factory method to create an OpenApiRequest from a MetadataChangeProposal."""
+        if not mcp.aspectName or (
+            mcp.changeType != ChangeTypeClass.DELETE and not mcp.aspect
+        ):
+            return None
+
+        resolved_async_flag = async_flag if async_flag is not None else async_default
+
+        method = "post"
+        url = f"{gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+        payload = []
+
+        if mcp.changeType == ChangeTypeClass.DELETE:
+            method = "delete"
+            url = f"{gms_server}/openapi/v3/entity/{mcp.entityType}/{mcp.entityUrn}"
+        else:
+            if mcp.aspect:
+                if mcp.changeType == ChangeTypeClass.PATCH:
+                    method = "patch"
+                    obj = mcp.aspect.to_obj()
+                    content_type = obj.get("contentType")
+                    if obj.get("value") and content_type == JSON_PATCH_CONTENT_TYPE:
+                        # Undo double serialization.
+                        obj = json.loads(obj["value"])
+                        patch_value = obj
+                    else:
+                        raise NotImplementedError(
+                            f"ChangeType {mcp.changeType} only supports context type {JSON_PATCH_CONTENT_TYPE}, found {content_type}."
+                        )
+
+                    if isinstance(patch_value, list):
+                        patch_value = {"patch": patch_value}
+
+                    payload = [
+                        {
+                            "urn": mcp.entityUrn,
+                            mcp.aspectName: {
+                                "value": patch_value,
+                                "systemMetadata": mcp.systemMetadata.to_obj()
+                                if mcp.systemMetadata
+                                else None,
+                            },
+                        }
+                    ]
+                else:
+                    if isinstance(mcp, MetadataChangeProposalWrapper):
+                        aspect_value = pre_json_transform(
+                            mcp.to_obj(simplified_structure=True)
+                        )["aspect"]["json"]
+                    else:
+                        obj = mcp.aspect.to_obj()
+                        content_type = obj.get("contentType")
+                        if obj.get("value") and content_type == JSON_CONTENT_TYPE:
+                            # Undo double serialization.
+                            obj = json.loads(obj["value"])
+                        elif content_type == JSON_PATCH_CONTENT_TYPE:
+                            raise NotImplementedError(
+                                f"ChangeType {mcp.changeType} does not support patch."
+                            )
+                        aspect_value = pre_json_transform(obj)
+
+                    payload = [
+                        {
+                            "urn": mcp.entityUrn,
+                            mcp.aspectName: {
+                                "value": aspect_value,
+                                "systemMetadata": mcp.systemMetadata.to_obj()
+                                if mcp.systemMetadata
+                                else None,
+                            },
+                        }
+                    ]
+            else:
+                raise ValueError(f"ChangeType {mcp.changeType} requires a value.")
+
+        return cls(method=method, url=url, payload=payload)
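For orientation, a rough sketch of how the new factory might be exercised; the URN and server address are placeholders, and UPSERT is the wrapper's default change type:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.request_helper import OpenApiRequest
    from datahub.metadata.schema_classes import StatusClass

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
        aspect=StatusClass(removed=False),
    )

    # Placeholder GMS address; from_mcp only formats it into the URL.
    request = OpenApiRequest.from_mcp(mcp, gms_server="http://localhost:8080")
    if request:
        # UPSERTs map to POST against the v3 entity endpoint, async=false by default.
        assert request.method == "post"
        assert request.url.startswith("http://localhost:8080/openapi/v3/entity/dataset")
        assert request.payload[0]["urn"] == mcp.entityUrn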
datahub/emitter/rest_emitter.py
@@ -41,10 +41,9 @@ from datahub.configuration.common (
     TraceTimeoutError,
     TraceValidationError,
 )
-from datahub.emitter.aspect import JSON_CONTENT_TYPE, JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.request_helper import make_curl_command
+from datahub.emitter.request_helper import OpenApiRequest, make_curl_command
 from datahub.emitter.response_helper import (
     TraceData,
     extract_trace_data,
@@ -348,43 +347,24 @@ class DataHubRestEmitter(Closeable, Emitter):
         mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
         async_flag: Optional[bool] = None,
         async_default: bool = False,
-    ) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
-        if mcp.aspect and mcp.aspectName:
-            resolved_async_flag = (
-                async_flag if async_flag is not None else async_default
-            )
-            url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={'true' if resolved_async_flag else 'false'}"
+    ) -> Optional[OpenApiRequest]:
+        """
+        Convert a MetadataChangeProposal to an OpenAPI request format.
 
-            if isinstance(mcp, MetadataChangeProposalWrapper):
-                aspect_value = pre_json_transform(
-                    mcp.to_obj(simplified_structure=True)
-                )["aspect"]["json"]
-            else:
-                obj = mcp.aspect.to_obj()
-                content_type = obj.get("contentType")
-                if obj.get("value") and content_type == JSON_CONTENT_TYPE:
-                    # Undo double serialization.
-                    obj = json.loads(obj["value"])
-                elif content_type == JSON_PATCH_CONTENT_TYPE:
-                    raise NotImplementedError(
-                        "Patches are not supported for OpenAPI ingestion. Set the endpoint to RESTLI."
-                    )
-                aspect_value = pre_json_transform(obj)
-            return (
-                url,
-                [
-                    {
-                        "urn": mcp.entityUrn,
-                        mcp.aspectName: {
-                            "value": aspect_value,
-                            "systemMetadata": mcp.systemMetadata.to_obj()
-                            if mcp.systemMetadata
-                            else None,
-                        },
-                    }
-                ],
-            )
-        return None
+        Args:
+            mcp: The metadata change proposal
+            async_flag: Optional flag to override async behavior
+            async_default: Default async behavior if not specified
+
+        Returns:
+            An OpenApiRequest object or None if the MCP doesn't have required fields
+        """
+        return OpenApiRequest.from_mcp(
+            mcp=mcp,
+            gms_server=self._gms_server,
+            async_flag=async_flag,
+            async_default=async_default,
+        )
 
     def emit(
         self,
@@ -448,7 +428,9 @@ class DataHubRestEmitter(Closeable, Emitter):
         if self._openapi_ingestion:
             request = self._to_openapi_request(mcp, async_flag, async_default=False)
             if request:
-                response = self._emit_generic(request[0], payload=request[1])
+                response = self._emit_generic(
+                    request.url, payload=request.payload, method=request.method
+                )
 
                 if self._should_trace(async_flag, trace_flag):
                     trace_data = extract_trace_data(response) if response else None
@@ -503,31 +485,36 @@ class DataHubRestEmitter(Closeable, Emitter):
         trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
     ) -> int:
         """
-        1. Grouping MCPs by their entity URL
+        1. Grouping MCPs by their HTTP method and entity URL
         2. Breaking down large batches into smaller chunks based on both:
            * Total byte size (INGEST_MAX_PAYLOAD_BYTES)
            * Maximum number of items (BATCH_INGEST_MAX_PAYLOAD_LENGTH)
 
         The Chunk class encapsulates both the items and their byte size tracking
-        Serializing the items only once with json.dumps(request[1]) and reusing that
+        Serializing the items only once with json.dumps(request.payload) and reusing that
         The chunking logic handles edge cases (always accepting at least one item per chunk)
         The joining logic is efficient with a simple string concatenation
 
         :param mcps: metadata change proposals to transmit
         :param async_flag: the mode
+        :param trace_flag: whether to trace the requests
+        :param trace_timeout: timeout for tracing
         :return: number of requests
         """
-        # group by entity url
-        batches: Dict[str, List[_Chunk]] = defaultdict(
+        # Group by entity URL and HTTP method
+        batches: Dict[Tuple[str, str], List[_Chunk]] = defaultdict(
             lambda: [_Chunk(items=[])]
         )  # Initialize with one empty Chunk
 
         for mcp in mcps:
             request = self._to_openapi_request(mcp, async_flag, async_default=True)
             if request:
-                current_chunk = batches[request[0]][-1]  # Get the last chunk
-                # Only serialize once
-                serialized_item = json.dumps(request[1][0])
+                # Create a composite key with both method and URL
+                key = (request.method, request.url)
+                current_chunk = batches[key][-1]  # Get the last chunk
+
+                # Only serialize once - we're serializing a single payload item
+                serialized_item = json.dumps(request.payload[0])
                 item_bytes = len(serialized_item.encode())
 
                 # If adding this item would exceed max_bytes, create a new chunk
@@ -537,15 +524,17 @@ class DataHubRestEmitter(Closeable, Emitter):
                     or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
                 ):
                     new_chunk = _Chunk(items=[])
-                    batches[request[0]].append(new_chunk)
+                    batches[key].append(new_chunk)
                     current_chunk = new_chunk
 
                 current_chunk.add_item(serialized_item)
 
         responses = []
-        for url, chunks in batches.items():
+        for (method, url), chunks in batches.items():
             for chunk in chunks:
-                response = self._emit_generic(url, payload=_Chunk.join(chunk))
+                response = self._emit_generic(
+                    url, payload=_Chunk.join(chunk), method=method
+                )
                 responses.append(response)
 
         if self._should_trace(async_flag, trace_flag, async_default=True):
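The grouping change is the standard composite-key pattern with defaultdict: requests only share a chunk if they share both the HTTP verb and the target URL. A stripped-down illustration with toy data (the URLs and payloads are placeholders, not real emitter output):

    from collections import defaultdict
    from typing import Dict, List, Tuple

    # Toy (method, url, payload) triples standing in for OpenApiRequest objects.
    requests_seen = [
        ("post", "/openapi/v3/entity/dataset?async=true", '{"urn": "a"}'),
        ("post", "/openapi/v3/entity/dataset?async=true", '{"urn": "b"}'),
        ("delete", "/openapi/v3/entity/dataset/urn%3Aa", ""),
    ]

    batches: Dict[Tuple[str, str], List[str]] = defaultdict(list)
    for method, url, payload in requests_seen:
        # DELETEs and POSTs to the same entity endpoint land in separate batches,
        # so each batch can be sent with a single HTTP verb.
        batches[(method, url)].append(payload)

    assert len(batches) == 2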
@@ -618,11 +607,13 @@ class DataHubRestEmitter(Closeable, Emitter):
         payload = json.dumps(snapshot)
         self._emit_generic(url, payload)
 
-    def _emit_generic(self, url: str, payload: Union[str, Any]) -> requests.Response:
+    def _emit_generic(
+        self, url: str, payload: Union[str, Any], method: str = "POST"
+    ) -> requests.Response:
         if not isinstance(payload, str):
             payload = json.dumps(payload)
 
-        curl_command = make_curl_command(self._session, "POST", url, payload)
+        curl_command = make_curl_command(self._session, method, url, payload)
         payload_size = len(payload)
         if payload_size > INGEST_MAX_PAYLOAD_BYTES:
             # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
@@ -635,7 +626,8 @@ class DataHubRestEmitter(Closeable, Emitter):
                 curl_command,
             )
         try:
-            response = self._session.post(url, data=payload)
+            method_func = getattr(self._session, method.lower())
+            response = method_func(url, data=payload) if payload else method_func(url)
             response.raise_for_status()
             return response
         except HTTPError as e:
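The generic send now picks the requests.Session verb at runtime; since Session exposes post, patch, and delete as bound methods, a getattr lookup is sufficient. A tiny standalone sketch of the lookup (no request is actually sent):

    import requests

    session = requests.Session()
    for verb in ("post", "patch", "delete"):
        # Session.post / Session.patch / Session.delete all exist, so
        # getattr(session, method.lower()) resolves to a bound method.
        method_func = getattr(session, verb)
        assert callable(method_func)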
datahub/ingestion/api/source.py
@@ -420,12 +420,9 @@ class Source(Closeable, metaclass=ABCMeta):
         Run in order, first in list is applied first. Be careful with order when overriding.
         """
         browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
-        if (
-            self.ctx.pipeline_config
-            and self.ctx.pipeline_config.flags.generate_browse_path_v2
-        ):
+        if self.ctx.flags.generate_browse_path_v2:
             browse_path_processor = self._get_browse_path_processor(
-                self.ctx.pipeline_config.flags.generate_browse_path_v2_dry_run
+                self.ctx.flags.generate_browse_path_v2_dry_run
             )
 
         auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
datahub/ingestion/glossary/classification_mixin.py
@@ -319,8 +319,10 @@ def classification_workunit_processor(
         partial(
             data_reader.get_sample_data_for_table,
             table_id,
-            classification_handler.config.classification.sample_size
-            * SAMPLE_SIZE_MULTIPLIER,
+            int(
+                classification_handler.config.classification.sample_size
+                * SAMPLE_SIZE_MULTIPLIER
+            ),
             **(data_reader_kwargs or {}),
         )
         if data_reader
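The int(...) wrapper matters because the oversampling multiplier can be fractional, in which case the product is a float that row-fetching APIs typically reject as a limit. A toy illustration (the 1.2 multiplier here is an assumed value for the sketch, not necessarily the package's constant):

    SAMPLE_SIZE_MULTIPLIER = 1.2  # assumed value, for illustration only
    sample_size = 1000

    raw = sample_size * SAMPLE_SIZE_MULTIPLIER
    assert isinstance(raw, float)       # 1200.0, not 1200
    assert isinstance(int(raw), int)    # safe to pass as a row limit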
datahub/ingestion/graph/client.py
@@ -158,7 +158,9 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             ca_certificate_path=self.config.ca_certificate_path,
             client_certificate_path=self.config.client_certificate_path,
             disable_ssl_verification=self.config.disable_ssl_verification,
-            openapi_ingestion=DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI,
+            openapi_ingestion=self.config.openapi_ingestion
+            if self.config.openapi_ingestion is not None
+            else (DEFAULT_REST_EMITTER_ENDPOINT == RestSinkEndpoint.OPENAPI),
             default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
         )
 
datahub/ingestion/graph/config.py
@@ -17,3 +17,4 @@ class DatahubClientConfig(ConfigModel):
     ca_certificate_path: Optional[str] = None
     client_certificate_path: Optional[str] = None
     disable_ssl_verification: bool = False
+    openapi_ingestion: Optional[bool] = None
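With the new field, a client can opt in to or out of OpenAPI-based ingestion explicitly instead of inheriting the emitter-level default. A hedged sketch of how the option might be set (the server address is a placeholder):

    from datahub.ingestion.graph.config import DatahubClientConfig

    # openapi_ingestion=None (the default) preserves the emitter-level default;
    # True or False overrides it, per the DataHubGraph change above.
    config = DatahubClientConfig(
        server="http://localhost:8080",  # placeholder GMS address
        openapi_ingestion=True,
    )
    # The config would then be handed to DataHubGraph(config) as usual.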
datahub/ingestion/graph/filters.py
@@ -18,7 +18,7 @@ RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
 # This can be put directly into the orFilters parameter in GraphQL.
 RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]
 
-# Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
+# Mirrors our GraphQL enum: https://docs.datahub.com/docs/graphql/enums#filteroperator
 FilterOperator: TypeAlias = Literal[
     "CONTAIN",
     "EQUAL",
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
@@ -205,7 +205,7 @@ class FeatureGroupProcessor:
                 textwrap.dedent(
                     f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
                     To view full table metadata, run Glue ingestion
-                    (see https://datahubproject.io/docs/generated/ingestion/sources/glue)"""
+                    (see https://docs.datahub.com/docs/generated/ingestion/sources/glue)"""
                 )
             )
 
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -270,29 +270,30 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
             ):
                 return
 
-            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
-                with BigQueryQueriesExtractor(
-                    connection=self.config.get_bigquery_client(),
-                    schema_api=self.bq_schema_extractor.schema_api,
-                    config=BigQueryQueriesExtractorConfig(
-                        window=self.config,
-                        user_email_pattern=self.config.usage.user_email_pattern,
-                        include_lineage=self.config.include_table_lineage,
-                        include_usage_statistics=self.config.include_usage_statistics,
-                        include_operations=self.config.usage.include_operational_stats,
-                        include_queries=self.config.include_queries,
-                        include_query_usage_statistics=self.config.include_query_usage_statistics,
-                        top_n_queries=self.config.usage.top_n_queries,
-                        region_qualifiers=self.config.region_qualifiers,
-                    ),
-                    structured_report=self.report,
-                    filters=self.filters,
-                    identifiers=self.identifiers,
-                    schema_resolver=self.sql_parser_schema_resolver,
-                    discovered_tables=self.bq_schema_extractor.table_refs,
-                ) as queries_extractor:
-                    self.report.queries_extractor = queries_extractor.report
-                    yield from queries_extractor.get_workunits_internal()
+            with self.report.new_stage(
+                f"*: {QUERIES_EXTRACTION}"
+            ), BigQueryQueriesExtractor(
+                connection=self.config.get_bigquery_client(),
+                schema_api=self.bq_schema_extractor.schema_api,
+                config=BigQueryQueriesExtractorConfig(
+                    window=self.config,
+                    user_email_pattern=self.config.usage.user_email_pattern,
+                    include_lineage=self.config.include_table_lineage,
+                    include_usage_statistics=self.config.include_usage_statistics,
+                    include_operations=self.config.usage.include_operational_stats,
+                    include_queries=self.config.include_queries,
+                    include_query_usage_statistics=self.config.include_query_usage_statistics,
+                    top_n_queries=self.config.usage.top_n_queries,
+                    region_qualifiers=self.config.region_qualifiers,
+                ),
+                structured_report=self.report,
+                filters=self.filters,
+                identifiers=self.identifiers,
+                schema_resolver=self.sql_parser_schema_resolver,
+                discovered_tables=self.bq_schema_extractor.table_refs,
+            ) as queries_extractor:
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
datahub/ingestion/source/cassandra/cassandra_profiling.py
@@ -70,30 +70,31 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
            tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
-                with ThreadPoolExecutor(
-                    max_workers=self.config.profiling.max_workers
-                ) as executor:
-                    future_to_dataset = {
-                        executor.submit(
-                            self.generate_profile,
-                            keyspace_name,
-                            table_name,
-                            cassandra_data.columns.get(table_name, []),
-                        ): table_name
-                        for table_name in tables
-                    }
-                    for future in as_completed(future_to_dataset):
-                        table_name = future_to_dataset[future]
-                        try:
-                            yield from future.result()
-                        except Exception as exc:
-                            self.report.profiling_skipped_other[table_name] += 1
-                            self.report.failure(
-                                message="Failed to profile for table",
-                                context=f"{keyspace_name}.{table_name}",
-                                exc=exc,
-                            )
+            with self.report.new_stage(
+                f"{keyspace_name}: {PROFILING}"
+            ), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(
+                        self.generate_profile,
+                        keyspace_name,
+                        table_name,
+                        cassandra_data.columns.get(table_name, []),
+                    ): table_name
+                    for table_name in tables
+                }
+                for future in as_completed(future_to_dataset):
+                    table_name = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[table_name] += 1
+                        self.report.failure(
+                            message="Failed to profile for table",
+                            context=f"{keyspace_name}.{table_name}",
+                            exc=exc,
+                        )
 
     def generate_profile(
         self,
datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -195,17 +195,18 @@ class DataHubDatabaseReader:
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                logger.debug("Polling soft-deleted urns from database")
-                cursor.execute(self.soft_deleted_urns_query)
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
+        with self.engine.connect() as conn, contextlib.closing(
+            conn.connection.cursor()
+        ) as cursor:
+            logger.debug("Polling soft-deleted urns from database")
+            cursor.execute(self.soft_deleted_urns_query)
+            columns = [desc[0] for desc in cursor.description]
+            while True:
+                rows = cursor.fetchmany(self.config.database_query_batch_size)
+                if not rows:
+                    return
+                for row in rows:
+                    yield dict(zip(columns, row))
 
     def _parse_row(
         self, row: Dict[str, Any]
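This hunk, like the BigQuery and Cassandra hunks above, is a pure restructuring: two nested with blocks collapse into one statement with comma-separated context managers, which enter left to right and exit in reverse. A generic Python sketch of the equivalence, unrelated to DataHub itself:

    from contextlib import contextmanager


    @contextmanager
    def tag(name):
        print(f"enter {name}")
        try:
            yield name
        finally:
            print(f"exit {name}")


    # Nested form ...
    with tag("outer"):
        with tag("inner"):
            pass

    # ... and the combined form produce the same enter/exit sequence.
    with tag("outer"), tag("inner"):
        pass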
datahub/ingestion/source/dbt/dbt_cloud.py
@@ -10,14 +10,12 @@ from pydantic import Field, root_validator
 
 from datahub.ingestion.api.decorators import (
     SupportStatus,
-    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     CapabilityReport,
-    SourceCapability,
     TestableSource,
     TestConnectionReport,
 )
@@ -262,16 +260,14 @@ query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{
 
 @platform_name("dbt")
 @config_class(DBTCloudConfig)
-@support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
 class DBTCloudSource(DBTSourceBase, TestableSource):
     config: DBTCloudConfig
 
     @classmethod
     def create(cls, config_dict, ctx):
         config = DBTCloudConfig.parse_obj(config_dict)
-        return cls(config, ctx, "dbt")
+        return cls(config, ctx)
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/dbt/dbt_common.py
@@ -125,6 +125,7 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
 @dataclass
 class DBTSourceReport(StaleEntityRemovalSourceReport):
     sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
+    sql_parser_skipped_non_sql_model: LossyList[str] = field(default_factory=LossyList)
     sql_parser_parse_failures: int = 0
     sql_parser_detach_ctes_failures: int = 0
     sql_parser_table_errors: int = 0
@@ -829,11 +830,13 @@ def get_column_type(
     "Enabled by default, configure using `include_column_lineage`",
 )
 class DBTSourceBase(StatefulIngestionSourceBase):
-    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str):
+    def __init__(self, config: DBTCommonConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
+        self.platform: str = "dbt"
+
         self.config = config
-        self.platform: str = platform
         self.report: DBTSourceReport = DBTSourceReport()
+
         self.compiled_owner_extraction_pattern: Optional[Any] = None
         if self.config.owner_extraction_pattern:
             self.compiled_owner_extraction_pattern = re.compile(
@@ -1177,6 +1180,11 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             logger.debug(
                 f"Not generating CLL for {node.dbt_name} because we don't need it."
             )
+        elif node.language != "sql":
+            logger.debug(
+                f"Not generating CLL for {node.dbt_name} because it is not a SQL model."
+            )
+            self.report.sql_parser_skipped_non_sql_model.append(node.dbt_name)
         elif node.compiled_code:
             # Add CTE stops based on the upstreams list.
             cte_mapping = {
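The new branch means column-level lineage parsing is skipped for non-SQL models (for example dbt Python models) and the skip is surfaced in the source report. A toy sketch of the bookkeeping, using a stand-in report object rather than the real DBTSourceReport:

    from dataclasses import dataclass, field
    from typing import List


    @dataclass
    class _ToyReport:
        # Stand-in for DBTSourceReport.sql_parser_skipped_non_sql_model (a LossyList upstream).
        sql_parser_skipped_non_sql_model: List[str] = field(default_factory=list)


    report = _ToyReport()
    node_language = "python"  # e.g. a dbt Python model
    node_name = "model.my_project.my_python_model"

    if node_language != "sql":
        report.sql_parser_skipped_non_sql_model.append(node_name)

    assert report.sql_parser_skipped_non_sql_model == ["model.my_project.my_python_model"]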