acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (107)
  1. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
  2. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
  3. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
  59. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  62. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  63. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  64. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  65. datahub/ingestion/source/sql/mssql/source.py +8 -4
  66. datahub/ingestion/source/sql/oracle.py +51 -4
  67. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  68. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  69. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  70. datahub/ingestion/source/superset.py +291 -35
  71. datahub/ingestion/source/usage/usage_common.py +0 -65
  72. datahub/ingestion/source/vertexai/__init__.py +0 -0
  73. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  74. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  75. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  76. datahub/metadata/_schema_classes.py +472 -1
  77. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  80. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  81. datahub/metadata/schema.avsc +313 -2
  82. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  83. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  84. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  85. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  86. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  87. datahub/metadata/schemas/Deprecation.avsc +2 -0
  88. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  89. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  90. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  91. datahub/metadata/schemas/Siblings.avsc +2 -0
  92. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  93. datahub/sdk/__init__.py +1 -0
  94. datahub/sdk/dataset.py +122 -0
  95. datahub/sdk/entity.py +99 -3
  96. datahub/sdk/entity_client.py +27 -3
  97. datahub/sdk/main_client.py +24 -1
  98. datahub/sdk/search_client.py +81 -8
  99. datahub/sdk/search_filters.py +94 -37
  100. datahub/sql_parsing/split_statements.py +17 -3
  101. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  102. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  103. datahub/testing/mcp_diff.py +1 -18
  104. datahub/utilities/threaded_iterator_executor.py +16 -3
  105. datahub/ingestion/source/vertexai.py +0 -697
  106. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  107. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/sdk/search_client.py

@@ -2,36 +2,106 @@ from __future__ import annotations
 
 from typing import (
     TYPE_CHECKING,
-    Dict,
     Iterable,
     List,
     Optional,
+    Tuple,
+    Type,
+    TypeVar,
 )
 
-from datahub.ingestion.graph.filters import RawSearchFilterRule
+from datahub.ingestion.graph.filters import RawSearchFilter, RemovedStatusFilter
 from datahub.metadata.urns import Urn
-from datahub.sdk.search_filters import Filter
+from datahub.sdk.search_filters import (
+    Filter,
+    FilterDsl,
+    _EntityTypeFilter,
+    _OrFilters,
+    _StatusFilter,
+)
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
 
 
+_FilterType = TypeVar("_FilterType", bound=Filter)
+
+
+def _typed_dfs(
+    filter: Optional[_FilterType], type: Type[_FilterType]
+) -> Optional[List[_FilterType]]:
+    if filter is None:
+        return None
+
+    found: Optional[List[_FilterType]] = None
+    for f in filter.dfs():
+        if isinstance(f, type):
+            if found is None:
+                found = []
+            found.append(f)
+    return found
+
+
 def compile_filters(
     filter: Optional[Filter],
-) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+) -> Tuple[Optional[List[str]], RawSearchFilter]:
     # TODO: Not every filter type is supported for every entity type.
     # If we can detect issues with the filters at compile time, we should
     # raise an error.
 
-    if filter is None:
-        return None
+    existing_soft_deleted_filter = _typed_dfs(filter, _StatusFilter)
+    if existing_soft_deleted_filter is None:
+        soft_deleted_filter = FilterDsl.soft_deleted(
+            RemovedStatusFilter.NOT_SOFT_DELETED
+        )
+        if filter is None:
+            filter = soft_deleted_filter
+        else:
+            filter = FilterDsl.and_(filter, soft_deleted_filter)
+
+    # This should be safe - if filter were None coming in, then we would replace it
+    # with the soft-deleted filter.
+    assert filter is not None
 
     initial_filters = filter.compile()
-    return [
+
+    compiled_filters: RawSearchFilter = [
         {"and": [rule.to_raw() for rule in andClause["and"]]}
         for andClause in initial_filters
     ]
 
+    entity_types = compute_entity_types(initial_filters)
+
+    return entity_types, compiled_filters
+
+
+def compute_entity_types(
+    filters: _OrFilters,
+) -> Optional[List[str]]:
+    found_filters = False
+    found_positive_filters = False
+    entity_types: List[str] = []
+    for ands in filters:
+        for clause in ands["and"]:
+            if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:
+                found_filters = True
+                if not clause.negated:
+                    found_positive_filters = True
+
+                entity_types.extend(clause.values)
+
+    if not found_filters:
+        # If we didn't find any filters, use None so we use the default set.
+        return None
+
+    if not found_positive_filters:
+        # If we only found negated filters, then it's probably a query like
+        # "find me all entities except for dashboards". In that case, we
+        # still want to use the default set.
+        return None
+
+    return entity_types
+
 
 class SearchClient:
     def __init__(self, client: DataHubClient):

@@ -43,8 +113,11 @@ class SearchClient:
         filter: Optional[Filter] = None,
     ) -> Iterable[Urn]:
         # TODO: Add better limit / pagination support.
+        types, compiled_filters = compile_filters(filter)
         for urn in self._client._graph.get_urns_by_filter(
             query=query,
-            extra_or_filters=compile_filters(filter),
+            status=None,
+            extra_or_filters=compiled_filters,
+            entity_types=types,
         ):
             yield Urn.from_string(urn)
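For orientation, here is a minimal sketch of how the updated compile_filters might be called. The filter composition below is hypothetical; entity type names are converted to their GraphQL enum form, and a NOT_SOFT_DELETED status rule is appended when no status filter is present.

    from datahub.sdk.search_client import compile_filters
    from datahub.sdk.search_filters import FilterDsl as F

    # Hypothetical filter: snowflake datasets only.
    flt = F.and_(F.entity_type("dataset"), F.platform("snowflake"))
    entity_types, raw_filters = compile_filters(flt)
    # entity_types: e.g. ["DATASET"]; raw_filters: a list of {"and": [...]} rule dicts,
    # including the implicit not-soft-deleted rule.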
datahub/sdk/search_filters.py

@@ -3,7 +3,10 @@ from __future__ import annotations
 import abc
 from typing import (
     Any,
+    ClassVar,
+    Iterator,
     List,
+    Optional,
     Sequence,
     TypedDict,
     Union,
@@ -13,8 +16,13 @@ import pydantic
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
-from datahub.ingestion.graph.client import entity_type_to_graphql
-from datahub.ingestion.graph.filters import SearchFilterRule
+from datahub.ingestion.graph.client import flexible_entity_type_to_graphql
+from datahub.ingestion.graph.filters import (
+    FilterOperator,
+    RemovedStatusFilter,
+    SearchFilterRule,
+    _get_status_filter,
+)
 from datahub.metadata.schema_classes import EntityTypeName
 from datahub.metadata.urns import DataPlatformUrn, DomainUrn
 
@@ -37,25 +45,28 @@ class _BaseFilter(ConfigModel):
     def compile(self) -> _OrFilters:
         pass
 
-
-def _flexible_entity_type_to_graphql(entity_type: str) -> str:
-    if entity_type.upper() == entity_type:
-        # Assume that we were passed a graphql EntityType enum value,
-        # so no conversion is needed.
-        return entity_type
-    return entity_type_to_graphql(entity_type)
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
 
 
 class _EntityTypeFilter(_BaseFilter):
+    """Filter for specific entity types.
+
+    If no entity type filter is specified, we will search all entity types in the
+    default search set, mirroring the behavior of the DataHub UI.
+    """
+
+    ENTITY_TYPE_FIELD: ClassVar[str] = "_entityType"
+
     entity_type: List[str] = pydantic.Field(
         description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
     )
 
     def _build_rule(self) -> SearchFilterRule:
         return SearchFilterRule(
-            field="_entityType",
+            field=self.ENTITY_TYPE_FIELD,
             condition="EQUAL",
-            values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
+            values=[flexible_entity_type_to_graphql(t) for t in self.entity_type],
         )
 
     def compile(self) -> _OrFilters:
@@ -63,25 +74,39 @@ class _EntityTypeFilter(_BaseFilter):
 
 
 class _EntitySubtypeFilter(_BaseFilter):
-    entity_type: str
     entity_subtype: str = pydantic.Field(
         description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
     )
 
+    def _build_rule(self) -> SearchFilterRule:
+        return SearchFilterRule(
+            field="typeNames",
+            condition="EQUAL",
+            values=[self.entity_subtype],
+        )
+
     def compile(self) -> _OrFilters:
-        rules = [
-            SearchFilterRule(
-                field="_entityType",
-                condition="EQUAL",
-                values=[_flexible_entity_type_to_graphql(self.entity_type)],
-            ),
-            SearchFilterRule(
-                field="typeNames",
-                condition="EQUAL",
-                values=[self.entity_subtype],
-            ),
-        ]
-        return [{"and": rules}]
+        return [{"and": [self._build_rule()]}]
+
+
+class _StatusFilter(_BaseFilter):
+    """Filter for the status of entities during search.
+
+    If not explicitly specified, the NOT_SOFT_DELETED status filter will be applied.
+    """
+
+    status: RemovedStatusFilter
+
+    def _build_rule(self) -> Optional[SearchFilterRule]:
+        return _get_status_filter(self.status)
+
+    def compile(self) -> _OrFilters:
+        rule = self._build_rule()
+        if rule:
+            return [{"and": [rule]}]
+        else:
+            # Our boolean algebra logic requires something here - returning [] would cause errors.
+            return FilterDsl.true().compile()
 
 
 class _PlatformFilter(_BaseFilter):
@@ -157,10 +182,10 @@ class _EnvFilter(_BaseFilter):
 
 
 class _CustomCondition(_BaseFilter):
-    """Represents a single field condition"""
+    """Represents a single field condition."""
 
     field: str
-    condition: str
+    condition: FilterOperator
     values: List[str]
 
     def compile(self) -> _OrFilters:
@@ -173,7 +198,7 @@ class _CustomCondition(_BaseFilter):
 
 
 class _And(_BaseFilter):
-    """Represents an AND conjunction of filters"""
+    """Represents an AND conjunction of filters."""
 
     and_: Sequence["Filter"] = pydantic.Field(alias="and")
     # TODO: Add validator to ensure that the "and" field is not empty
@@ -219,9 +244,14 @@ class _And(_BaseFilter):
             ]
         }
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.and_:
+            yield from filter.dfs()
+
 
 class _Or(_BaseFilter):
-    """Represents an OR conjunction of filters"""
+    """Represents an OR conjunction of filters."""
 
     or_: Sequence["Filter"] = pydantic.Field(alias="or")
     # TODO: Add validator to ensure that the "or" field is not empty
@@ -232,9 +262,14 @@ class _Or(_BaseFilter):
             merged_filter.extend(filter.compile())
         return merged_filter
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.or_:
+            yield from filter.dfs()
+
 
 class _Not(_BaseFilter):
-    """Represents a NOT filter"""
+    """Represents a NOT filter."""
 
     not_: "Filter" = pydantic.Field(alias="not")
 
@@ -262,6 +297,10 @@ class _Not(_BaseFilter):
 
         return final_filters
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        yield from self.not_.dfs()
+
 
 # TODO: With pydantic 2, we can use a RootModel with a
 # discriminated union to make the error messages more informative.
@@ -271,6 +310,7 @@ Filter = Union[
     _Not,
     _EntityTypeFilter,
     _EntitySubtypeFilter,
+    _StatusFilter,
     _PlatformFilter,
     _DomainFilter,
     _EnvFilter,
@@ -318,6 +358,18 @@ class FilterDsl:
     def not_(arg: "Filter") -> _Not:
         return _Not(not_=arg)
 
+    @staticmethod
+    def true() -> "Filter":
+        return _CustomCondition(
+            field="urn",
+            condition="EXISTS",
+            values=[],
+        )
+
+    @staticmethod
+    def false() -> "Filter":
+        return FilterDsl.not_(FilterDsl.true())
+
     @staticmethod
     def entity_type(
         entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
@@ -329,14 +381,15 @@ class FilterDsl:
         )
 
     @staticmethod
-    def entity_subtype(entity_type: str, subtype: str) -> _EntitySubtypeFilter:
+    def entity_subtype(
+        entity_subtype: Union[str, Sequence[str]],
+    ) -> _EntitySubtypeFilter:
         return _EntitySubtypeFilter(
-            entity_type=entity_type,
-            entity_subtype=subtype,
+            entity_subtype=entity_subtype,
        )
 
     @staticmethod
-    def platform(platform: Union[str, List[str]], /) -> _PlatformFilter:
+    def platform(platform: Union[str, Sequence[str]], /) -> _PlatformFilter:
         return _PlatformFilter(
             platform=[platform] if isinstance(platform, str) else platform
         )
@@ -344,11 +397,11 @@ class FilterDsl:
 
     # TODO: Add a platform_instance filter
 
     @staticmethod
-    def domain(domain: Union[str, List[str]], /) -> _DomainFilter:
+    def domain(domain: Union[str, Sequence[str]], /) -> _DomainFilter:
         return _DomainFilter(domain=[domain] if isinstance(domain, str) else domain)
 
     @staticmethod
-    def env(env: Union[str, List[str]], /) -> _EnvFilter:
+    def env(env: Union[str, Sequence[str]], /) -> _EnvFilter:
         return _EnvFilter(env=[env] if isinstance(env, str) else env)
 
     @staticmethod
@@ -359,13 +412,17 @@ class FilterDsl:
             values=[f"{key}={value}"],
         )
 
+    @staticmethod
+    def soft_deleted(status: RemovedStatusFilter) -> _StatusFilter:
+        return _StatusFilter(status=status)
+
     # TODO: Add a soft-deletion status filter
     # TODO: add a container / browse path filter
     # TODO add shortcut for custom filters
 
     @staticmethod
     def custom_filter(
-        field: str, condition: str, values: List[str]
+        field: str, condition: FilterOperator, values: Sequence[str]
     ) -> _CustomCondition:
         return _CustomCondition(
             field=field,
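A hedged usage sketch of the revised DSL follows: entity_subtype no longer takes an entity_type argument, custom_filter conditions are typed as FilterOperator, and soft-deletion status can now be set explicitly. The filter values below are illustrative.

    from datahub.ingestion.graph.filters import RemovedStatusFilter
    from datahub.sdk.search_filters import FilterDsl as F

    flt = F.and_(
        F.entity_type("dataset"),
        F.entity_subtype("Table"),
        F.soft_deleted(RemovedStatusFilter.ALL),  # include soft-deleted entities
    )
    or_clauses = flt.compile()  # list of {"and": [SearchFilterRule, ...]} clauses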
datahub/sql_parsing/split_statements.py

@@ -1,7 +1,9 @@
+import logging
 import re
 from enum import Enum
 from typing import Iterator, List, Tuple
 
+logger = logging.getLogger(__name__)
 SELECT_KEYWORD = "SELECT"
 CASE_KEYWORD = "CASE"
 END_KEYWORD = "END"
@@ -120,7 +122,9 @@ class _StatementSplitter:
         # Reset current_statement-specific state.
         self.does_select_mean_new_statement = False
         if self.current_case_statements != 0:
-            breakpoint()
+            logger.warning(
+                f"Unexpected END keyword. Current case statements: {self.current_case_statements}"
+            )
         self.current_case_statements = 0
 
     def process(self) -> Iterator[str]:
@@ -233,8 +237,10 @@ class _StatementSplitter:
                ),
            )
            if (
-                is_force_new_statement_keyword and most_recent_real_char != ")"
-            ):  # usually we'd have a close paren that closes a CTE
+                is_force_new_statement_keyword
+                and not self._has_preceding_cte(most_recent_real_char)
+                and not self._is_part_of_merge_query()
+            ):
                # Force termination of current statement
                yield from self._yield_if_complete()
 
@@ -247,6 +253,14 @@ class _StatementSplitter:
            else:
                self.current_statement.append(c)
 
+    def _has_preceding_cte(self, most_recent_real_char: str) -> bool:
+        # usually we'd have a close paren that closes a CTE
+        return most_recent_real_char == ")"
+
+    def _is_part_of_merge_query(self) -> bool:
+        # In merge statement we'd have `when matched then` or `when not matched then`
+        return "".join(self.current_statement).strip().lower().endswith("then")
+
 
 def split_statements(sql: str) -> Iterator[str]:
     """
datahub/sql_parsing/sql_parsing_aggregator.py

@@ -30,6 +30,7 @@ from datahub.metadata.urns import (
     DatasetUrn,
     QueryUrn,
     SchemaFieldUrn,
+    Urn,
 )
 from datahub.sql_parsing.schema_resolver import (
     SchemaResolver,
@@ -139,6 +140,8 @@ class QueryMetadata:
 
     used_temp_tables: bool = True
 
+    origin: Optional[Urn] = None
+
     def make_created_audit_stamp(self) -> models.AuditStampClass:
         return models.AuditStampClass(
             time=make_ts_millis(self.latest_timestamp) or 0,
@@ -221,6 +224,7 @@ class PreparsedQuery:
     )
     # Use this to store addtitional key-value information about query for debugging
     extra_info: Optional[dict] = None
+    origin: Optional[Urn] = None
 
 
 @dataclasses.dataclass
@@ -903,6 +907,7 @@ class SqlParsingAggregator(Closeable):
                 column_usage=parsed.column_usage or {},
                 confidence_score=parsed.confidence_score,
                 used_temp_tables=session_has_temp_tables,
+                origin=parsed.origin,
             )
         )
 
@@ -1464,6 +1469,7 @@ class SqlParsingAggregator(Closeable):
                 source=models.QuerySourceClass.SYSTEM,
                 created=query.make_created_audit_stamp(),
                 lastModified=query.make_last_modified_audit_stamp(),
+                origin=query.origin.urn() if query.origin else None,
             ),
             models.QuerySubjectsClass(
                 subjects=[
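The new origin field threads a platform URN from the query log entry through to the emitted QueryProperties aspect. A small illustration of the value involved (the platform chosen here is arbitrary):

    from datahub.metadata.urns import DataPlatformUrn

    origin = DataPlatformUrn("hex")  # e.g. what ToolMetaExtractor assigns for Hex-issued queries
    print(origin.urn())              # "urn:li:dataPlatform:hex", the string stored in QueryProperties.origin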
datahub/sql_parsing/tool_meta_extractor.py

@@ -13,7 +13,7 @@ from datahub.api.entities.platformresource.platform_resource import (
 )
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn, DataPlatformUrn, Urn
 from datahub.utilities.search_utils import LogicalOperator
 from datahub.utilities.stats_collections import int_top_k_dict
 
@@ -21,6 +21,10 @@ UrnStr = str
 
 logger = logging.getLogger(__name__)
 
+MODE_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:mode")
+LOOKER_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:looker")
+HEX_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:hex")
+
 
 class QueryLog(Protocol):
     """Represents Query Log Entry
@@ -30,6 +34,7 @@ class QueryLog(Protocol):
     query_text: str
     user: Optional[Union[CorpUserUrn, CorpGroupUrn]]
     extra_info: Optional[dict]
+    origin: Optional[Urn]
 
 
 def _get_last_line(query: str) -> str:
@@ -67,6 +72,10 @@ class ToolMetaExtractor:
                 "looker",
                 self._extract_looker_query,
             ),
+            (
+                "hex",
+                self._extract_hex_query,
+            ),
         ]
         # maps user id (as string) to email address
         self.looker_user_mapping = looker_user_mapping
@@ -153,7 +162,7 @@ class ToolMetaExtractor:
         entry.extra_info = entry.extra_info or {}
         entry.extra_info["user_via"] = original_user
 
-        # TODO: Generate an "origin" urn.
+        entry.origin = MODE_PLATFORM_URN
 
         return True
 
@@ -190,6 +199,22 @@ class ToolMetaExtractor:
         entry.extra_info = entry.extra_info or {}
         entry.extra_info["user_via"] = original_user
 
+        entry.origin = LOOKER_PLATFORM_URN
+
+        return True
+
+    def _extract_hex_query(self, entry: QueryLog) -> bool:
+        """
+        Returns:
+            bool: whether QueryLog entry is that of hex.
+        """
+        last_line = _get_last_line(entry.query_text)
+
+        if not last_line.startswith("-- Hex query metadata:"):
+            return False
+
+        entry.origin = HEX_PLATFORM_URN
+
         return True
 
     def extract_bi_metadata(self, entry: QueryLog) -> bool:
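Hex queries are recognized purely from a trailing comment on the query text; a minimal check against the helper above (the JSON payload shape is hypothetical):

    from datahub.sql_parsing.tool_meta_extractor import _get_last_line

    sql = 'SELECT * FROM analytics.orders\n-- Hex query metadata: {"project_id": "abc123"}'
    assert _get_last_line(sql).startswith("-- Hex query metadata:")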
datahub/testing/mcp_diff.py

@@ -8,7 +8,6 @@ import deepdiff.serialization
 import yaml
 from deepdiff import DeepDiff
 from deepdiff.model import DiffLevel
-from deepdiff.operator import BaseOperator
 from typing_extensions import Literal
 
 ReportType = Literal[
@@ -59,27 +58,12 @@ class AspectForDiff:
 
 @dataclasses.dataclass
 class DeltaInfo:
-    """Information about an MCP used to construct a diff delta.
-
-    In a separate class so it can be ignored by DeepDiff via MCPDeltaInfoOperator.
-    """
+    """Information about an MCP used to construct a diff delta."""
 
     idx: int  # Location in list of MCEs in golden file
     original: Dict[str, Any]  # Original json-serialized MCP
 
 
-class DeltaInfoOperator(BaseOperator):
-    """Warning: Doesn't seem to be working right now.
-    Ignored via an ignore path as an extra layer of defense.
-    """
-
-    def __init__(self):
-        super().__init__(types=[DeltaInfo])
-
-    def give_up_diffing(self, *args: Any, **kwargs: Any) -> bool:
-        return True
-
-
 AspectsByUrn = Dict[str, Dict[str, List[AspectForDiff]]]
 
 
@@ -176,7 +160,6 @@ class MCPDiff:
             t2=t2,
             exclude_regex_paths=ignore_paths,
             ignore_order=True,
-            custom_operators=[DeltaInfoOperator()],
         )
         if diff:
             aspect_changes[urn][aspect_name] = MCPAspectDiff.create(diff)
datahub/utilities/threaded_iterator_executor.py

@@ -1,7 +1,15 @@
 import concurrent.futures
 import contextlib
 import queue
-from typing import Any, Callable, Generator, Iterable, Tuple, TypeVar
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Optional,
+    Tuple,
+    TypeVar,
+)
 
 T = TypeVar("T")
 
@@ -18,8 +26,13 @@ class ThreadedIteratorExecutor:
         worker_func: Callable[..., Iterable[T]],
         args_list: Iterable[Tuple[Any, ...]],
         max_workers: int,
-    ) -> Generator[T, None, None]:
-        out_q: queue.Queue[T] = queue.Queue()
+        max_backpressure: Optional[int] = None,
+    ) -> Iterator[T]:
+        if max_backpressure is None:
+            max_backpressure = 10 * max_workers
+        assert max_backpressure >= max_workers
+
+        out_q: queue.Queue[T] = queue.Queue(maxsize=max_backpressure)
 
         def _worker_wrapper(
             worker_func: Callable[..., Iterable[T]], *args: Any
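A hedged usage sketch of the new backpressure knob (the worker below is a stand-in): when max_backpressure is omitted it defaults to 10 * max_workers, and the bounded queue keeps fast workers from buffering unbounded results ahead of a slow consumer.

    from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

    def fetch(prefix: str):
        # stand-in worker; real workers typically page through an API
        for i in range(3):
            yield f"{prefix}-{i}"

    results = list(
        ThreadedIteratorExecutor.process(
            fetch,
            [("a",), ("b",)],
            max_workers=2,
            max_backpressure=20,  # bounds the internal queue; defaults to 10 * max_workers
        )
    )
    print(results)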