acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -16,7 +16,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey
+from datahub.emitter.mcp_builder import ExperimentKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -77,10 +77,6 @@ from datahub.sdk.dataset import Dataset
 T = TypeVar("T")
 
 
-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
@@ -252,7 +248,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         self, experiment: Experiment
     ) -> Iterable[MetadataWorkUnit]:
         experiment_container = Container(
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                 platform=str(DataPlatformUrn(platform_name=self.platform)),
                 id=experiment.name,
             ),
@@ -470,7 +466,7 @@ class MLflowSource(StatefulIngestionSourceBase):
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
             platform=str(DataPlatformUrn(self.platform)), id=experiment.name
         )
 
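The MLflow hunks above (and the Vertex AI hunks further down) drop the source-local ContainerKeyWithId class in favor of a shared ExperimentKey imported from datahub.emitter.mcp_builder. A minimal sketch of what that shared key presumably looks like, inferred from the class it replaces; the actual definition ships in mcp_builder:

```python
from datahub.emitter.mcp_builder import ContainerKey


class ExperimentKey(ContainerKey):
    # Sketch only: mirrors the removed source-local ContainerKeyWithId.
    # The real class lives in datahub.emitter.mcp_builder.
    id: str
```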
@@ -94,7 +94,7 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.metadata.urns import ChartUrn
+from datahub.metadata.urns import ChartUrn, DatasetUrn
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn_iter import lowercase_dataset_urn
@@ -1083,6 +1083,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dataset_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1104,6 +1105,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            datasetEdges=dataset_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -1197,12 +1199,23 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
+        # collect all upstream datasets; using a set to retain unique urns
+        dataset_urns = {
+            dataset.entityUrn
+            for dataset in ds_mcps
+            if dataset.entityType == DatasetUrn.ENTITY_TYPE and dataset.entityUrn
+        }
+        dataset_edges = [
+            EdgeClass(destinationUrn=dataset_urn) for dataset_urn in dataset_urns
+        ]
+
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
             workspace=workspace,
             report=report,
             chart_mcps=chart_mcps,
             user_mcps=user_mcps,
+            dataset_edges=dataset_edges,
         )
 
         # Now add MCPs in sequence
@@ -128,9 +128,10 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
         if catalog_name is None:
             raise exc.NoSuchTableError("catalog is required in connection")
         connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-        if connector_name is None:
-            return {}
-        if connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS:
+        if (
+            connector_name is not None
+            and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+        ):
             properties_table = self._get_full_table(f"{table_name}$properties", schema)
             query = f"SELECT * FROM {properties_table}"
             row = connection.execute(sql.text(query)).fetchone()
@@ -45,7 +45,6 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
         description="Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'.",
         le=100.0,
         ge=0.0,
-        hidden_from_docs=True,
     )
 
 
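Removing hidden_from_docs surfaces fail_safe_threshold in the generated config docs. A sketch of setting it on a source's stateful ingestion config in dict form; the surrounding keys are standard stateful-ingestion settings and are an assumption here, not taken from this diff:

```python
source_config = {
    "stateful_ingestion": {
        "enabled": True,
        # Abort stale-entity removal if more than 40% of entities would be
        # soft-deleted relative to the previous checkpoint.
        "fail_safe_threshold": 40.0,
    },
}
```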
@@ -22,7 +22,11 @@ from google.oauth2 import service_account
 
 import datahub.emitter.mce_builder as builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey, ProjectIdKey, gen_containers
+from datahub.emitter.mcp_builder import (
+    ExperimentKey,
+    ProjectIdKey,
+    gen_containers,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -96,10 +100,6 @@ class ModelMetadata:
     endpoints: Optional[List[Endpoint]] = None
 
 
-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
 @support_status(SupportStatus.TESTING)
@@ -173,7 +173,7 @@ class VertexAISource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         yield from gen_containers(
             parent_container_key=self._get_project_container(),
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                 platform=self.platform,
                 id=self._make_vertexai_experiment_name(experiment.name),
             ),
@@ -309,7 +309,7 @@ class VertexAISource(Source):
     def _gen_experiment_run_mcps(
         self, experiment: Experiment, run: ExperimentRun
     ) -> Iterable[MetadataChangeProposalWrapper]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
             platform=self.platform,
             id=self._make_vertexai_experiment_name(experiment.name),
         )
@@ -2,36 +2,106 @@ from __future__ import annotations
 
 from typing import (
     TYPE_CHECKING,
-    Dict,
     Iterable,
     List,
     Optional,
+    Tuple,
+    Type,
+    TypeVar,
 )
 
-from datahub.ingestion.graph.filters import RawSearchFilterRule
+from datahub.ingestion.graph.filters import RawSearchFilter, RemovedStatusFilter
 from datahub.metadata.urns import Urn
-from datahub.sdk.search_filters import Filter
+from datahub.sdk.search_filters import (
+    Filter,
+    FilterDsl,
+    _EntityTypeFilter,
+    _OrFilters,
+    _StatusFilter,
+)
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
 
 
+_FilterType = TypeVar("_FilterType", bound=Filter)
+
+
+def _typed_dfs(
+    filter: Optional[_FilterType], type: Type[_FilterType]
+) -> Optional[List[_FilterType]]:
+    if filter is None:
+        return None
+
+    found: Optional[List[_FilterType]] = None
+    for f in filter.dfs():
+        if isinstance(f, type):
+            if found is None:
+                found = []
+            found.append(f)
+    return found
+
+
 def compile_filters(
     filter: Optional[Filter],
-) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+) -> Tuple[Optional[List[str]], RawSearchFilter]:
     # TODO: Not every filter type is supported for every entity type.
     # If we can detect issues with the filters at compile time, we should
     # raise an error.
 
-    if filter is None:
-        return None
+    existing_soft_deleted_filter = _typed_dfs(filter, _StatusFilter)
+    if existing_soft_deleted_filter is None:
+        soft_deleted_filter = FilterDsl.soft_deleted(
+            RemovedStatusFilter.NOT_SOFT_DELETED
+        )
+        if filter is None:
+            filter = soft_deleted_filter
+        else:
+            filter = FilterDsl.and_(filter, soft_deleted_filter)
+
+    # This should be safe - if filter were None coming in, then we would replace it
+    # with the soft-deleted filter.
+    assert filter is not None
 
     initial_filters = filter.compile()
-    return [
+
+    compiled_filters: RawSearchFilter = [
         {"and": [rule.to_raw() for rule in andClause["and"]]}
         for andClause in initial_filters
     ]
 
+    entity_types = compute_entity_types(initial_filters)
+
+    return entity_types, compiled_filters
+
+
+def compute_entity_types(
+    filters: _OrFilters,
+) -> Optional[List[str]]:
+    found_filters = False
+    found_positive_filters = False
+    entity_types: List[str] = []
+    for ands in filters:
+        for clause in ands["and"]:
+            if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:
+                found_filters = True
+                if not clause.negated:
+                    found_positive_filters = True
+
+                entity_types.extend(clause.values)
+
+    if not found_filters:
+        # If we didn't find any filters, use None so we use the default set.
+        return None
+
+    if not found_positive_filters:
+        # If we only found negated filters, then it's probably a query like
+        # "find me all entities except for dashboards". In that case, we
+        # still want to use the default set.
+        return None
+
+    return entity_types
+
 
 class SearchClient:
     def __init__(self, client: DataHubClient):
@@ -43,8 +113,11 @@ class SearchClient:
         filter: Optional[Filter] = None,
     ) -> Iterable[Urn]:
         # TODO: Add better limit / pagination support.
+        types, compiled_filters = compile_filters(filter)
         for urn in self._client._graph.get_urns_by_filter(
             query=query,
-            extra_or_filters=compile_filters(filter),
+            status=None,
+            extra_or_filters=compiled_filters,
+            entity_types=types,
         ):
             yield Urn.from_string(urn)
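With this change, soft-delete handling moves out of get_urns_by_filter (which is now passed status=None) and into compile_filters, which appends a NOT_SOFT_DELETED clause unless the caller supplied a _StatusFilter of their own. A rough sketch of the new calling convention, with the expected shapes noted as comments rather than verified output:

```python
from datahub.sdk.search_filters import FilterDsl

# compile_filters (defined above) now returns an (entity_types, raw_filters) pair.
entity_types, raw_filters = compile_filters(FilterDsl.entity_type("dataset"))

# entity_types is an optional hint for the search backend, e.g. ["DATASET"],
# or None when no positive entity-type clause is present.
# raw_filters is a list of {"and": [...]} clauses; because no explicit status
# filter was given, each AND clause also carries the NOT_SOFT_DELETED rule.
```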
@@ -3,7 +3,10 @@ from __future__ import annotations
 import abc
 from typing import (
     Any,
+    ClassVar,
+    Iterator,
     List,
+    Optional,
     Sequence,
     TypedDict,
     Union,
@@ -13,8 +16,13 @@ import pydantic
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
-from datahub.ingestion.graph.client import entity_type_to_graphql
-from datahub.ingestion.graph.filters import FilterOperator, SearchFilterRule
+from datahub.ingestion.graph.client import flexible_entity_type_to_graphql
+from datahub.ingestion.graph.filters import (
+    FilterOperator,
+    RemovedStatusFilter,
+    SearchFilterRule,
+    _get_status_filter,
+)
 from datahub.metadata.schema_classes import EntityTypeName
 from datahub.metadata.urns import DataPlatformUrn, DomainUrn
 
@@ -37,25 +45,28 @@ class _BaseFilter(ConfigModel):
     def compile(self) -> _OrFilters:
         pass
 
-
-def _flexible_entity_type_to_graphql(entity_type: str) -> str:
-    if entity_type.upper() == entity_type:
-        # Assume that we were passed a graphql EntityType enum value,
-        # so no conversion is needed.
-        return entity_type
-    return entity_type_to_graphql(entity_type)
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
 
 
 class _EntityTypeFilter(_BaseFilter):
+    """Filter for specific entity types.
+
+    If no entity type filter is specified, we will search all entity types in the
+    default search set, mirroring the behavior of the DataHub UI.
+    """
+
+    ENTITY_TYPE_FIELD: ClassVar[str] = "_entityType"
+
     entity_type: List[str] = pydantic.Field(
         description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
     )
 
     def _build_rule(self) -> SearchFilterRule:
         return SearchFilterRule(
-            field="_entityType",
+            field=self.ENTITY_TYPE_FIELD,
             condition="EQUAL",
-            values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
+            values=[flexible_entity_type_to_graphql(t) for t in self.entity_type],
         )
 
     def compile(self) -> _OrFilters:
@@ -78,6 +89,26 @@ class _EntitySubtypeFilter(_BaseFilter):
         return [{"and": [self._build_rule()]}]
 
 
+class _StatusFilter(_BaseFilter):
+    """Filter for the status of entities during search.
+
+    If not explicitly specified, the NOT_SOFT_DELETED status filter will be applied.
+    """
+
+    status: RemovedStatusFilter
+
+    def _build_rule(self) -> Optional[SearchFilterRule]:
+        return _get_status_filter(self.status)
+
+    def compile(self) -> _OrFilters:
+        rule = self._build_rule()
+        if rule:
+            return [{"and": [rule]}]
+        else:
+            # Our boolean algebra logic requires something here - returning [] would cause errors.
+            return FilterDsl.true().compile()
+
+
 class _PlatformFilter(_BaseFilter):
     platform: List[str]
     # TODO: Add validator to convert string -> list of strings
@@ -213,6 +244,11 @@ class _And(_BaseFilter):
             ]
         }
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.and_:
+            yield from filter.dfs()
+
 
 class _Or(_BaseFilter):
     """Represents an OR conjunction of filters."""
@@ -226,6 +262,11 @@ class _Or(_BaseFilter):
             merged_filter.extend(filter.compile())
         return merged_filter
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.or_:
+            yield from filter.dfs()
+
 
 class _Not(_BaseFilter):
     """Represents a NOT filter."""
@@ -256,6 +297,10 @@ class _Not(_BaseFilter):
 
         return final_filters
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        yield from self.not_.dfs()
+
 
 # TODO: With pydantic 2, we can use a RootModel with a
 # discriminated union to make the error messages more informative.
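The dfs methods added to _And, _Or, and _Not let compile_filters walk an arbitrary filter tree; that is how it detects an existing _StatusFilter. A small sketch of the traversal; FilterDsl.or_ and FilterDsl.platform are assumed to exist as constructors for the _Or and _PlatformFilter classes shown in this file:

```python
from datahub.sdk.search_filters import FilterDsl

flt = FilterDsl.or_(
    FilterDsl.entity_type("dataset"),
    FilterDsl.platform("snowflake"),
)

# Pre-order traversal: the _Or node first, then each leaf filter it wraps.
for node in flt.dfs():
    print(type(node).__name__)
```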
@@ -265,6 +310,7 @@ Filter = Union[
     _Not,
     _EntityTypeFilter,
     _EntitySubtypeFilter,
+    _StatusFilter,
     _PlatformFilter,
     _DomainFilter,
     _EnvFilter,
@@ -312,6 +358,18 @@ class FilterDsl:
     def not_(arg: "Filter") -> _Not:
         return _Not(not_=arg)
 
+    @staticmethod
+    def true() -> "Filter":
+        return _CustomCondition(
+            field="urn",
+            condition="EXISTS",
+            values=[],
+        )
+
+    @staticmethod
+    def false() -> "Filter":
+        return FilterDsl.not_(FilterDsl.true())
+
     @staticmethod
     def entity_type(
         entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
@@ -354,6 +412,10 @@ class FilterDsl:
             values=[f"{key}={value}"],
         )
 
+    @staticmethod
+    def soft_deleted(status: RemovedStatusFilter) -> _StatusFilter:
+        return _StatusFilter(status=status)
+
     # TODO: Add a soft-deletion status filter
     # TODO: add a container / browse path filter
     # TODO add shortcut for custom filters
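FilterDsl.soft_deleted gives callers explicit control over the status handling that compile_filters otherwise applies. A short sketch of opting back in to soft-deleted entities; this assumes RemovedStatusFilter exposes an ALL member, as it does in the graph client:

```python
from datahub.ingestion.graph.filters import RemovedStatusFilter
from datahub.sdk.search_filters import FilterDsl

# Search datasets including soft-deleted ones; without the explicit status
# filter, compile_filters would add NOT_SOFT_DELETED on our behalf.
flt = FilterDsl.and_(
    FilterDsl.entity_type("dataset"),
    FilterDsl.soft_deleted(RemovedStatusFilter.ALL),
)
```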
@@ -32,10 +32,10 @@ def deploy_source_vars(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> dict:
@@ -1,7 +1,15 @@
 import concurrent.futures
 import contextlib
 import queue
-from typing import Any, Callable, Generator, Iterable, Tuple, TypeVar
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Optional,
+    Tuple,
+    TypeVar,
+)
 
 T = TypeVar("T")
 
@@ -18,8 +26,13 @@ class ThreadedIteratorExecutor:
         worker_func: Callable[..., Iterable[T]],
         args_list: Iterable[Tuple[Any, ...]],
         max_workers: int,
-    ) -> Generator[T, None, None]:
-        out_q: queue.Queue[T] = queue.Queue()
+        max_backpressure: Optional[int] = None,
+    ) -> Iterator[T]:
+        if max_backpressure is None:
+            max_backpressure = 10 * max_workers
+        assert max_backpressure >= max_workers
+
+        out_q: queue.Queue[T] = queue.Queue(maxsize=max_backpressure)
 
         def _worker_wrapper(
             worker_func: Callable[..., Iterable[T]], *args: Any