acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

This version of acryl-datahub has been flagged as potentially problematic.

Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/sdk/dataset.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import warnings
 from datetime import datetime
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Type, Union
 
 from typing_extensions import Self, TypeAlias, assert_never
 
@@ -13,37 +13,43 @@ from datahub.errors import (
     IngestionAttributionWarning,
     ItemNotFoundError,
     SchemaFieldKeyError,
+    SdkUsageError,
 )
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
 from datahub.sdk._attribution import is_ingestion_attribution
-from datahub.sdk._entity import Entity
 from datahub.sdk._shared import (
-    ContainerInputType,
     DatasetUrnOrStr,
     DomainInputType,
     HasContainer,
     HasDomain,
+    HasInstitutionalMemory,
     HasOwnership,
     HasPlatformInstance,
     HasSubtype,
     HasTags,
     HasTerms,
+    LinksInputType,
     OwnersInputType,
+    ParentContainerInputType,
+    TagInputType,
     TagsInputType,
+    TermInputType,
     TermsInputType,
     make_time_stamp,
     parse_time_stamp,
 )
+from datahub.sdk._utils import add_list_unique, remove_list_unique
+from datahub.sdk.entity import Entity, ExtraAspectsType
+from datahub.utilities.sentinels import Unset, unset
 
 SchemaFieldInputType: TypeAlias = Union[
-    str,
     Tuple[str, str],  # (name, type)
     Tuple[str, str, str],  # (name, type, description)
     models.SchemaFieldClass,
 ]
 SchemaFieldsInputType: TypeAlias = Union[
-    List[SchemaFieldInputType],
+    Sequence[SchemaFieldInputType],
     models.SchemaMetadataClass,
 ]
 
@@ -68,9 +74,9 @@ UpstreamLineageInputType: TypeAlias = Union[
 def _parse_upstream_input(
     upstream_input: UpstreamInputType,
 ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
-    if isinstance(upstream_input, models.UpstreamClass):
-        return upstream_input
-    elif isinstance(upstream_input, models.FineGrainedLineageClass):
+    if isinstance(
+        upstream_input, (models.UpstreamClass, models.FineGrainedLineageClass)
+    ):
         return upstream_input
     elif isinstance(upstream_input, (str, DatasetUrn)):
         return models.UpstreamClass(
@@ -271,6 +277,51 @@ class SchemaField:
                 tags=parsed_tags
             )
 
+    def add_tag(self, tag: TagInputType) -> None:
+        parsed_tag = self._parent._parse_tag_association_class(tag)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Adding field tags in ingestion mode is not yet supported. "
+                "Use set_tags instead."
+            )
+        else:
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.globalTags is None:
+                editable_field.globalTags = models.GlobalTagsClass(tags=[])
+
+            add_list_unique(
+                editable_field.globalTags.tags,
+                key=self._parent._tag_key,
+                item=parsed_tag,
+            )
+
+    def remove_tag(self, tag: TagInputType) -> None:
+        parsed_tag = self._parent._parse_tag_association_class(tag)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Removing field tags in ingestion mode is not yet supported. "
+                "Use set_tags instead."
+            )
+        else:
+            base_field = self._base_schema_field()
+            if base_field.globalTags is not None:
+                remove_list_unique(
+                    base_field.globalTags.tags,
+                    key=self._parent._tag_key,
+                    item=parsed_tag,
+                    missing_ok=True,
+                )
+
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.globalTags is not None:
+                remove_list_unique(
+                    editable_field.globalTags.tags,
+                    key=self._parent._tag_key,
+                    item=parsed_tag,
+                )
+
     @property
     def terms(self) -> Optional[List[models.GlossaryTermAssociationClass]]:
         # TODO: Basically the same implementation as tags - can we share code?
@@ -287,7 +338,7 @@ class SchemaField:
 
         return terms
 
-    def set_terms(self, terms: List[models.GlossaryTermAssociationClass]) -> None:
+    def set_terms(self, terms: TermsInputType) -> None:
        parsed_terms = [
             self._parent._parse_glossary_term_association_class(term) for term in terms
         ]
@@ -318,12 +369,62 @@ class SchemaField:
             )
         )
 
+    def add_term(self, term: TermInputType) -> None:
+        parsed_term = self._parent._parse_glossary_term_association_class(term)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Adding field terms in ingestion mode is not yet supported. "
+                "Use set_terms instead."
+            )
+        else:
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.glossaryTerms is None:
+                editable_field.glossaryTerms = models.GlossaryTermsClass(
+                    terms=[],
+                    auditStamp=self._parent._terms_audit_stamp(),
+                )
+
+            add_list_unique(
+                editable_field.glossaryTerms.terms,
+                key=self._parent._terms_key,
+                item=parsed_term,
+            )
+
+    def remove_term(self, term: TermInputType) -> None:
+        parsed_term = self._parent._parse_glossary_term_association_class(term)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Removing field terms in ingestion mode is not yet supported. "
+                "Use set_terms instead."
+            )
+        else:
+            base_field = self._base_schema_field()
+            if base_field.glossaryTerms is not None:
+                remove_list_unique(
+                    base_field.glossaryTerms.terms,
+                    key=self._parent._terms_key,
+                    item=parsed_term,
+                    missing_ok=True,
+                )
+
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.glossaryTerms is not None:
+                remove_list_unique(
+                    editable_field.glossaryTerms.terms,
+                    key=self._parent._terms_key,
+                    item=parsed_term,
+                    missing_ok=True,
+                )
+
 
 class Dataset(
     HasPlatformInstance,
     HasSubtype,
     HasContainer,
     HasOwnership,
+    HasInstitutionalMemory,
     HasTags,
     HasTerms,
     HasDomain,
@@ -352,13 +453,15 @@ class Dataset(
         created: Optional[datetime] = None,
         last_modified: Optional[datetime] = None,
         # Standard aspects.
+        parent_container: ParentContainerInputType | Unset = unset,
         subtype: Optional[str] = None,
-        container: Optional[ContainerInputType] = None,
         owners: Optional[OwnersInputType] = None,
+        links: Optional[LinksInputType] = None,
         tags: Optional[TagsInputType] = None,
         terms: Optional[TermsInputType] = None,
         # TODO structured_properties
         domain: Optional[DomainInputType] = None,
+        extra_aspects: ExtraAspectsType = None,
         # Dataset-specific aspects.
         schema: Optional[SchemaFieldsInputType] = None,
         upstreams: Optional[models.UpstreamLineageClass] = None,
@@ -370,6 +473,7 @@ class Dataset(
             env=env,
         )
         super().__init__(urn)
+        self._set_extra_aspects(extra_aspects)
 
         self._set_platform_instance(urn.platform, platform_instance)
 
@@ -393,12 +497,14 @@ class Dataset(
         if last_modified is not None:
             self.set_last_modified(last_modified)
 
+        if parent_container is not unset:
+            self._set_container(parent_container)
         if subtype is not None:
             self.set_subtype(subtype)
-        if container is not None:
-            self._set_container(container)
         if owners is not None:
             self.set_owners(owners)
+        if links is not None:
+            self.set_links(links)
         if tags is not None:
             self.set_tags(tags)
         if terms is not None:
@@ -537,14 +643,6 @@ class Dataset(
                 nativeDataType=field_type,
                 description=description,
             )
-        elif isinstance(schema_field_input, str):
-            # TODO: Not sure this branch makes sense - we should probably just require types?
-            return models.SchemaFieldClass(
-                fieldPath=schema_field_input,
-                type=models.SchemaFieldDataTypeClass(models.NullTypeClass()),
-                nativeDataType="unknown",
-                description=None,
-            )
         else:
             assert_never(schema_field_input)
 
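Taken together, the dataset.py changes replace the container= constructor argument with parent_container=, add links and extra_aspects arguments, drop bare-string schema fields from SchemaFieldInputType, and give SchemaField add/remove helpers for tags and terms. A minimal usage sketch follows; it is not part of the diff. The platform and dataset names are invented, and the dataset[...] field accessor (implied by the SchemaFieldKeyError import above) is an assumption rather than something shown in this diff.

from datahub.sdk.dataset import Dataset

dataset = Dataset(
    platform="snowflake",                       # illustrative, not from the diff
    name="analytics.public.orders",             # illustrative
    links=["https://wiki.example.com/orders"],  # new `links` argument
    schema=[
        # Bare field-name strings were removed from SchemaFieldInputType;
        # pass (name, type) or (name, type, description) tuples instead.
        ("order_id", "string", "Primary key"),
        ("amount", "number"),
    ],
)

# New per-field helpers. Outside ingestion mode they write to the editable
# schema metadata; remove_tag/remove_term also scrub the base schema field.
field = dataset["order_id"]  # assumed accessor returning a SchemaField
field.add_tag("pii")         # plain names or tag URNs assumed accepted
field.add_term("urn:li:glossaryTerm:OrderId")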
datahub/sdk/{_entity.py → entity.py} CHANGED
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import abc
-from typing import List, Optional, Type, Union
+from typing import TYPE_CHECKING, List, Optional, Type, Union
 
 from typing_extensions import Self
 
@@ -10,6 +12,12 @@ from datahub.errors import SdkUsageError
 from datahub.metadata.urns import Urn
 from datahub.utilities.urns._urn_base import _SpecificUrn
 
+if TYPE_CHECKING:
+    from datahub.ingestion.api.workunit import MetadataWorkUnit
+
+
+ExtraAspectsType = Union[None, List[AspectTypeVar]]
+
 
 class Entity:
     __slots__ = ("_urn", "_prev_aspects", "_aspects")
@@ -36,6 +44,8 @@ class Entity:
 
     def _init_from_graph(self, current_aspects: models.AspectBag) -> Self:
         self._prev_aspects = current_aspects
+
+        self._aspects = {}
         aspect: models._Aspect
         for aspect_name, aspect in (current_aspects or {}).items():  # type: ignore
             aspect_copy = type(aspect).from_obj(aspect.to_obj())
@@ -46,6 +56,10 @@ class Entity:
     @abc.abstractmethod
     def get_urn_type(cls) -> Type[_SpecificUrn]: ...
 
+    @classmethod
+    def entity_type_name(cls) -> str:
+        return cls.get_urn_type().ENTITY_TYPE
+
     @property
     def urn(self) -> _SpecificUrn:
         return self._urn
@@ -85,5 +99,14 @@ class Entity:
         )
         return mcps
 
+    def as_workunits(self) -> List[MetadataWorkUnit]:
+        return [mcp.as_workunit() for mcp in self._as_mcps()]
+
+    def _set_extra_aspects(self, extra_aspects: ExtraAspectsType) -> None:
+        # TODO: Add validation to ensure that an "extra aspect" does not conflict
+        # with / get overridden by a standard aspect.
+        for aspect in extra_aspects or []:
+            self._set_aspect(aspect)
+
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}('{self.urn}')"
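The Entity base class gains as_workunits() plus the _set_extra_aspects() hook behind the new extra_aspects constructor argument. Below is a sketch of how the two compose, using Dataset as the concrete subclass; it is not from the diff, and the values are illustrative.

import datahub.metadata.schema_classes as models
from datahub.sdk.dataset import Dataset

dataset = Dataset(
    platform="snowflake",  # illustrative
    name="analytics.public.orders",
    # Extra aspects are attached alongside the standard ones; per the TODO
    # above, conflicts with standard aspects are not yet validated.
    extra_aspects=[models.StatusClass(removed=False)],
)

# as_workunits() wraps the entity's pending MCPs for use in ingestion code.
for wu in dataset.as_workunits():
    print(wu.id)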
datahub/sdk/entity_client.py CHANGED
@@ -14,10 +14,10 @@ from datahub.metadata.urns import (
     Urn,
 )
 from datahub.sdk._all_entities import ENTITY_CLASSES
-from datahub.sdk._entity import Entity
 from datahub.sdk._shared import UrnOrStr
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
datahub/sdk/main_client.py CHANGED
@@ -7,6 +7,7 @@ from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.sdk.entity_client import EntityClient
 from datahub.sdk.resolver_client import ResolverClient
+from datahub.sdk.search_client import SearchClient
 
 
 class DataHubClient:
@@ -39,12 +40,28 @@ class DataHubClient:
 
         self._graph = graph
 
+        # TODO: test connection
+
     @classmethod
     def from_env(cls) -> "DataHubClient":
+        """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
+
+        This will first check DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN. If not present,
+        it will read credentials from ~/.datahubenv. That file can be created using
+        the `datahub init` command.
+
+        If you're looking to specify the server/token in code, use the
+        DataHubClient(server=..., token=...) constructor instead.
+
+        Returns:
+            A DataHubClient instance.
+        """
+
         # Inspired by the DockerClient.from_env() method.
         # TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
         # That file is part of the "environment", but is not a traditional "env variable".
         graph = get_default_graph()
+
         return cls(graph=graph)
 
     @property
@@ -54,3 +71,9 @@ class DataHubClient:
     @property
     def resolve(self) -> ResolverClient:
         return ResolverClient(self)
+
+    @property
+    def search(self) -> SearchClient:
+        return SearchClient(self)
+
+    # TODO: lineage client
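A short sketch of the client entry points touched here; from_env() resolves credentials as described in the new docstring, and search is the new property alongside resolve. This example is not part of the diff.

from datahub.sdk.main_client import DataHubClient

# Reads DATAHUB_GMS_URL / DATAHUB_GMS_TOKEN, falling back to ~/.datahubenv
# (written by `datahub init`).
client = DataHubClient.from_env()

resolver = client.resolve  # ResolverClient (existing)
search = client.search     # SearchClient (new in this release)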
datahub/sdk/resolver_client.py CHANGED
@@ -9,6 +9,7 @@ from datahub.metadata.urns import (
     DomainUrn,
     GlossaryTermUrn,
 )
+from datahub.sdk.search_filters import Filter, FilterDsl as F
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
@@ -38,37 +39,28 @@ class ResolverClient:
         self, *, name: Optional[str] = None, email: Optional[str] = None
     ) -> CorpUserUrn:
         filter_explanation: str
-        filters = []
+        filter: Filter
         if name is not None:
             if email is not None:
                 raise SdkUsageError("Cannot specify both name and email for auto_user")
-            # TODO: do we filter on displayName or fullName?
+            # We're filtering on both fullName and displayName. It's not clear
+            # what the right behavior is here.
             filter_explanation = f"with name {name}"
-            filters.append(
-                {
-                    "field": "fullName",
-                    "values": [name],
-                    "condition": "EQUAL",
-                }
+            filter = F.or_(
+                F.custom_filter("fullName", "EQUAL", [name]),
+                F.custom_filter("displayName", "EQUAL", [name]),
             )
         elif email is not None:
             filter_explanation = f"with email {email}"
-            filters.append(
-                {
-                    "field": "email",
-                    "values": [email],
-                    "condition": "EQUAL",
-                }
-            )
+            filter = F.custom_filter("email", "EQUAL", [email])
         else:
             raise SdkUsageError("Must specify either name or email for auto_user")
 
-        users = list(
-            self._graph.get_urns_by_filter(
-                entity_types=[CorpUserUrn.ENTITY_TYPE],
-                extraFilters=filters,
-            )
+        filter = F.and_(
+            F.entity_type(CorpUserUrn.ENTITY_TYPE),
+            filter,
         )
+        users = list(self._client.search.get_urns(filter=filter))
         if len(users) == 0:
             # TODO: In auto methods, should we just create the user/domain/etc if it doesn't exist?
             raise ItemNotFoundError(f"User {filter_explanation} not found")
@@ -82,15 +74,11 @@ class ResolverClient:
     def term(self, *, name: str) -> GlossaryTermUrn:
         # TODO: Add some limits on the graph fetch
         terms = list(
-            self._graph.get_urns_by_filter(
-                entity_types=[GlossaryTermUrn.ENTITY_TYPE],
-                extraFilters=[
-                    {
-                        "field": "id",
-                        "values": [name],
-                        "condition": "EQUAL",
-                    }
-                ],
+            self._client.search.get_urns(
+                filter=F.and_(
+                    F.entity_type(GlossaryTermUrn.ENTITY_TYPE),
+                    F.custom_filter("name", "EQUAL", [name]),
+                ),
             )
         )
         if len(terms) == 0:
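The resolver now builds queries with the new FilterDsl rather than raw filter dicts, and the same pattern is available to SDK users. This sketch mirrors auto_user's name lookup; it is not part of the diff, and the name value is illustrative.

from datahub.metadata.urns import CorpUserUrn
from datahub.sdk.main_client import DataHubClient
from datahub.sdk.search_filters import FilterDsl as F

client = DataHubClient.from_env()

name = "Jane Doe"  # illustrative
user_filter = F.and_(
    F.entity_type(CorpUserUrn.ENTITY_TYPE),
    # OR over fullName and displayName, matching the resolver's behavior.
    F.or_(
        F.custom_filter("fullName", "EQUAL", [name]),
        F.custom_filter("displayName", "EQUAL", [name]),
    ),
)
urns = list(client.search.get_urns(filter=user_filter))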
datahub/sdk/search_client.py ADDED
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+)
+
+from datahub.ingestion.graph.filters import RawSearchFilterRule
+from datahub.metadata.urns import Urn
+from datahub.sdk.search_filters import Filter
+
+if TYPE_CHECKING:
+    from datahub.sdk.main_client import DataHubClient
+
+
+def compile_filters(
+    filter: Optional[Filter],
+) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+    # TODO: Not every filter type is supported for every entity type.
+    # If we can detect issues with the filters at compile time, we should
+    # raise an error.
+
+    if filter is None:
+        return None
+
+    initial_filters = filter.compile()
+    return [
+        {"and": [rule.to_raw() for rule in andClause["and"]]}
+        for andClause in initial_filters
+    ]
+
+
+class SearchClient:
+    def __init__(self, client: DataHubClient):
+        self._client = client
+
+    def get_urns(
+        self,
+        query: Optional[str] = None,
+        filter: Optional[Filter] = None,
+    ) -> Iterable[Urn]:
+        # TODO: Add better limit / pagination support.
+        for urn in self._client._graph.get_urns_by_filter(
+            query=query,
+            extra_or_filters=compile_filters(filter),
+        ):
+            yield Urn.from_string(urn)
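For reference, compile_filters() lowers a Filter into the "OR of ANDs" structure that get_urns_by_filter() expects, and get_urns() is the public entry point. A sketch follows; it is not part of the diff, the query string and filter are illustrative, and the exact raw-rule dict shape comes from RawSearchFilterRule.to_raw().

from datahub.metadata.urns import DatasetUrn
from datahub.sdk.main_client import DataHubClient
from datahub.sdk.search_filters import FilterDsl as F

client = DataHubClient.from_env()

# Compiled form is roughly [{"and": [<raw rule>, ...]}, ...], with each raw
# rule produced by RawSearchFilterRule.to_raw().
for urn in client.search.get_urns(
    query="orders",  # optional free-text query
    filter=F.entity_type(DatasetUrn.ENTITY_TYPE),
):
    print(urn)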