PyPI - acryl-datahub - Versions diffs - 1.0.0rc8__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl - Mend

acryl-datahub 1.0.0rc8__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic. Click here for more details.

Files changed (46)

{acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/METADATA +2445 -2445
{acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/RECORD +46 -42
datahub/_version.py +1 -1
datahub/api/entities/dataset/dataset.py +731 -42
datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
datahub/cli/specific/dataset_cli.py +128 -14
datahub/ingestion/graph/client.py +15 -11
datahub/ingestion/graph/filters.py +64 -37
datahub/ingestion/source/cassandra/cassandra.py +1 -1
datahub/ingestion/source/preset.py +7 -4
datahub/ingestion/source/superset.py +158 -24
datahub/metadata/_schema_classes.py +157 -14
datahub/metadata/_urns/urn_defs.py +58 -58
datahub/metadata/schema.avsc +23 -10
datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
datahub/metadata/schemas/CorpUserKey.avsc +2 -1
datahub/metadata/schemas/DataProcessKey.avsc +2 -1
datahub/metadata/schemas/DataProductKey.avsc +2 -1
datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
datahub/metadata/schemas/MLModelKey.avsc +2 -1
datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
datahub/metadata/schemas/PostKey.avsc +2 -1
datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
datahub/metadata/schemas/VersionProperties.avsc +18 -0
datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
datahub/pydantic/__init__.py +0 -0
datahub/pydantic/compat.py +58 -0
datahub/sdk/__init__.py +1 -0
datahub/sdk/_all_entities.py +1 -1
datahub/sdk/_shared.py +88 -3
datahub/sdk/container.py +7 -1
datahub/sdk/dataset.py +7 -1
datahub/sdk/{_entity.py → entity.py} +4 -0
datahub/sdk/entity_client.py +1 -1
datahub/sdk/main_client.py +7 -1
datahub/sdk/resolver_client.py +17 -29
datahub/sdk/search_client.py +50 -0
datahub/sdk/search_filters.py +374 -0
{acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/LICENSE +0 -0
{acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/WHEEL +0 -0
{acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/entry_points.txt +0 -0
{acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc9.dist-info}/top_level.txt +0 -0

datahub/metadata/schemas/MLModelKey.avsc CHANGED Viewed

@@ -30,7 +30,8 @@
       "structuredProperties",
       "forms",
       "testResults",
-      "versionProperties"
+      "versionProperties",
+      "subTypes"
     ]
   },
   "name": "MLModelKey",

datahub/metadata/schemas/MLPrimaryKeyKey.avsc CHANGED Viewed

@@ -17,7 +17,8 @@
       "dataPlatformInstance",
       "structuredProperties",
       "forms",
-      "testResults"
+      "testResults",
+      "subTypes"
     ]
   },
   "name": "MLPrimaryKeyKey",

datahub/metadata/schemas/PostKey.avsc CHANGED Viewed

@@ -5,7 +5,8 @@
     "keyForEntity": "post",
     "entityCategory": "core",
     "entityAspects": [
-      "postInfo"
+      "postInfo",
+      "subTypes"
     ]
   },
   "name": "PostKey",

datahub/metadata/schemas/SchemaFieldKey.avsc CHANGED Viewed

@@ -13,7 +13,8 @@
       "schemaFieldAliases",
       "documentation",
       "testResults",
-      "deprecation"
+      "deprecation",
+      "subTypes"
     ]
   },
   "name": "SchemaFieldKey",

datahub/metadata/schemas/VersionProperties.avsc CHANGED Viewed

@@ -137,6 +137,24 @@
       "name": "sortId",
       "doc": "Sort identifier that determines where a version lives in the order of the Version Set.\nWhat this looks like depends on the Version Scheme. For sort ids generated by DataHub we use an 8 character string representation."
     },
+    {
+      "type": {
+        "type": "enum",
+        "symbolDocs": {
+          "ALPHANUMERIC_GENERATED_BY_DATAHUB": "String managed by DataHub. Currently, an 8 character alphabetical string.",
+          "LEXICOGRAPHIC_STRING": "String sorted lexicographically."
+        },
+        "name": "VersioningScheme",
+        "namespace": "com.linkedin.pegasus2avro.versionset",
+        "symbols": [
+          "LEXICOGRAPHIC_STRING",
+          "ALPHANUMERIC_GENERATED_BY_DATAHUB"
+        ]
+      },
+      "name": "versioningScheme",
+      "default": "LEXICOGRAPHIC_STRING",
+      "doc": "What versioning scheme `sortId` belongs to.\nDefaults to a plain string that is lexicographically sorted."
+    },
     {
       "type": [
         "null",

datahub/metadata/schemas/VersionSetProperties.avsc CHANGED Viewed

@@ -36,9 +36,14 @@
     {
       "type": {
         "type": "enum",
+        "symbolDocs": {
+          "ALPHANUMERIC_GENERATED_BY_DATAHUB": "String managed by DataHub. Currently, an 8 character alphabetical string.",
+          "LEXICOGRAPHIC_STRING": "String sorted lexicographically."
+        },
         "name": "VersioningScheme",
         "namespace": "com.linkedin.pegasus2avro.versionset",
         "symbols": [
+          "LEXICOGRAPHIC_STRING",
           "ALPHANUMERIC_GENERATED_BY_DATAHUB"
         ]
       },

datahub/pydantic/__init__.py ADDED Viewed

File without changes

datahub/pydantic/compat.py ADDED Viewed

@@ -0,0 +1,58 @@
+import functools
+from typing import Any, Callable, Optional, TypeVar, cast
+# Define a type variable for the decorator
+F = TypeVar("F", bound=Callable[..., Any])
+# Check which Pydantic version is installed
+def get_pydantic_version() -> int:
+    """Determine if Pydantic v1 or v2 is installed."""
+    try:
+        import pydantic
+        version = pydantic.__version__
+        return 1 if version.startswith("1.") else 2
+    except (ImportError, AttributeError):
+        # Default to v1 if we can't determine version
+        return 1
+PYDANTIC_VERSION = get_pydantic_version()
+# Create compatibility layer for dict-like methods
+def compat_dict_method(v1_method: Optional[Callable] = None) -> Callable:
+    """
+    Decorator to make a dict method work with both Pydantic v1 and v2.
+    In v1: Uses the decorated method (typically dict)
+    In v2: Redirects to model_dump with appropriate parameter mapping
+    """
+    def decorator(func: F) -> F:
+        @functools.wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if PYDANTIC_VERSION >= 2:
+                # Map v1 parameters to v2 parameters
+                # exclude -> exclude
+                # exclude_unset -> exclude_unset
+                # exclude_defaults -> exclude_defaults
+                # exclude_none -> exclude_none
+                # by_alias -> by_alias
+                model_dump_kwargs = kwargs.copy()
+                # Handle the 'exclude' parameter differently between versions
+                exclude = kwargs.get("exclude", set())
+                if isinstance(exclude, (set, dict)):
+                    model_dump_kwargs["exclude"] = exclude
+                return self.model_dump(**model_dump_kwargs)
+            return func(self, *args, **kwargs)
+        return cast(F, wrapper)
+    # Allow use as both @compat_dict_method and @compat_dict_method()
+    if v1_method is None:
+        return decorator
+    return decorator(v1_method)

datahub/sdk/__init__.py CHANGED Viewed

@@ -20,6 +20,7 @@ from datahub.metadata.urns import (
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
 from datahub.sdk.main_client import DataHubClient
+from datahub.sdk.search_filters import Filter, FilterDsl
 # We want to print out the warning if people do `from datahub.sdk import X`.
 # But we don't want to print out warnings if they're doing a more direct

datahub/sdk/_all_entities.py CHANGED Viewed

@@ -1,8 +1,8 @@
 from typing import Dict, List, Type
-from datahub.sdk._entity import Entity
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 # TODO: Is there a better way to declare this?
 ENTITY_CLASSES_LIST: List[Type[Entity]] = [

datahub/sdk/_shared.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import (
     Callable,
     List,
     Optional,
+    Sequence,
     Tuple,
     Union,
 )
@@ -36,8 +37,8 @@ from datahub.metadata.urns import (
     TagUrn,
     Urn,
 )
-from datahub.sdk._entity import Entity
 from datahub.sdk._utils import add_list_unique, remove_list_unique
+from datahub.sdk.entity import Entity
 from datahub.utilities.urns.error import InvalidUrnError
 if TYPE_CHECKING:
@@ -49,6 +50,8 @@ DatajobUrnOrStr: TypeAlias = Union[str, DataJobUrn]
 ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn]
+_DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn()
 def make_time_stamp(ts: Optional[datetime]) -> Optional[models.TimeStampClass]:
     if ts is None:
@@ -438,8 +441,7 @@ class HasTerms(Entity):
     def _terms_audit_stamp(self) -> models.AuditStampClass:
         return models.AuditStampClass(
             time=0,
-            # TODO figure out what to put here
-            actor=CorpUserUrn("__ingestion").urn(),
+            actor=_DEFAULT_ACTOR_URN,
         )
     def set_terms(self, terms: TermsInputType) -> None:
@@ -493,3 +495,86 @@ class HasDomain(Entity):
     def set_domain(self, domain: DomainInputType) -> None:
         domain_urn = DomainUrn.from_string(domain)  # basically a type assertion
         self._set_aspect(models.DomainsClass(domains=[str(domain_urn)]))
+LinkInputType: TypeAlias = Union[
+    str,
+    Tuple[str, str],  # url, description
+    models.InstitutionalMemoryMetadataClass,
+]
+LinksInputType: TypeAlias = Sequence[LinkInputType]
+class HasInstitutionalMemory(Entity):
+    __slots__ = ()
+    # Internally the aspect is called institutionalMemory, and so much of the code
+    # uses that name. However, the public-facing API is called "links", since
+    # that's what we call these in the UI.
+    def _ensure_institutional_memory(
+        self,
+    ) -> List[models.InstitutionalMemoryMetadataClass]:
+        return self._setdefault_aspect(
+            models.InstitutionalMemoryClass(elements=[])
+        ).elements
+    @property
+    def links(self) -> Optional[List[models.InstitutionalMemoryMetadataClass]]:
+        if institutional_memory := self._get_aspect(models.InstitutionalMemoryClass):
+            return institutional_memory.elements
+        return None
+    @classmethod
+    def _institutional_memory_audit_stamp(self) -> models.AuditStampClass:
+        return models.AuditStampClass(
+            time=0,
+            actor=_DEFAULT_ACTOR_URN,
+        )
+    @classmethod
+    def _parse_link_association_class(
+        cls, link: LinkInputType
+    ) -> models.InstitutionalMemoryMetadataClass:
+        if isinstance(link, models.InstitutionalMemoryMetadataClass):
+            return link
+        elif isinstance(link, str):
+            return models.InstitutionalMemoryMetadataClass(
+                url=link,
+                description=link,
+                createStamp=cls._institutional_memory_audit_stamp(),
+            )
+        elif isinstance(link, tuple) and len(link) == 2:
+            url, description = link
+            return models.InstitutionalMemoryMetadataClass(
+                url=url,
+                description=description,
+                createStamp=cls._institutional_memory_audit_stamp(),
+            )
+        else:
+            assert_never(link)
+    def set_links(self, links: LinksInputType) -> None:
+        self._set_aspect(
+            models.InstitutionalMemoryClass(
+                elements=[self._parse_link_association_class(link) for link in links]
+            )
+        )
+    @classmethod
+    def _link_key(self, link: models.InstitutionalMemoryMetadataClass) -> str:
+        return link.url
+    def add_link(self, link: LinkInputType) -> None:
+        add_list_unique(
+            self._ensure_institutional_memory(),
+            self._link_key,
+            self._parse_link_association_class(link),
+        )
+    def remove_link(self, link: LinkInputType) -> None:
+        remove_list_unique(
+            self._ensure_institutional_memory(),
+            self._link_key,
+            self._parse_link_association_class(link),
+        )

datahub/sdk/container.py CHANGED Viewed

@@ -16,16 +16,17 @@ from datahub.metadata.urns import (
     ContainerUrn,
     Urn,
 )
-from datahub.sdk._entity import Entity, ExtraAspectsType
 from datahub.sdk._shared import (
     DomainInputType,
     HasContainer,
     HasDomain,
+    HasInstitutionalMemory,
     HasOwnership,
     HasPlatformInstance,
     HasSubtype,
     HasTags,
     HasTerms,
+    LinksInputType,
     OwnersInputType,
     ParentContainerInputType,
     TagsInputType,
@@ -33,6 +34,7 @@ from datahub.sdk._shared import (
     make_time_stamp,
     parse_time_stamp,
 )
+from datahub.sdk.entity import Entity, ExtraAspectsType
 from datahub.utilities.sentinels import Auto, auto
@@ -41,6 +43,7 @@ class Container(
     HasSubtype,
     HasContainer,
     HasOwnership,
+    HasInstitutionalMemory,
     HasTags,
     HasTerms,
     HasDomain,
@@ -71,6 +74,7 @@ class Container(
         parent_container: Auto | ParentContainerInputType | None = auto,
         subtype: Optional[str] = None,
         owners: Optional[OwnersInputType] = None,
+        links: Optional[LinksInputType] = None,
         tags: Optional[TagsInputType] = None,
         terms: Optional[TermsInputType] = None,
         domain: Optional[DomainInputType] = None,
@@ -133,6 +137,8 @@ class Container(
             self.set_subtype(subtype)
         if owners is not None:
             self.set_owners(owners)
+        if links is not None:
+            self.set_links(links)
         if tags is not None:
             self.set_tags(tags)
         if terms is not None:

datahub/sdk/dataset.py CHANGED Viewed

@@ -18,17 +18,18 @@ from datahub.errors import (
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
 from datahub.sdk._attribution import is_ingestion_attribution
-from datahub.sdk._entity import Entity, ExtraAspectsType
 from datahub.sdk._shared import (
     DatasetUrnOrStr,
     DomainInputType,
     HasContainer,
     HasDomain,
+    HasInstitutionalMemory,
     HasOwnership,
     HasPlatformInstance,
     HasSubtype,
     HasTags,
     HasTerms,
+    LinksInputType,
     OwnersInputType,
     ParentContainerInputType,
     TagInputType,
@@ -39,6 +40,7 @@ from datahub.sdk._shared import (
     parse_time_stamp,
 )
 from datahub.sdk._utils import add_list_unique, remove_list_unique
+from datahub.sdk.entity import Entity, ExtraAspectsType
 from datahub.utilities.sentinels import Unset, unset
 SchemaFieldInputType: TypeAlias = Union[
@@ -422,6 +424,7 @@ class Dataset(
     HasSubtype,
     HasContainer,
     HasOwnership,
+    HasInstitutionalMemory,
     HasTags,
     HasTerms,
     HasDomain,
@@ -453,6 +456,7 @@ class Dataset(
         parent_container: ParentContainerInputType | Unset = unset,
         subtype: Optional[str] = None,
         owners: Optional[OwnersInputType] = None,
+        links: Optional[LinksInputType] = None,
         tags: Optional[TagsInputType] = None,
         terms: Optional[TermsInputType] = None,
         # TODO structured_properties
@@ -499,6 +503,8 @@ class Dataset(
             self.set_subtype(subtype)
         if owners is not None:
             self.set_owners(owners)
+        if links is not None:
+            self.set_links(links)
         if tags is not None:
             self.set_tags(tags)
         if terms is not None:

datahub/sdk/{_entity.py → entity.py} RENAMED Viewed

@@ -56,6 +56,10 @@ class Entity:
     @abc.abstractmethod
     def get_urn_type(cls) -> Type[_SpecificUrn]: ...
+    @classmethod
+    def entity_type_name(cls) -> str:
+        return cls.get_urn_type().ENTITY_TYPE
     @property
     def urn(self) -> _SpecificUrn:
         return self._urn

datahub/sdk/entity_client.py CHANGED Viewed

@@ -14,10 +14,10 @@ from datahub.metadata.urns import (
     Urn,
 )
 from datahub.sdk._all_entities import ENTITY_CLASSES
-from datahub.sdk._entity import Entity
 from datahub.sdk._shared import UrnOrStr
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient

datahub/sdk/main_client.py CHANGED Viewed

@@ -7,6 +7,7 @@ from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.sdk.entity_client import EntityClient
 from datahub.sdk.resolver_client import ResolverClient
+from datahub.sdk.search_client import SearchClient
 class DataHubClient:
@@ -39,6 +40,8 @@ class DataHubClient:
         self._graph = graph
+    # TODO: test connection
     @classmethod
     def from_env(cls) -> "DataHubClient":
         """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
@@ -69,5 +72,8 @@ class DataHubClient:
     def resolve(self) -> ResolverClient:
         return ResolverClient(self)
-    # TODO: search client
+    @property
+    def search(self) -> SearchClient:
+        return SearchClient(self)
     # TODO: lineage client

datahub/sdk/resolver_client.py CHANGED Viewed

@@ -9,6 +9,7 @@ from datahub.metadata.urns import (
     DomainUrn,
     GlossaryTermUrn,
 )
+from datahub.sdk.search_filters import Filter, FilterDsl as F
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
@@ -38,37 +39,28 @@ class ResolverClient:
         self, *, name: Optional[str] = None, email: Optional[str] = None
     ) -> CorpUserUrn:
         filter_explanation: str
-        filters = []
+        filter: Filter
         if name is not None:
             if email is not None:
                 raise SdkUsageError("Cannot specify both name and email for auto_user")
-            # TODO: do we filter on displayName or fullName?
+            # We're filtering on both fullName and displayName. It's not clear
+            # what the right behavior is here.
             filter_explanation = f"with name {name}"
-            filters.append(
-                {
-                    "field": "fullName",
-                    "values": [name],
-                    "condition": "EQUAL",
-                }
+            filter = F.or_(
+                F.custom_filter("fullName", "EQUAL", [name]),
+                F.custom_filter("displayName", "EQUAL", [name]),
             )
         elif email is not None:
             filter_explanation = f"with email {email}"
-            filters.append(
-                {
-                    "field": "email",
-                    "values": [email],
-                    "condition": "EQUAL",
-                }
-            )
+            filter = F.custom_filter("email", "EQUAL", [email])
         else:
             raise SdkUsageError("Must specify either name or email for auto_user")
-        users = list(
-            self._graph.get_urns_by_filter(
-                entity_types=[CorpUserUrn.ENTITY_TYPE],
-                extraFilters=filters,
-            )
+        filter = F.and_(
+            F.entity_type(CorpUserUrn.ENTITY_TYPE),
+            filter,
         )
+        users = list(self._client.search.get_urns(filter=filter))
         if len(users) == 0:
             # TODO: In auto methods, should we just create the user/domain/etc if it doesn't exist?
             raise ItemNotFoundError(f"User {filter_explanation} not found")
@@ -82,15 +74,11 @@ class ResolverClient:
     def term(self, *, name: str) -> GlossaryTermUrn:
         # TODO: Add some limits on the graph fetch
         terms = list(
-            self._graph.get_urns_by_filter(
-                entity_types=[GlossaryTermUrn.ENTITY_TYPE],
-                extraFilters=[
-                    {
-                        "field": "id",
-                        "values": [name],
-                        "condition": "EQUAL",
-                    }
-                ],
+            self._client.search.get_urns(
+                filter=F.and_(
+                    F.entity_type(GlossaryTermUrn.ENTITY_TYPE),
+                    F.custom_filter("name", "EQUAL", [name]),
+                ),
             )
         )
         if len(terms) == 0:

datahub/sdk/search_client.py ADDED Viewed

@@ -0,0 +1,50 @@
+from __future__ import annotations
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+)
+from datahub.ingestion.graph.filters import RawSearchFilterRule
+from datahub.metadata.urns import Urn
+from datahub.sdk.search_filters import Filter
+if TYPE_CHECKING:
+    from datahub.sdk.main_client import DataHubClient
+def compile_filters(
+    filter: Optional[Filter],
+) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+    # TODO: Not every filter type is supported for every entity type.
+    # If we can detect issues with the filters at compile time, we should
+    # raise an error.
+    if filter is None:
+        return None
+    initial_filters = filter.compile()
+    return [
+        {"and": [rule.to_raw() for rule in andClause["and"]]}
+        for andClause in initial_filters
+    ]
+class SearchClient:
+    def __init__(self, client: DataHubClient):
+        self._client = client
+    def get_urns(
+        self,
+        query: Optional[str] = None,
+        filter: Optional[Filter] = None,
+    ) -> Iterable[Urn]:
+        # TODO: Add better limit / pagination support.
+        for urn in self._client._graph.get_urns_by_filter(
+            query=query,
+            extra_or_filters=compile_filters(filter),
+        ):
+            yield Urn.from_string(urn)

acryl-datahub 1.0.0rc8__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl

Potentially problematic release.

acryl-datahub 1.0.0rc8__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl