acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/datajob/datajob.py +17 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/ingest_cli.py +4 -4
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/emitter/mcp_builder.py +4 -0
- datahub/errors.py +4 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +6 -2
- datahub/ingestion/api/source_helpers.py +35 -2
- datahub/ingestion/graph/client.py +122 -7
- datahub/ingestion/graph/filters.py +41 -16
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +300 -0
- datahub/ingestion/source/iceberg/iceberg.py +106 -18
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +6 -7
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +16 -3
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/sigma/sigma.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/trino.py +4 -3
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/superset.py +108 -81
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +7 -7
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +73 -11
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataprocess/dataprocess_instance.py CHANGED

@@ -159,6 +159,7 @@ class DataProcessInstance:
             env=self.template_urn.get_env(),
             orchestrator=self.template_urn.get_orchestrator_name(),
             id=self.template_urn.get_flow_id(),
+            platform_instance=self.data_platform_instance,
         )
         for mcp in template_object.generate_mcp():
             self._emit_mcp(mcp, emitter, callback)
@@ -168,6 +169,7 @@ class DataProcessInstance:
             id=self.template_urn.get_job_id(),
             upstream_urns=input_datajob_urns,
             flow_urn=self.template_urn.get_data_flow_urn(),
+            platform_instance=self.data_platform_instance,
         )
         for mcp in template_object.generate_mcp():
             self._emit_mcp(mcp, emitter, callback)
@@ -382,6 +384,7 @@ class DataProcessInstance:
             cluster=datajob.flow_urn.cluster,
             template_urn=datajob.urn,
             id=id,
+            data_platform_instance=datajob.platform_instance,
         )
         dpi._template_object = datajob

@@ -438,6 +441,7 @@ class DataProcessInstance:
             orchestrator=dataflow.orchestrator,
             cluster=cast(str, dataflow.env),
             template_urn=dataflow.urn,
+            data_platform_instance=dataflow.platform_instance,
         )
         dpi._template_object = dataflow
         return dpi
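These four hunks thread platform-instance information through DataProcessInstance: emitting a template flow or job now includes the instance's data_platform_instance, and from_datajob/from_dataflow copy the template's platform_instance instead of dropping it. A rough sketch of the effect, assuming DataFlow exposes a platform_instance field (per the companion dataflow.py change in this release) and the usual from_dataflow(dataflow, id) signature:

from datahub.api.entities.datajob import DataFlow
from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance

# Hypothetical flow; platform_instance is assumed to be a DataFlow field in this release.
flow = DataFlow(
    orchestrator="airflow",
    id="daily_refresh",
    env="PROD",
    platform_instance="emea",
)

# The instance created from the flow now carries the flow's platform instance,
# so its emitted dataPlatformInstance aspect points at "emea" rather than being dropped.
dpi = DataProcessInstance.from_dataflow(dataflow=flow, id="daily_refresh_2024_01_01")
assert dpi.data_platform_instance == "emea"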
datahub/api/entities/dataset/dataset.py CHANGED

@@ -483,7 +483,7 @@ class Dataset(StrictModel):
                         f"{urn_prefix}:{prop_key}"
                         if not prop_key.startswith(urn_prefix)
                         else prop_key
-                        for prop_key in field.structured_properties
+                        for prop_key in field.structured_properties
                     ]
                 )
             if field.glossaryTerms:
@@ -497,7 +497,7 @@ class Dataset(StrictModel):
                     f"{urn_prefix}:{prop_key}"
                     if not prop_key.startswith(urn_prefix)
                     else prop_key
-                    for prop_key in self.structured_properties
+                    for prop_key in self.structured_properties
                 ]
             )
         if self.glossary_terms:
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED

@@ -43,7 +43,7 @@ class AllowedValue(ConfigModel):


 VALID_ENTITY_TYPE_URNS = [
-    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES
+    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES
 ]
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."

datahub/cli/ingest_cli.py CHANGED

@@ -216,9 +216,9 @@ def run(
 @click.option(
     "--executor-id",
     type=str,
-    default="default",
     help="Executor id to route execution requests to. Do not use this unless you have configured a custom executor.",
     required=False,
+    default=None,
 )
 @click.option(
     "--cli-version",
@@ -239,7 +239,7 @@ def run(
     type=str,
     help="Timezone for the schedule in 'America/New_York' format. Uses UTC by default.",
     required=False,
-    default=
+    default=None,
 )
 @click.option(
     "--debug", type=bool, help="Should we debug.", required=False, default=False
@@ -255,10 +255,10 @@ def deploy(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> None:
datahub/cli/migrate.py CHANGED

@@ -76,13 +76,13 @@ class MigrationReport:
     def __repr__(self) -> str:
         repr = f"{self._get_prefix()}Migration Report:\n--------------\n"
         repr += f"{self._get_prefix()}Migration Run Id: {self.run_id}\n"
-        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created
-        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected
-        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated
+        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created]))}\n"
+        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected]))}\n"
+        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated]))}\n"
         repr += f"{self._get_prefix()}Details:\n"
-        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created
-        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected
-        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated
+        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created]) or 'None'}\n"
+        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected]) or 'None'}\n"
+        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated]) or 'None'}\n"
         return repr


datahub/configuration/common.py CHANGED

@@ -317,7 +317,7 @@ class KeyValuePattern(ConfigModel):
         return KeyValuePattern()

     def value(self, string: str) -> List[str]:
-        matching_keys = [key for key in self.rules
+        matching_keys = [key for key in self.rules if re.match(key, string)]
         if not matching_keys:
             return []
         elif self.first_match_only:
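The rewritten comprehension makes the matching rule explicit: each key in rules is treated as a regular expression, and re.match anchors it at the start of the lookup string. A small self-contained illustration of that behaviour (a plain dict stands in for KeyValuePattern.rules; the rule values are made up):

import re
from typing import Dict, List

# Regex keys mapped to values, mirroring the shape of KeyValuePattern.rules.
rules: Dict[str, List[str]] = {
    r"analytics\..*": ["team-analytics"],
    r".*": ["catch-all"],
}

def first_match(string: str) -> List[str]:
    # re.match is anchored at the beginning, so "analytics\..*" matches
    # "analytics.orders" but not "raw.analytics.orders".
    matching_keys = [key for key in rules if re.match(key, string)]
    return rules[matching_keys[0]] if matching_keys else []

print(first_match("analytics.orders"))      # ['team-analytics']
print(first_match("raw.analytics.orders"))  # ['catch-all']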
datahub/emitter/mcp_builder.py CHANGED
datahub/errors.py CHANGED
datahub/ingestion/api/common.py CHANGED

@@ -12,6 +12,9 @@ if TYPE_CHECKING:

 T = TypeVar("T")

+if TYPE_CHECKING:
+    from datahub.ingestion.run.pipeline_config import FlagsConfig
+

 @dataclass
 class RecordEnvelope(Generic[T]):
@@ -60,6 +63,12 @@ class PipelineContext:

         self._set_dataset_urn_to_lower_if_needed()

+    @property
+    def flags(self) -> "FlagsConfig":
+        from datahub.ingestion.run.pipeline_config import FlagsConfig
+
+        return self.pipeline_config.flags if self.pipeline_config else FlagsConfig()
+
     def _set_dataset_urn_to_lower_if_needed(self) -> None:
         # TODO: Get rid of this function once lower-casing is the standard.
         if self.graph:
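The new flags property exposes the pipeline's FlagsConfig to anything holding a PipelineContext, falling back to a default FlagsConfig when no pipeline config is attached; the system-metadata stamping added to source_helpers below keys off it. A rough sketch, assuming a context built with just a run id:

from datahub.ingestion.api.common import PipelineContext

# A bare context with no pipeline_config attached (e.g. in a unit test);
# ctx.flags then falls back to a default FlagsConfig instance.
ctx = PipelineContext(run_id="manual-run-2024")

if ctx.flags.set_system_metadata:
    print(f"work units from run {ctx.run_id} will be stamped with system metadata")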
datahub/ingestion/api/source.py CHANGED

@@ -39,6 +39,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
     auto_browse_path_v2,
     auto_fix_duplicate_schema_field_paths,
     auto_fix_empty_field_paths,
@@ -51,6 +52,7 @@ from datahub.ingestion.api.source_helpers import (
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import UpstreamLineageClass
+from datahub.sdk.entity import Entity
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation

@@ -474,13 +476,15 @@ class Source(Closeable, metaclass=ABCMeta):
         return stream

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        workunit_processors = self.get_workunit_processors()
+        workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
         return self._apply_workunit_processors(
-
+            workunit_processors, auto_workunit(self.get_workunits_internal())
         )

     def get_workunits_internal(
         self,
-    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
         raise NotImplementedError(
             "get_workunits_internal must be implemented if get_workunits is not overriden."
         )
datahub/ingestion/api/source_helpers.py CHANGED

@@ -13,9 +13,14 @@ from typing import (
 )

 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.mce_builder import
+from datahub.emitter.mce_builder import (
+    get_sys_time,
+    make_dataplatform_instance_urn,
+    parse_ts_millis,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import entity_supports_aspect
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
@@ -35,6 +40,7 @@ from datahub.metadata.schema_classes import (
     TimeWindowSizeClass,
 )
 from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
+from datahub.sdk.entity import Entity
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.telemetry import telemetry
 from datahub.utilities.urns.error import InvalidUrnError
@@ -49,7 +55,12 @@ logger = logging.getLogger(__name__)

 def auto_workunit(
     stream: Iterable[
-        Union[
+        Union[
+            MetadataChangeEventClass,
+            MetadataChangeProposalWrapper,
+            MetadataWorkUnit,
+            Entity,
+        ]
     ],
 ) -> Iterable[MetadataWorkUnit]:
     """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
@@ -62,6 +73,8 @@ def auto_workunit(
             )
         elif isinstance(item, MetadataChangeProposalWrapper):
             yield item.as_workunit()
+        elif isinstance(item, Entity):
+            yield from item.as_workunits()
         else:
             yield item

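With the new isinstance branch, auto_workunit, and therefore any source's get_workunits_internal, accepts SDK entity objects directly and expands them via Entity.as_workunits(). A rough sketch, assuming the SDK Dataset entity exported from datahub.sdk:

from datahub.ingestion.api.source_helpers import auto_workunit
from datahub.sdk import Dataset

# An SDK entity can be mixed into the same stream as MCPs, MCEs, and work units;
# auto_workunit expands it into one work unit per aspect.
dataset = Dataset(
    platform="hive",
    name="db.example_table",
    description="Example dataset emitted as an SDK entity",
)

for wu in auto_workunit([dataset]):
    print(wu.id)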
@@ -536,3 +549,23 @@ def _prepend_platform_instance(
         return [BrowsePathEntryClass(id=urn, urn=urn)] + entries

     return entries
+
+
+class AutoSystemMetadata:
+    def __init__(self, ctx: PipelineContext):
+        self.ctx = ctx
+
+    def stamp(self, stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
+        for wu in stream:
+            yield self.stamp_wu(wu)
+
+    def stamp_wu(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        if self.ctx.flags.set_system_metadata:
+            if not wu.metadata.systemMetadata:
+                wu.metadata.systemMetadata = SystemMetadataClass()
+            wu.metadata.systemMetadata.runId = self.ctx.run_id
+            if not wu.metadata.systemMetadata.lastObserved:
+                wu.metadata.systemMetadata.lastObserved = get_sys_time()
+            if self.ctx.flags.set_system_metadata_pipeline_name:
+                wu.metadata.systemMetadata.pipelineName = self.ctx.pipeline_name
+        return wu
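AutoSystemMetadata is the replacement for the SystemMetadataTransformer removed from pipeline.py further down: Source.get_workunits now appends AutoSystemMetadata(self.ctx).stamp as the final work-unit processor, so each work unit's systemMetadata gets the run id, a lastObserved timestamp, and (when enabled) the pipeline name as it streams out. A minimal sketch of stamping a single hand-built work unit, using only names that appear in the hunks above:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source_helpers import AutoSystemMetadata
from datahub.metadata.schema_classes import StatusClass

ctx = PipelineContext(run_id="demo-run", pipeline_name="demo-pipeline")

wu = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.example_table,PROD)",
    aspect=StatusClass(removed=False),
).as_workunit()

stamped = AutoSystemMetadata(ctx).stamp_wu(wu)
# With default flags this prints "demo-run"; stamping is skipped if
# set_system_metadata is disabled in the pipeline's FlagsConfig.
print(stamped.metadata.systemMetadata and stamped.metadata.systemMetadata.runId)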
datahub/ingestion/graph/client.py CHANGED

@@ -27,6 +27,7 @@ from pydantic import BaseModel
 from requests.models import HTTPError
 from typing_extensions import deprecated

+from datahub._codegen.aspect import _Aspect
 from datahub.cli import config_utils
 from datahub.configuration.common import ConfigModel, GraphError, OperationalError
 from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
@@ -49,6 +50,7 @@ from datahub.ingestion.graph.connections import (
 )
 from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
 from datahub.ingestion.graph.filters import (
+    RawSearchFilter,
     RawSearchFilterRule,
     RemovedStatusFilter,
     generate_filter,
@@ -75,10 +77,11 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TelemetryClientIdClass,
 )
+from datahub.metadata.urns import CorpUserUrn, Urn
 from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.str_enum import StrEnum
-from datahub.utilities.urns.urn import
+from datahub.utilities.urns.urn import guess_entity_type

 if TYPE_CHECKING:
     from datahub.ingestion.sink.datahub_rest import (
@@ -116,7 +119,7 @@ def entity_type_to_graphql(entity_type: str) -> str:
     """Convert the entity types into GraphQL "EntityType" enum values."""

     # Hard-coded special cases.
-    if entity_type ==
+    if entity_type == CorpUserUrn.ENTITY_TYPE:
         return "CORP_USER"

     # Convert camelCase to UPPER_UNDERSCORE.
@@ -133,6 +136,14 @@ def entity_type_to_graphql(entity_type: str) -> str:
     return entity_type


+def flexible_entity_type_to_graphql(entity_type: str) -> str:
+    if entity_type.upper() == entity_type:
+        # Assume that we were passed a graphql EntityType enum value,
+        # so no conversion is needed.
+        return entity_type
+    return entity_type_to_graphql(entity_type)
+
+
 class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def __init__(self, config: DatahubClientConfig) -> None:
         self.config = config
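The new helper accepts either a lowercase entity type name or a value that is already a GraphQL EntityType enum (detected by being all upper-case) and normalizes both to the enum form, which the types-list construction below relies on. For example:

from datahub.ingestion.graph.client import flexible_entity_type_to_graphql

assert flexible_entity_type_to_graphql("dataset") == "DATASET"
assert flexible_entity_type_to_graphql("DATASET") == "DATASET"    # already an enum value, passed through
assert flexible_entity_type_to_graphql("corpuser") == "CORP_USER"  # hard-coded special case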
@@ -805,7 +816,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

         :return: An iterable of (urn, schema info) tuple that match the filters.
         """
-        types = [
+        types = self._get_types(["dataset"])

         # Add the query default of * if no query is specified.
         query = query or "*"
@@ -873,10 +884,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         env: Optional[str] = None,
         query: Optional[str] = None,
         container: Optional[str] = None,
-        status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
+        status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
         extraFilters: Optional[List[RawSearchFilterRule]] = None,
-        extra_or_filters: Optional[
+        extra_or_filters: Optional[RawSearchFilter] = None,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.

@@ -968,7 +979,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
         extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
-        extra_or_filters: Optional[
+        extra_or_filters: Optional[RawSearchFilter] = None,
         extra_source_fields: Optional[List[str]] = None,
         skip_cache: bool = False,
     ) -> Iterable[dict]:
@@ -1121,7 +1132,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         )

         types = [
-
+            flexible_entity_type_to_graphql(entity_type)
+            for entity_type in entity_types
         ]
         return types

@@ -1686,6 +1698,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

         return res["runAssertionsForAsset"]

+    @deprecated("Use get_entities instead which returns typed aspects")
     def get_entities_v2(
         self,
         entity_name: str,
@@ -1725,6 +1738,108 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 retval[entity_urn][aspect_key] = aspect_value
         return retval

+    def get_entities(
+        self,
+        entity_name: str,
+        urns: List[str],
+        aspects: Optional[List[str]] = None,
+        with_system_metadata: bool = False,
+    ) -> Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]]:
+        """
+        Get entities using the OpenAPI v3 endpoint, deserializing aspects into typed objects.
+
+        Args:
+            entity_name: The entity type name
+            urns: List of entity URNs to fetch
+            aspects: Optional list of aspect names to fetch. If None, all aspects will be fetched.
+            with_system_metadata: If True, return system metadata along with each aspect.
+
+        Returns:
+            A dictionary mapping URNs to a dictionary of aspect name to tuples of
+            (typed aspect object, system metadata). If with_system_metadata is False,
+            the system metadata in the tuple will be None.
+        """
+        aspects = aspects or []
+
+        request_payload = []
+        for urn in urns:
+            entity_request: Dict[str, Any] = {"urn": urn}
+            for aspect_name in aspects:
+                entity_request[aspect_name] = {}
+            request_payload.append(entity_request)
+
+        headers: Dict[str, Any] = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+
+        url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
+        if with_system_metadata:
+            url += "?systemMetadata=true"
+
+        response = self._session.post(
+            url, data=json.dumps(request_payload), headers=headers
+        )
+        response.raise_for_status()
+        entities = response.json()
+
+        result: Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]] = {}
+
+        for entity in entities:
+            entity_urn = entity.get("urn")
+            if entity_urn is None:
+                logger.warning(
+                    f"Missing URN in entity response: {entity}, skipping deserialization"
+                )
+                continue
+
+            entity_aspects: Dict[
+                str, Tuple[_Aspect, Optional[SystemMetadataClass]]
+            ] = {}
+
+            for aspect_name, aspect_obj in entity.items():
+                if aspect_name == "urn":
+                    continue
+
+                aspect_class = ASPECT_NAME_MAP.get(aspect_name)
+                if aspect_class is None:
+                    logger.warning(
+                        f"Unknown aspect type {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                aspect_value = aspect_obj.get("value")
+                if aspect_value is None:
+                    logger.warning(
+                        f"Unknown aspect value for aspect {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                try:
+                    post_json_obj = post_json_transform(aspect_value)
+                    typed_aspect = aspect_class.from_obj(post_json_obj)
+                    assert isinstance(typed_aspect, aspect_class) and isinstance(
+                        typed_aspect, _Aspect
+                    )
+
+                    system_metadata = None
+                    if with_system_metadata:
+                        system_metadata_obj = aspect_obj.get("systemMetadata")
+                        if system_metadata_obj:
+                            system_metadata = SystemMetadataClass.from_obj(
+                                system_metadata_obj
+                            )
+
+                    entity_aspects[aspect_name] = (typed_aspect, system_metadata)
+                except Exception as e:
+                    logger.error(f"Error deserializing aspect {aspect_name}: {e}")
+                    raise
+
+            if entity_aspects:
+                result[entity_urn] = entity_aspects
+
+        return result
+
     def upsert_custom_assertion(
         self,
         urn: Optional[str],
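The new DataHubGraph.get_entities reads entities through the OpenAPI v3 batchGet endpoint and returns typed aspect objects (optionally paired with their SystemMetadataClass), which is why get_entities_v2 is now marked deprecated. A rough usage sketch, assuming a reachable DataHub server and an existing dataset urn:

from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,db.example_table,PROD)"
entities = graph.get_entities(
    entity_name="dataset",
    urns=[dataset_urn],
    aspects=["status", "datasetProperties"],
    with_system_metadata=True,
)

for aspect_name, (aspect, system_metadata) in entities.get(dataset_urn, {}).items():
    # Each aspect comes back as its typed class (e.g. StatusClass, DatasetPropertiesClass),
    # with the aspect's system metadata alongside when with_system_metadata=True.
    print(aspect_name, type(aspect).__name__, system_metadata and system_metadata.runId)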
datahub/ingestion/graph/filters.py CHANGED

@@ -1,6 +1,7 @@
 import dataclasses
 import enum
-
+import warnings
+from typing import Dict, List, Literal, Optional, Union

 from typing_extensions import TypeAlias

@@ -8,9 +9,14 @@ from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
 )
+from datahub.errors import SearchFilterWarning
 from datahub.utilities.urns.urn import guess_entity_type

-RawSearchFilterRule = Dict[str,
+RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
+
+# This is a list of OR filters, each of which is a list of AND filters.
+# This can be put directly into the orFilters parameter in GraphQL.
+RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]

 # Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
 FilterOperator: TypeAlias = Literal[
@@ -39,12 +45,14 @@ class SearchFilterRule:
     negated: bool = False

     def to_raw(self) -> RawSearchFilterRule:
-
+        rule: RawSearchFilterRule = {
             "field": self.field,
             "condition": self.condition,
             "values": self.values,
-            "negated": self.negated,
         }
+        if self.negated:
+            rule["negated"] = True
+        return rule

     def negate(self) -> "SearchFilterRule":
         return SearchFilterRule(
@@ -73,10 +81,10 @@ def generate_filter(
     platform_instance: Optional[str],
     env: Optional[str],
     container: Optional[str],
-    status: RemovedStatusFilter,
+    status: Optional[RemovedStatusFilter],
     extra_filters: Optional[List[RawSearchFilterRule]],
-    extra_or_filters: Optional[
-) ->
+    extra_or_filters: Optional[RawSearchFilter] = None,
+) -> RawSearchFilter:
     """
     Generate a search filter based on the provided parameters.
     :param platform: The platform to filter by.
@@ -105,15 +113,16 @@ def generate_filter(
         and_filters.append(_get_container_filter(container).to_raw())

     # Status filter.
-
-
-
+    if status:
+        status_filter = _get_status_filter(status)
+        if status_filter:
+            and_filters.append(status_filter.to_raw())

     # Extra filters.
     if extra_filters:
         and_filters += extra_filters

-    or_filters:
+    or_filters: RawSearchFilter = [{"and": and_filters}]

     # Env filter
     if env:
@@ -127,11 +136,27 @@ def generate_filter(

     # Extra OR filters are distributed across the top level and lists.
     if extra_or_filters:
-
-
-        for extra_or_filter in extra_or_filters
-
-
+        new_or_filters: RawSearchFilter = []
+        for and_filter in or_filters:
+            for extra_or_filter in extra_or_filters:
+                if isinstance(extra_or_filter, dict) and "and" in extra_or_filter:
+                    new_or_filters.append(
+                        {"and": and_filter["and"] + extra_or_filter["and"]}
+                    )
+                else:
+                    # Hack for backwards compatibility.
+                    # We have some code that erroneously passed a List[RawSearchFilterRule]
+                    # instead of a List[Dict["and", List[RawSearchFilterRule]]].
+                    warnings.warn(
+                        "Passing a List[RawSearchFilterRule] to extra_or_filters is deprecated. "
+                        "Please pass a List[Dict[str, List[RawSearchFilterRule]]] instead.",
+                        SearchFilterWarning,
+                        stacklevel=3,
+                    )
+                    new_or_filters.append(
+                        {"and": and_filter["and"] + [extra_or_filter]}  # type: ignore
+                    )
+        or_filters = new_or_filters

     return or_filters

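generate_filter now returns, and extra_or_filters now accepts, the RawSearchFilter shape declared above: a list of OR branches, each holding an "and" list of raw rules, ready to be passed as the GraphQL orFilters argument. A small illustration of the distribution step, building raw rules with SearchFilterRule.to_raw() (the field names and values here are only examples):

from datahub.ingestion.graph.filters import RawSearchFilter, SearchFilterRule

base = SearchFilterRule(field="platform", condition="EQUAL", values=["urn:li:dataPlatform:hive"])
extra: RawSearchFilter = [
    {"and": [SearchFilterRule(field="origin", condition="EQUAL", values=["PROD"]).to_raw()]},
    {"and": [SearchFilterRule(field="origin", condition="EQUAL", values=["DEV"]).to_raw()]},
]

# Distributing the extra OR branches over the single base branch yields two
# top-level branches, each ANDing the platform rule with one origin rule;
# this mirrors the structure generate_filter builds before returning or_filters.
or_filters: RawSearchFilter = [{"and": [base.to_raw()] + branch["and"]} for branch in extra]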
datahub/ingestion/run/pipeline.py CHANGED

@@ -39,9 +39,6 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
-from datahub.ingestion.transformer.system_metadata_transformer import (
-    SystemMetadataTransformer,
-)
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
@@ -286,9 +283,6 @@ class Pipeline:
                 f"Transformer type:{transformer_type},{transformer_class} configured"
             )

-        # Add the system metadata transformer at the end of the list.
-        self.transformers.append(SystemMetadataTransformer(self.ctx))
-
     def _configure_reporting(self, report_to: Optional[str]) -> None:
         if self.dry_run:
             # In dry run mode, we don't want to report anything.
datahub/ingestion/source/aws/sagemaker_processors/models.py CHANGED

@@ -323,7 +323,7 @@ class ModelProcessor:
             model_training_jobs = model_training_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in data_url_matched_jobs
+                    for job_urn, job_direction in data_url_matched_jobs
                     if job_direction == JobDirection.TRAINING
                 }
             )
@@ -331,7 +331,7 @@ class ModelProcessor:
             model_downstream_jobs = model_downstream_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in data_url_matched_jobs
+                    for job_urn, job_direction in data_url_matched_jobs
                     if job_direction == JobDirection.DOWNSTREAM
                 }
             )
@@ -368,7 +368,7 @@ class ModelProcessor:
             model_training_jobs = model_training_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in name_matched_jobs
+                    for job_urn, job_direction in name_matched_jobs
                     if job_direction == JobDirection.TRAINING
                 }
             )
@@ -376,7 +376,7 @@ class ModelProcessor:
             model_downstream_jobs = model_downstream_jobs.union(
                 {
                     job_urn
-                    for job_urn, job_direction in name_matched_jobs
+                    for job_urn, job_direction in name_matched_jobs
                     if job_direction == JobDirection.DOWNSTREAM
                 }
             )
datahub/ingestion/source/bigquery_v2/lineage.py CHANGED

@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
                 memory_footprint.total_size(lineage)
             )

-        for lineage_key in lineage
+        for lineage_key in lineage:
             # For views, we do not use the upstreams obtained by parsing audit logs
             # as they may contain indirectly referenced tables.
             if (
|