acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (82)
  1. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/errors.py +4 -0
  14. datahub/ingestion/api/common.py +9 -0
  15. datahub/ingestion/api/source.py +6 -2
  16. datahub/ingestion/api/source_helpers.py +35 -2
  17. datahub/ingestion/graph/client.py +122 -7
  18. datahub/ingestion/graph/filters.py +41 -16
  19. datahub/ingestion/run/pipeline.py +0 -6
  20. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  21. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  22. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  23. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  24. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  25. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  26. datahub/ingestion/source/hex/constants.py +5 -0
  27. datahub/ingestion/source/hex/hex.py +150 -22
  28. datahub/ingestion/source/hex/mapper.py +28 -2
  29. datahub/ingestion/source/hex/model.py +10 -2
  30. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  31. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  32. datahub/ingestion/source/kafka/kafka.py +1 -4
  33. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  34. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  35. datahub/ingestion/source/looker/looker_source.py +2 -3
  36. datahub/ingestion/source/mlflow.py +6 -7
  37. datahub/ingestion/source/mode.py +2 -2
  38. datahub/ingestion/source/nifi.py +3 -3
  39. datahub/ingestion/source/openapi.py +3 -3
  40. datahub/ingestion/source/openapi_parser.py +8 -8
  41. datahub/ingestion/source/powerbi/config.py +1 -1
  42. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  43. datahub/ingestion/source/redshift/profile.py +2 -2
  44. datahub/ingestion/source/sigma/sigma.py +6 -2
  45. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  46. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  47. datahub/ingestion/source/sql/trino.py +4 -3
  48. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  49. datahub/ingestion/source/superset.py +108 -81
  50. datahub/ingestion/source/tableau/tableau.py +4 -4
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  52. datahub/ingestion/source/unity/source.py +1 -1
  53. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  54. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  55. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  56. datahub/ingestion/transformer/dataset_domain.py +1 -1
  57. datahub/lite/lite_util.py +2 -2
  58. datahub/metadata/_schema_classes.py +47 -2
  59. datahub/metadata/_urns/urn_defs.py +56 -0
  60. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  61. datahub/metadata/schema.avsc +121 -85
  62. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  63. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  64. datahub/metadata/schemas/FormInfo.avsc +5 -0
  65. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  66. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  67. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  68. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  69. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  70. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  71. datahub/sdk/search_client.py +81 -8
  72. datahub/sdk/search_filters.py +73 -11
  73. datahub/testing/mcp_diff.py +1 -1
  74. datahub/utilities/file_backed_collections.py +6 -6
  75. datahub/utilities/hive_schema_to_avro.py +2 -2
  76. datahub/utilities/ingest_utils.py +2 -2
  77. datahub/utilities/threaded_iterator_executor.py +16 -3
  78. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  79. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataprocess/dataprocess_instance.py CHANGED
@@ -159,6 +159,7 @@ class DataProcessInstance:
                 env=self.template_urn.get_env(),
                 orchestrator=self.template_urn.get_orchestrator_name(),
                 id=self.template_urn.get_flow_id(),
+                platform_instance=self.data_platform_instance,
             )
             for mcp in template_object.generate_mcp():
                 self._emit_mcp(mcp, emitter, callback)
@@ -168,6 +169,7 @@ class DataProcessInstance:
                 id=self.template_urn.get_job_id(),
                 upstream_urns=input_datajob_urns,
                 flow_urn=self.template_urn.get_data_flow_urn(),
+                platform_instance=self.data_platform_instance,
             )
             for mcp in template_object.generate_mcp():
                 self._emit_mcp(mcp, emitter, callback)
@@ -382,6 +384,7 @@ class DataProcessInstance:
             cluster=datajob.flow_urn.cluster,
             template_urn=datajob.urn,
             id=id,
+            data_platform_instance=datajob.platform_instance,
         )
         dpi._template_object = datajob

@@ -438,6 +441,7 @@ class DataProcessInstance:
             orchestrator=dataflow.orchestrator,
             cluster=cast(str, dataflow.env),
             template_urn=dataflow.urn,
+            data_platform_instance=dataflow.platform_instance,
         )
         dpi._template_object = dataflow
         return dpi
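For orientation, a minimal sketch (not part of this diff) of where the newly propagated platform instance would come from; the DataFlow constructor arguments below are assumptions based on the datahub.api.entities helpers rather than on this changeset:

    from datahub.api.entities.datajob import DataFlow
    from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance

    # Hypothetical flow that carries a platform instance; from_dataflow() should now
    # forward it into the DataProcessInstance (and the MCPs it emits).
    flow = DataFlow(orchestrator="airflow", id="daily_etl", env="PROD", platform_instance="emea")
    dpi = DataProcessInstance.from_dataflow(dataflow=flow, id="daily_etl_2024_01_01")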
datahub/api/entities/dataset/dataset.py CHANGED
@@ -483,7 +483,7 @@ class Dataset(StrictModel):
                     f"{urn_prefix}:{prop_key}"
                     if not prop_key.startswith(urn_prefix)
                     else prop_key
-                    for prop_key in field.structured_properties.keys()
+                    for prop_key in field.structured_properties
                 ]
             )
         if field.glossaryTerms:
@@ -497,7 +497,7 @@ class Dataset(StrictModel):
                     f"{urn_prefix}:{prop_key}"
                     if not prop_key.startswith(urn_prefix)
                     else prop_key
-                    for prop_key in self.structured_properties.keys()
+                    for prop_key in self.structured_properties
                 ]
             )
         if self.glossary_terms:
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -43,7 +43,7 @@ class AllowedValue(ConfigModel):


 VALID_ENTITY_TYPE_URNS = [
-    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys()
+    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES
 ]
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."

datahub/cli/ingest_cli.py CHANGED
@@ -216,9 +216,9 @@ def run(
 @click.option(
     "--executor-id",
     type=str,
-    default="default",
     help="Executor id to route execution requests to. Do not use this unless you have configured a custom executor.",
     required=False,
+    default=None,
 )
 @click.option(
     "--cli-version",
@@ -239,7 +239,7 @@ def run(
     type=str,
     help="Timezone for the schedule in 'America/New_York' format. Uses UTC by default.",
     required=False,
-    default="UTC",
+    default=None,
 )
 @click.option(
     "--debug", type=bool, help="Should we debug.", required=False, default=False
@@ -255,10 +255,10 @@ def deploy(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> None:
datahub/cli/migrate.py CHANGED
@@ -76,13 +76,13 @@ class MigrationReport:
     def __repr__(self) -> str:
         repr = f"{self._get_prefix()}Migration Report:\n--------------\n"
         repr += f"{self._get_prefix()}Migration Run Id: {self.run_id}\n"
-        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created.keys()]))}\n"
-        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected.keys()]))}\n"
-        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated.keys()]))}\n"
+        repr += f"{self._get_prefix()}Num entities created = {len(set([x[0] for x in self.entities_created]))}\n"
+        repr += f"{self._get_prefix()}Num entities affected = {len(set([x[0] for x in self.entities_affected]))}\n"
+        repr += f"{self._get_prefix()}Num entities {'kept' if self.keep else 'migrated'} = {len(set([x[0] for x in self.entities_migrated]))}\n"
         repr += f"{self._get_prefix()}Details:\n"
-        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created.keys()]) or 'None'}\n"
-        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected.keys()]) or 'None'}\n"
-        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated.keys()]) or 'None'}\n"
+        repr += f"{self._get_prefix()}New Entities Created: {set([x[0] for x in self.entities_created]) or 'None'}\n"
+        repr += f"{self._get_prefix()}External Entities Affected: {set([x[0] for x in self.entities_affected]) or 'None'}\n"
+        repr += f"{self._get_prefix()}Old Entities {'Kept' if self.keep else 'Migrated'} = {set([x[0] for x in self.entities_migrated]) or 'None'}\n"
         return repr

datahub/configuration/common.py CHANGED
@@ -317,7 +317,7 @@ class KeyValuePattern(ConfigModel):
         return KeyValuePattern()

     def value(self, string: str) -> List[str]:
-        matching_keys = [key for key in self.rules.keys() if re.match(key, string)]
+        matching_keys = [key for key in self.rules if re.match(key, string)]
         if not matching_keys:
             return []
         elif self.first_match_only:
datahub/emitter/mcp_builder.py CHANGED
@@ -137,6 +137,10 @@ class ProjectIdKey(ContainerKey):
     project_id: str


+class ExperimentKey(ContainerKey):
+    id: str
+
+
 class MetastoreKey(ContainerKey):
     metastore: str

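A hedged sketch of how the new ExperimentKey might be used; the platform name, the id value, and the as_urn() call mirror how other ContainerKey subclasses are commonly used, but this exact usage is an assumption and not shown in the diff:

    from datahub.emitter.mcp_builder import ExperimentKey

    # Placeholder values for illustration only.
    key = ExperimentKey(platform="mlflow", id="recommender-experiments")
    container_urn = key.as_urn()  # container URN derived from the key's guid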
datahub/errors.py CHANGED
@@ -31,6 +31,10 @@ class MultipleSubtypesWarning(Warning):
     pass


+class SearchFilterWarning(Warning):
+    pass
+
+
 class ExperimentalWarning(Warning):
     pass

datahub/ingestion/api/common.py CHANGED
@@ -12,6 +12,9 @@ if TYPE_CHECKING:

 T = TypeVar("T")

+if TYPE_CHECKING:
+    from datahub.ingestion.run.pipeline_config import FlagsConfig
+

 @dataclass
 class RecordEnvelope(Generic[T]):
@@ -60,6 +63,12 @@ class PipelineContext:

         self._set_dataset_urn_to_lower_if_needed()

+    @property
+    def flags(self) -> "FlagsConfig":
+        from datahub.ingestion.run.pipeline_config import FlagsConfig
+
+        return self.pipeline_config.flags if self.pipeline_config else FlagsConfig()
+
     def _set_dataset_urn_to_lower_if_needed(self) -> None:
         # TODO: Get rid of this function once lower-casing is the standard.
         if self.graph:
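A small illustrative sketch of the new flags property; it assumes PipelineContext can be constructed with just a run_id (as is common in tests) and shows the fallback to a default FlagsConfig when no pipeline config is attached:

    from datahub.ingestion.api.common import PipelineContext

    ctx = PipelineContext(run_id="demo-run")
    # With no pipeline_config attached, the property returns a default FlagsConfig.
    print(ctx.flags.set_system_metadata, ctx.flags.set_system_metadata_pipeline_name)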
datahub/ingestion/api/source.py CHANGED
@@ -39,6 +39,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
     auto_browse_path_v2,
     auto_fix_duplicate_schema_field_paths,
     auto_fix_empty_field_paths,
@@ -51,6 +52,7 @@ from datahub.ingestion.api.source_helpers import (
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import UpstreamLineageClass
+from datahub.sdk.entity import Entity
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation

@@ -474,13 +476,15 @@ class Source(Closeable, metaclass=ABCMeta):
         return stream

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        workunit_processors = self.get_workunit_processors()
+        workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
         return self._apply_workunit_processors(
-            self.get_workunit_processors(), auto_workunit(self.get_workunits_internal())
+            workunit_processors, auto_workunit(self.get_workunits_internal())
         )

     def get_workunits_internal(
         self,
-    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
         raise NotImplementedError(
             "get_workunits_internal must be implemented if get_workunits is not overriden."
         )
datahub/ingestion/api/source_helpers.py CHANGED
@@ -13,9 +13,14 @@ from typing import (
 )

 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis
+from datahub.emitter.mce_builder import (
+    get_sys_time,
+    make_dataplatform_instance_urn,
+    parse_ts_millis,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import entity_supports_aspect
+from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
@@ -35,6 +40,7 @@ from datahub.metadata.schema_classes import (
     TimeWindowSizeClass,
 )
 from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
+from datahub.sdk.entity import Entity
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.telemetry import telemetry
 from datahub.utilities.urns.error import InvalidUrnError
@@ -49,7 +55,12 @@ logger = logging.getLogger(__name__)

 def auto_workunit(
     stream: Iterable[
-        Union[MetadataChangeEventClass, MetadataChangeProposalWrapper, MetadataWorkUnit]
+        Union[
+            MetadataChangeEventClass,
+            MetadataChangeProposalWrapper,
+            MetadataWorkUnit,
+            Entity,
+        ]
     ],
 ) -> Iterable[MetadataWorkUnit]:
     """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
@@ -62,6 +73,8 @@ def auto_workunit(
         )
     elif isinstance(item, MetadataChangeProposalWrapper):
         yield item.as_workunit()
+    elif isinstance(item, Entity):
+        yield from item.as_workunits()
     else:
         yield item

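A hedged sketch of the new Entity handling in auto_workunit; the SDK Dataset import path and constructor arguments are assumptions based on the 1.0 SDK, not part of this diff:

    from datahub.ingestion.api.source_helpers import auto_workunit
    from datahub.sdk.dataset import Dataset  # an SDK Entity subclass

    ds = Dataset(platform="hive", name="db.example_table", description="example dataset")
    # Entities are expanded via Entity.as_workunits(); MCEs and MCPs pass through as before.
    workunits = list(auto_workunit([ds]))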
@@ -536,3 +549,23 @@ def _prepend_platform_instance(
         return [BrowsePathEntryClass(id=urn, urn=urn)] + entries

     return entries
+
+
+class AutoSystemMetadata:
+    def __init__(self, ctx: PipelineContext):
+        self.ctx = ctx
+
+    def stamp(self, stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
+        for wu in stream:
+            yield self.stamp_wu(wu)
+
+    def stamp_wu(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
+        if self.ctx.flags.set_system_metadata:
+            if not wu.metadata.systemMetadata:
+                wu.metadata.systemMetadata = SystemMetadataClass()
+            wu.metadata.systemMetadata.runId = self.ctx.run_id
+            if not wu.metadata.systemMetadata.lastObserved:
+                wu.metadata.systemMetadata.lastObserved = get_sys_time()
+            if self.ctx.flags.set_system_metadata_pipeline_name:
+                wu.metadata.systemMetadata.pipelineName = self.ctx.pipeline_name
+        return wu
datahub/ingestion/graph/client.py CHANGED
@@ -27,6 +27,7 @@ from pydantic import BaseModel
 from requests.models import HTTPError
 from typing_extensions import deprecated

+from datahub._codegen.aspect import _Aspect
 from datahub.cli import config_utils
 from datahub.configuration.common import ConfigModel, GraphError, OperationalError
 from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
@@ -49,6 +50,7 @@ from datahub.ingestion.graph.connections import (
 )
 from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
 from datahub.ingestion.graph.filters import (
+    RawSearchFilter,
     RawSearchFilterRule,
     RemovedStatusFilter,
     generate_filter,
@@ -75,10 +77,11 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TelemetryClientIdClass,
 )
+from datahub.metadata.urns import CorpUserUrn, Urn
 from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.str_enum import StrEnum
-from datahub.utilities.urns.urn import Urn, guess_entity_type
+from datahub.utilities.urns.urn import guess_entity_type

 if TYPE_CHECKING:
     from datahub.ingestion.sink.datahub_rest import (
@@ -116,7 +119,7 @@ def entity_type_to_graphql(entity_type: str) -> str:
     """Convert the entity types into GraphQL "EntityType" enum values."""

     # Hard-coded special cases.
-    if entity_type == "corpuser":
+    if entity_type == CorpUserUrn.ENTITY_TYPE:
         return "CORP_USER"

     # Convert camelCase to UPPER_UNDERSCORE.
@@ -133,6 +136,14 @@ def entity_type_to_graphql(entity_type: str) -> str:
     return entity_type


+def flexible_entity_type_to_graphql(entity_type: str) -> str:
+    if entity_type.upper() == entity_type:
+        # Assume that we were passed a graphql EntityType enum value,
+        # so no conversion is needed.
+        return entity_type
+    return entity_type_to_graphql(entity_type)
+
+
 class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def __init__(self, config: DatahubClientConfig) -> None:
         self.config = config
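A quick illustration (not from the diff) of what the new helper accepts; the expected outputs assume the camelCase-to-UPPER_UNDERSCORE conversion performed by entity_type_to_graphql above:

    from datahub.ingestion.graph.client import flexible_entity_type_to_graphql

    # Lowercase entity names go through the normal conversion...
    assert flexible_entity_type_to_graphql("dataset") == "DATASET"
    assert flexible_entity_type_to_graphql("corpuser") == "CORP_USER"
    # ...while values that already look like GraphQL enum names pass through unchanged.
    assert flexible_entity_type_to_graphql("CORP_USER") == "CORP_USER"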
@@ -805,7 +816,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

         :return: An iterable of (urn, schema info) tuple that match the filters.
         """
-        types = [entity_type_to_graphql("dataset")]
+        types = self._get_types(["dataset"])

         # Add the query default of * if no query is specified.
         query = query or "*"
@@ -873,10 +884,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         env: Optional[str] = None,
         query: Optional[str] = None,
         container: Optional[str] = None,
-        status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
+        status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
         extraFilters: Optional[List[RawSearchFilterRule]] = None,
-        extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
+        extra_or_filters: Optional[RawSearchFilter] = None,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.

@@ -968,7 +979,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
         extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
-        extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
+        extra_or_filters: Optional[RawSearchFilter] = None,
         extra_source_fields: Optional[List[str]] = None,
         skip_cache: bool = False,
     ) -> Iterable[dict]:
@@ -1121,7 +1132,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         )

         types = [
-            entity_type_to_graphql(entity_type) for entity_type in entity_types
+            flexible_entity_type_to_graphql(entity_type)
+            for entity_type in entity_types
         ]
         return types

@@ -1686,6 +1698,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

         return res["runAssertionsForAsset"]

+    @deprecated("Use get_entities instead which returns typed aspects")
     def get_entities_v2(
         self,
         entity_name: str,
@@ -1725,6 +1738,108 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 retval[entity_urn][aspect_key] = aspect_value
         return retval

+    def get_entities(
+        self,
+        entity_name: str,
+        urns: List[str],
+        aspects: Optional[List[str]] = None,
+        with_system_metadata: bool = False,
+    ) -> Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]]:
+        """
+        Get entities using the OpenAPI v3 endpoint, deserializing aspects into typed objects.
+
+        Args:
+            entity_name: The entity type name
+            urns: List of entity URNs to fetch
+            aspects: Optional list of aspect names to fetch. If None, all aspects will be fetched.
+            with_system_metadata: If True, return system metadata along with each aspect.
+
+        Returns:
+            A dictionary mapping URNs to a dictionary of aspect name to tuples of
+            (typed aspect object, system metadata). If with_system_metadata is False,
+            the system metadata in the tuple will be None.
+        """
+        aspects = aspects or []
+
+        request_payload = []
+        for urn in urns:
+            entity_request: Dict[str, Any] = {"urn": urn}
+            for aspect_name in aspects:
+                entity_request[aspect_name] = {}
+            request_payload.append(entity_request)
+
+        headers: Dict[str, Any] = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+
+        url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
+        if with_system_metadata:
+            url += "?systemMetadata=true"
+
+        response = self._session.post(
+            url, data=json.dumps(request_payload), headers=headers
+        )
+        response.raise_for_status()
+        entities = response.json()
+
+        result: Dict[str, Dict[str, Tuple[_Aspect, Optional[SystemMetadataClass]]]] = {}
+
+        for entity in entities:
+            entity_urn = entity.get("urn")
+            if entity_urn is None:
+                logger.warning(
+                    f"Missing URN in entity response: {entity}, skipping deserialization"
+                )
+                continue
+
+            entity_aspects: Dict[
+                str, Tuple[_Aspect, Optional[SystemMetadataClass]]
+            ] = {}
+
+            for aspect_name, aspect_obj in entity.items():
+                if aspect_name == "urn":
+                    continue
+
+                aspect_class = ASPECT_NAME_MAP.get(aspect_name)
+                if aspect_class is None:
+                    logger.warning(
+                        f"Unknown aspect type {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                aspect_value = aspect_obj.get("value")
+                if aspect_value is None:
+                    logger.warning(
+                        f"Unknown aspect value for aspect {aspect_name}, skipping deserialization"
+                    )
+                    continue
+
+                try:
+                    post_json_obj = post_json_transform(aspect_value)
+                    typed_aspect = aspect_class.from_obj(post_json_obj)
+                    assert isinstance(typed_aspect, aspect_class) and isinstance(
+                        typed_aspect, _Aspect
+                    )
+
+                    system_metadata = None
+                    if with_system_metadata:
+                        system_metadata_obj = aspect_obj.get("systemMetadata")
+                        if system_metadata_obj:
+                            system_metadata = SystemMetadataClass.from_obj(
+                                system_metadata_obj
+                            )
+
+                    entity_aspects[aspect_name] = (typed_aspect, system_metadata)
+                except Exception as e:
+                    logger.error(f"Error deserializing aspect {aspect_name}: {e}")
+                    raise
+
+            if entity_aspects:
+                result[entity_urn] = entity_aspects
+
+        return result
+
     def upsert_custom_assertion(
         self,
         urn: Optional[str],
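A hedged usage sketch of the new typed batch-get; the server URL and dataset URN are placeholders, and the aspect names are ordinary examples rather than anything mandated by this diff:

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
    entities = graph.get_entities(
        entity_name="dataset",
        urns=["urn:li:dataset:(urn:li:dataPlatform:hive,db.example_table,PROD)"],
        aspects=["status", "datasetProperties"],
        with_system_metadata=True,
    )
    for urn, aspect_map in entities.items():
        for aspect_name, (aspect, system_metadata) in aspect_map.items():
            print(urn, aspect_name, type(aspect).__name__)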
datahub/ingestion/graph/filters.py CHANGED
@@ -1,6 +1,7 @@
 import dataclasses
 import enum
-from typing import Any, Dict, List, Literal, Optional
+import warnings
+from typing import Dict, List, Literal, Optional, Union

 from typing_extensions import TypeAlias

@@ -8,9 +9,14 @@ from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
 )
+from datahub.errors import SearchFilterWarning
 from datahub.utilities.urns.urn import guess_entity_type

-RawSearchFilterRule = Dict[str, Any]
+RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
+
+# This is a list of OR filters, each of which is a list of AND filters.
+# This can be put directly into the orFilters parameter in GraphQL.
+RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]

 # Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
 FilterOperator: TypeAlias = Literal[
@@ -39,12 +45,14 @@ class SearchFilterRule:
     negated: bool = False

     def to_raw(self) -> RawSearchFilterRule:
-        return {
+        rule: RawSearchFilterRule = {
             "field": self.field,
             "condition": self.condition,
             "values": self.values,
-            "negated": self.negated,
         }
+        if self.negated:
+            rule["negated"] = True
+        return rule

     def negate(self) -> "SearchFilterRule":
         return SearchFilterRule(
@@ -73,10 +81,10 @@ def generate_filter(
     platform_instance: Optional[str],
     env: Optional[str],
     container: Optional[str],
-    status: RemovedStatusFilter,
+    status: Optional[RemovedStatusFilter],
     extra_filters: Optional[List[RawSearchFilterRule]],
-    extra_or_filters: Optional[List[RawSearchFilterRule]] = None,
-) -> List[Dict[str, List[RawSearchFilterRule]]]:
+    extra_or_filters: Optional[RawSearchFilter] = None,
+) -> RawSearchFilter:
     """
     Generate a search filter based on the provided parameters.
     :param platform: The platform to filter by.
@@ -105,15 +113,16 @@ def generate_filter(
         and_filters.append(_get_container_filter(container).to_raw())

     # Status filter.
-    status_filter = _get_status_filter(status)
-    if status_filter:
-        and_filters.append(status_filter.to_raw())
+    if status:
+        status_filter = _get_status_filter(status)
+        if status_filter:
+            and_filters.append(status_filter.to_raw())

     # Extra filters.
     if extra_filters:
         and_filters += extra_filters

-    or_filters: List[Dict[str, List[RawSearchFilterRule]]] = [{"and": and_filters}]
+    or_filters: RawSearchFilter = [{"and": and_filters}]

     # Env filter
     if env:
@@ -127,11 +136,27 @@ def generate_filter(

     # Extra OR filters are distributed across the top level and lists.
     if extra_or_filters:
-        or_filters = [
-            {"and": and_filter["and"] + [extra_or_filter]}
-            for extra_or_filter in extra_or_filters
-            for and_filter in or_filters
-        ]
+        new_or_filters: RawSearchFilter = []
+        for and_filter in or_filters:
+            for extra_or_filter in extra_or_filters:
+                if isinstance(extra_or_filter, dict) and "and" in extra_or_filter:
+                    new_or_filters.append(
+                        {"and": and_filter["and"] + extra_or_filter["and"]}
+                    )
+                else:
+                    # Hack for backwards compatibility.
+                    # We have some code that erroneously passed a List[RawSearchFilterRule]
+                    # instead of a List[Dict["and", List[RawSearchFilterRule]]].
+                    warnings.warn(
+                        "Passing a List[RawSearchFilterRule] to extra_or_filters is deprecated. "
+                        "Please pass a List[Dict[str, List[RawSearchFilterRule]]] instead.",
+                        SearchFilterWarning,
+                        stacklevel=3,
+                    )
+                    new_or_filters.append(
+                        {"and": and_filter["and"] + [extra_or_filter]}  # type: ignore
+                    )
+        or_filters = new_or_filters

     return or_filters

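For reference, a short sketch of the two shapes that generate_filter (and the graph client's extra_or_filters parameters) now distinguishes; the field names used here are illustrative assumptions:

    # New-style RawSearchFilter: a list of OR blocks, each an {"and": [...rules...]} dict.
    extra_or_filters = [
        {"and": [{"field": "platform", "condition": "EQUAL", "values": ["urn:li:dataPlatform:snowflake"]}]},
        {"and": [{"field": "origin", "condition": "EQUAL", "values": ["PROD"]}]},
    ]

    # Old shape: a bare list of rules. Still accepted for backwards compatibility,
    # but now emits a SearchFilterWarning via warnings.warn() as shown above.
    legacy_extra_or_filters = [
        {"field": "platform", "condition": "EQUAL", "values": ["urn:li:dataPlatform:snowflake"]},
    ]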
datahub/ingestion/run/pipeline.py CHANGED
@@ -39,9 +39,6 @@ from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, Logging
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
-from datahub.ingestion.transformer.system_metadata_transformer import (
-    SystemMetadataTransformer,
-)
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
@@ -286,9 +283,6 @@ class Pipeline:
             f"Transformer type:{transformer_type},{transformer_class} configured"
         )

-        # Add the system metadata transformer at the end of the list.
-        self.transformers.append(SystemMetadataTransformer(self.ctx))
-
     def _configure_reporting(self, report_to: Optional[str]) -> None:
         if self.dry_run:
             # In dry run mode, we don't want to report anything.
datahub/ingestion/source/aws/sagemaker_processors/models.py CHANGED
@@ -323,7 +323,7 @@ class ModelProcessor:
         model_training_jobs = model_training_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in data_url_matched_jobs.keys()
+                for job_urn, job_direction in data_url_matched_jobs
                 if job_direction == JobDirection.TRAINING
             }
         )
@@ -331,7 +331,7 @@ class ModelProcessor:
         model_downstream_jobs = model_downstream_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in data_url_matched_jobs.keys()
+                for job_urn, job_direction in data_url_matched_jobs
                 if job_direction == JobDirection.DOWNSTREAM
             }
         )
@@ -368,7 +368,7 @@ class ModelProcessor:
         model_training_jobs = model_training_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in name_matched_jobs.keys()
+                for job_urn, job_direction in name_matched_jobs
                 if job_direction == JobDirection.TRAINING
             }
         )
@@ -376,7 +376,7 @@ class ModelProcessor:
         model_downstream_jobs = model_downstream_jobs.union(
             {
                 job_urn
-                for job_urn, job_direction in name_matched_jobs.keys()
+                for job_urn, job_direction in name_matched_jobs
                 if job_direction == JobDirection.DOWNSTREAM
             }
         )
datahub/ingestion/source/bigquery_v2/lineage.py CHANGED
@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
             memory_footprint.total_size(lineage)
         )

-        for lineage_key in lineage.keys():
+        for lineage_key in lineage:
             # For views, we do not use the upstreams obtained by parsing audit logs
             # as they may contain indirectly referenced tables.
             if (