acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ from enum import Enum, auto
3
3
  from typing import Callable, Dict, Optional, Type
4
4
 
5
5
  from datahub.ingestion.api.common import PipelineContext
6
- from datahub.ingestion.api.source import ( # noqa: I250
6
+ from datahub.ingestion.api.source import (
7
7
  Source,
8
8
  SourceCapability as SourceCapability,
9
9
  )
@@ -250,6 +250,10 @@ def auto_browse_path_v2(
250
250
  emitted_urns: Set[str] = set()
251
251
  containers_used_as_parent: Set[str] = set()
252
252
  for urn, batch in _batch_workunits_by_urn(stream):
253
+ # Do not generate browse path v2 for entities that do not support it
254
+ if not entity_supports_aspect(guess_entity_type(urn), BrowsePathsV2Class):
255
+ yield from batch
256
+ continue
253
257
  container_path: Optional[List[BrowsePathEntryClass]] = None
254
258
  legacy_path: Optional[List[BrowsePathEntryClass]] = None
255
259
  browse_path_v2: Optional[List[BrowsePathEntryClass]] = None
@@ -48,12 +48,12 @@ class S3ListIterator(Iterator):
48
48
  def __next__(self) -> FileInfo:
49
49
  try:
50
50
  return next(self._file_statuses)
51
- except StopIteration:
51
+ except StopIteration as e:
52
52
  if self._token:
53
53
  self.fetch()
54
54
  return next(self._file_statuses)
55
55
  else:
56
- raise StopIteration()
56
+ raise e
57
57
 
58
58
  def fetch(self):
59
59
  params = dict(Bucket=self._bucket, Prefix=self._prefix, MaxKeys=self._max_keys)
@@ -279,11 +279,7 @@ class ClassificationHandler:
279
279
  "Dataset_Name": dataset_name,
280
280
  }
281
281
  ),
282
- values=(
283
- sample_data[schema_field.fieldPath]
284
- if schema_field.fieldPath in sample_data.keys()
285
- else []
286
- ),
282
+ values=sample_data.get(schema_field.fieldPath, []),
287
283
  )
288
284
  )
289
285
 
@@ -16,6 +16,7 @@ from typing import (
16
16
  List,
17
17
  Literal,
18
18
  Optional,
19
+ Sequence,
19
20
  Tuple,
20
21
  Type,
21
22
  Union,
@@ -31,9 +32,15 @@ from datahub.configuration.common import ConfigModel, GraphError, OperationalErr
31
32
  from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
32
33
  from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect
33
34
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
34
- from datahub.emitter.rest_emitter import DatahubRestEmitter
35
+ from datahub.emitter.rest_emitter import (
36
+ DEFAULT_REST_SINK_ENDPOINT,
37
+ DEFAULT_REST_TRACE_MODE,
38
+ DatahubRestEmitter,
39
+ RestSinkEndpoint,
40
+ RestTraceMode,
41
+ )
35
42
  from datahub.emitter.serialization_helper import post_json_transform
36
- from datahub.ingestion.graph.config import ( # noqa: I250; TODO: Remove this alias
43
+ from datahub.ingestion.graph.config import (
37
44
  DatahubClientConfig as DatahubClientConfig,
38
45
  )
39
46
  from datahub.ingestion.graph.connections import (
@@ -42,8 +49,8 @@ from datahub.ingestion.graph.connections import (
42
49
  )
43
50
  from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
44
51
  from datahub.ingestion.graph.filters import (
52
+ RawSearchFilterRule,
45
53
  RemovedStatusFilter,
46
- SearchFilterRule,
47
54
  generate_filter,
48
55
  )
49
56
  from datahub.ingestion.source.state.checkpoint import Checkpoint
@@ -105,7 +112,7 @@ class RelatedEntity:
105
112
  via: Optional[str] = None
106
113
 
107
114
 
108
- def _graphql_entity_type(entity_type: str) -> str:
115
+ def entity_type_to_graphql(entity_type: str) -> str:
109
116
  """Convert the entity types into GraphQL "EntityType" enum values."""
110
117
 
111
118
  # Hard-coded special cases.
@@ -140,6 +147,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
140
147
  ca_certificate_path=self.config.ca_certificate_path,
141
148
  client_certificate_path=self.config.client_certificate_path,
142
149
  disable_ssl_verification=self.config.disable_ssl_verification,
150
+ openapi_ingestion=DEFAULT_REST_SINK_ENDPOINT == RestSinkEndpoint.OPENAPI,
151
+ default_trace_mode=DEFAULT_REST_TRACE_MODE == RestTraceMode.ENABLED,
143
152
  )
144
153
 
145
154
  self.server_id = _MISSING_SERVER_ID
@@ -330,7 +339,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
330
339
  aspect_type_name: Optional[str] = None,
331
340
  version: int = 0,
332
341
  ) -> Optional[Aspect]:
333
- assert aspect_type.ASPECT_NAME == aspect
342
+ assert aspect == aspect_type.ASPECT_NAME
334
343
  return self.get_aspect(
335
344
  entity_urn=entity_urn,
336
345
  aspect_type=aspect_type,
@@ -781,9 +790,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
781
790
  results: Dict = self._post_generic(url, search_body)
782
791
  num_entities = results["value"]["numEntities"]
783
792
  logger.debug(f"Matched {num_entities} containers")
784
- entities_yielded: int = 0
785
793
  for x in results["value"]["entities"]:
786
- entities_yielded += 1
787
794
  logger.debug(f"yielding {x['entity']}")
788
795
  yield x["entity"]
789
796
 
@@ -797,13 +804,13 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
797
804
  container: Optional[str] = None,
798
805
  status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
799
806
  batch_size: int = 100,
800
- extraFilters: Optional[List[SearchFilterRule]] = None,
807
+ extraFilters: Optional[List[RawSearchFilterRule]] = None,
801
808
  ) -> Iterable[Tuple[str, "GraphQLSchemaMetadata"]]:
802
809
  """Fetch schema info for datasets that match all of the given filters.
803
810
 
804
811
  :return: An iterable of (urn, schema info) tuple that match the filters.
805
812
  """
806
- types = [_graphql_entity_type("dataset")]
813
+ types = [entity_type_to_graphql("dataset")]
807
814
 
808
815
  # Add the query default of * if no query is specified.
809
816
  query = query or "*"
@@ -865,7 +872,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
865
872
  def get_urns_by_filter(
866
873
  self,
867
874
  *,
868
- entity_types: Optional[List[str]] = None,
875
+ entity_types: Optional[Sequence[str]] = None,
869
876
  platform: Optional[str] = None,
870
877
  platform_instance: Optional[str] = None,
871
878
  env: Optional[str] = None,
@@ -873,8 +880,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
873
880
  container: Optional[str] = None,
874
881
  status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
875
882
  batch_size: int = 10000,
876
- extraFilters: Optional[List[SearchFilterRule]] = None,
877
- extra_or_filters: Optional[List[Dict[str, List[SearchFilterRule]]]] = None,
883
+ extraFilters: Optional[List[RawSearchFilterRule]] = None,
884
+ extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
878
885
  ) -> Iterable[str]:
879
886
  """Fetch all urns that match all of the given filters.
880
887
 
@@ -965,8 +972,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
965
972
  container: Optional[str] = None,
966
973
  status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
967
974
  batch_size: int = 10000,
968
- extra_and_filters: Optional[List[SearchFilterRule]] = None,
969
- extra_or_filters: Optional[List[Dict[str, List[SearchFilterRule]]]] = None,
975
+ extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
976
+ extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
970
977
  extra_source_fields: Optional[List[str]] = None,
971
978
  skip_cache: bool = False,
972
979
  ) -> Iterable[dict]:
@@ -1109,7 +1116,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1109
1116
  f"Scrolling to next scrollAcrossEntities page: {scroll_id}"
1110
1117
  )
1111
1118
 
1112
- def _get_types(self, entity_types: Optional[List[str]]) -> Optional[List[str]]:
1119
+ @classmethod
1120
+ def _get_types(cls, entity_types: Optional[Sequence[str]]) -> Optional[List[str]]:
1113
1121
  types: Optional[List[str]] = None
1114
1122
  if entity_types is not None:
1115
1123
  if not entity_types:
@@ -1117,7 +1125,9 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1117
1125
  "entity_types cannot be an empty list; use None for all entities"
1118
1126
  )
1119
1127
 
1120
- types = [_graphql_entity_type(entity_type) for entity_type in entity_types]
1128
+ types = [
1129
+ entity_type_to_graphql(entity_type) for entity_type in entity_types
1130
+ ]
1121
1131
  return types
1122
1132
 
1123
1133
  def get_latest_pipeline_checkpoint(
@@ -1547,7 +1557,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1547
1557
  return fragment
1548
1558
 
1549
1559
  def _run_assertion_build_params(
1550
- self, params: Optional[Dict[str, str]] = {}
1560
+ self, params: Optional[Dict[str, str]] = None
1551
1561
  ) -> List[Any]:
1552
1562
  if params is None:
1553
1563
  return []
@@ -1566,9 +1576,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1566
1576
  self,
1567
1577
  urn: str,
1568
1578
  save_result: bool = True,
1569
- parameters: Optional[Dict[str, str]] = {},
1579
+ parameters: Optional[Dict[str, str]] = None,
1570
1580
  async_flag: bool = False,
1571
1581
  ) -> Dict:
1582
+ if parameters is None:
1583
+ parameters = {}
1572
1584
  params = self._run_assertion_build_params(parameters)
1573
1585
  graph_query: str = """
1574
1586
  %s
@@ -1597,9 +1609,11 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1597
1609
  self,
1598
1610
  urns: List[str],
1599
1611
  save_result: bool = True,
1600
- parameters: Optional[Dict[str, str]] = {},
1612
+ parameters: Optional[Dict[str, str]] = None,
1601
1613
  async_flag: bool = False,
1602
1614
  ) -> Dict:
1615
+ if parameters is None:
1616
+ parameters = {}
1603
1617
  params = self._run_assertion_build_params(parameters)
1604
1618
  graph_query: str = """
1605
1619
  %s
@@ -1636,10 +1650,14 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1636
1650
  def run_assertions_for_asset(
1637
1651
  self,
1638
1652
  urn: str,
1639
- tag_urns: Optional[List[str]] = [],
1640
- parameters: Optional[Dict[str, str]] = {},
1653
+ tag_urns: Optional[List[str]] = None,
1654
+ parameters: Optional[Dict[str, str]] = None,
1641
1655
  async_flag: bool = False,
1642
1656
  ) -> Dict:
1657
+ if tag_urns is None:
1658
+ tag_urns = []
1659
+ if parameters is None:
1660
+ parameters = {}
1643
1661
  params = self._run_assertion_build_params(parameters)
1644
1662
  graph_query: str = """
1645
1663
  %s
@@ -1677,9 +1695,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1677
1695
  self,
1678
1696
  entity_name: str,
1679
1697
  urns: List[str],
1680
- aspects: List[str] = [],
1698
+ aspects: Optional[List[str]] = None,
1681
1699
  with_system_metadata: bool = False,
1682
1700
  ) -> Dict[str, Any]:
1701
+ aspects = aspects or []
1683
1702
  payload = {
1684
1703
  "urns": urns,
1685
1704
  "aspectNames": aspects,
@@ -93,7 +93,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
93
93
  try:
94
94
  return response["linkAssetVersion"]["urn"]
95
95
  except KeyError:
96
- raise ValueError(f"Unexpected response: {response}")
96
+ raise ValueError(f"Unexpected response: {response}") from None
97
97
 
98
98
  def link_asset_to_versioned_asset(
99
99
  self,
@@ -165,7 +165,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
165
165
  try:
166
166
  return response["unlinkAssetVersion"]["urn"]
167
167
  except KeyError:
168
- raise ValueError(f"Unexpected response: {response}")
168
+ raise ValueError(f"Unexpected response: {response}") from None
169
169
 
170
170
  def unlink_latest_asset_from_version_set(
171
171
  self, version_set_urn: str
@@ -198,4 +198,4 @@ class EntityVersioningAPI(DataHubGraphProtocol):
198
198
  try:
199
199
  return response["unlinkAssetVersion"]["urn"]
200
200
  except KeyError:
201
- raise ValueError(f"Unexpected response: {response}")
201
+ raise ValueError(f"Unexpected response: {response}") from None
@@ -1,3 +1,4 @@
1
+ import dataclasses
1
2
  import enum
2
3
  from typing import Any, Dict, List, Optional
3
4
 
@@ -7,7 +8,31 @@ from datahub.emitter.mce_builder import (
7
8
  )
8
9
  from datahub.utilities.urns.urn import guess_entity_type
9
10
 
10
- SearchFilterRule = Dict[str, Any]
11
+ RawSearchFilterRule = Dict[str, Any]
12
+
13
+
14
+ @dataclasses.dataclass
15
+ class SearchFilterRule:
16
+ field: str
17
+ condition: str # TODO: convert to an enum
18
+ values: List[str]
19
+ negated: bool = False
20
+
21
+ def to_raw(self) -> RawSearchFilterRule:
22
+ return {
23
+ "field": self.field,
24
+ "condition": self.condition,
25
+ "values": self.values,
26
+ "negated": self.negated,
27
+ }
28
+
29
+ def negate(self) -> "SearchFilterRule":
30
+ return SearchFilterRule(
31
+ field=self.field,
32
+ condition=self.condition,
33
+ values=self.values,
34
+ negated=not self.negated,
35
+ )
11
36
 
12
37
 
13
38
  class RemovedStatusFilter(enum.Enum):
@@ -29,9 +54,9 @@ def generate_filter(
29
54
  env: Optional[str],
30
55
  container: Optional[str],
31
56
  status: RemovedStatusFilter,
32
- extra_filters: Optional[List[SearchFilterRule]],
33
- extra_or_filters: Optional[List[SearchFilterRule]] = None,
34
- ) -> List[Dict[str, List[SearchFilterRule]]]:
57
+ extra_filters: Optional[List[RawSearchFilterRule]],
58
+ extra_or_filters: Optional[List[RawSearchFilterRule]] = None,
59
+ ) -> List[Dict[str, List[RawSearchFilterRule]]]:
35
60
  """
36
61
  Generate a search filter based on the provided parameters.
37
62
  :param platform: The platform to filter by.
@@ -43,30 +68,32 @@ def generate_filter(
43
68
  :param extra_or_filters: Extra OR filters to apply. These are combined with
44
69
  the AND filters using an OR at the top level.
45
70
  """
46
- and_filters: List[SearchFilterRule] = []
71
+ and_filters: List[RawSearchFilterRule] = []
47
72
 
48
73
  # Platform filter.
49
74
  if platform:
50
- and_filters.append(_get_platform_filter(platform))
75
+ and_filters.append(_get_platform_filter(platform).to_raw())
51
76
 
52
77
  # Platform instance filter.
53
78
  if platform_instance:
54
- and_filters.append(_get_platform_instance_filter(platform, platform_instance))
79
+ and_filters.append(
80
+ _get_platform_instance_filter(platform, platform_instance).to_raw()
81
+ )
55
82
 
56
83
  # Browse path v2 filter.
57
84
  if container:
58
- and_filters.append(_get_container_filter(container))
85
+ and_filters.append(_get_container_filter(container).to_raw())
59
86
 
60
87
  # Status filter.
61
88
  status_filter = _get_status_filter(status)
62
89
  if status_filter:
63
- and_filters.append(status_filter)
90
+ and_filters.append(status_filter.to_raw())
64
91
 
65
92
  # Extra filters.
66
93
  if extra_filters:
67
94
  and_filters += extra_filters
68
95
 
69
- or_filters: List[Dict[str, List[SearchFilterRule]]] = [{"and": and_filters}]
96
+ or_filters: List[Dict[str, List[RawSearchFilterRule]]] = [{"and": and_filters}]
70
97
 
71
98
  # Env filter
72
99
  if env:
@@ -89,7 +116,7 @@ def generate_filter(
89
116
  return or_filters
90
117
 
91
118
 
92
- def _get_env_filters(env: str) -> List[SearchFilterRule]:
119
+ def _get_env_filters(env: str) -> List[RawSearchFilterRule]:
93
120
  # The env filter is a bit more tricky since it's not always stored
94
121
  # in the same place in ElasticSearch.
95
122
  return [
@@ -125,19 +152,19 @@ def _get_status_filter(status: RemovedStatusFilter) -> Optional[SearchFilterRule
125
152
  # removed field is simply not present in the ElasticSearch document. Ideally this
126
153
  # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to
127
154
  # use a negated filter.
128
- return {
129
- "field": "removed",
130
- "values": ["true"],
131
- "condition": "EQUAL",
132
- "negated": True,
133
- }
155
+ return SearchFilterRule(
156
+ field="removed",
157
+ values=["true"],
158
+ condition="EQUAL",
159
+ negated=True,
160
+ )
134
161
 
135
162
  elif status == RemovedStatusFilter.ONLY_SOFT_DELETED:
136
- return {
137
- "field": "removed",
138
- "values": ["true"],
139
- "condition": "EQUAL",
140
- }
163
+ return SearchFilterRule(
164
+ field="removed",
165
+ values=["true"],
166
+ condition="EQUAL",
167
+ )
141
168
 
142
169
  elif status == RemovedStatusFilter.ALL:
143
170
  # We don't need to add a filter for this case.
@@ -152,11 +179,11 @@ def _get_container_filter(container: str) -> SearchFilterRule:
152
179
  if guess_entity_type(container) != "container":
153
180
  raise ValueError(f"Invalid container urn: {container}")
154
181
 
155
- return {
156
- "field": "browsePathV2",
157
- "values": [container],
158
- "condition": "CONTAIN",
159
- }
182
+ return SearchFilterRule(
183
+ field="browsePathV2",
184
+ values=[container],
185
+ condition="CONTAIN",
186
+ )
160
187
 
161
188
 
162
189
  def _get_platform_instance_filter(
@@ -171,16 +198,16 @@ def _get_platform_instance_filter(
171
198
  if guess_entity_type(platform_instance) != "dataPlatformInstance":
172
199
  raise ValueError(f"Invalid data platform instance urn: {platform_instance}")
173
200
 
174
- return {
175
- "field": "platformInstance",
176
- "values": [platform_instance],
177
- "condition": "EQUAL",
178
- }
201
+ return SearchFilterRule(
202
+ field="platformInstance",
203
+ condition="EQUAL",
204
+ values=[platform_instance],
205
+ )
179
206
 
180
207
 
181
208
  def _get_platform_filter(platform: str) -> SearchFilterRule:
182
- return {
183
- "field": "platform.keyword",
184
- "values": [make_data_platform_urn(platform)],
185
- "condition": "EQUAL",
186
- }
209
+ return SearchFilterRule(
210
+ field="platform.keyword",
211
+ condition="EQUAL",
212
+ values=[make_data_platform_urn(platform)],
213
+ )
@@ -163,12 +163,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
163
163
  key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
164
164
  for key, value in obj.items()
165
165
  }
166
- elif isinstance(obj, list):
167
- return [
168
- DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
169
- for element in obj
170
- ]
171
- elif isinstance(obj, set):
166
+ elif isinstance(obj, (list, set)):
172
167
  return [
173
168
  DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
174
169
  for element in obj