acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,9 +1,9 @@
1
- acryl_datahub-1.0.0.1rc6.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1
+ acryl_datahub-1.0.0.1rc7.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
2
2
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
3
3
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
4
- datahub/_version.py,sha256=7kPZA7tlu74CuyNJP7xcQUj0ju3f89-XcI4tNSrX7u0,323
4
+ datahub/_version.py,sha256=AZj-rwp4edRcZvS9Mq4fxTeV64QHFW-6zysNAtjc2qg,323
5
5
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
6
- datahub/errors.py,sha256=bwtiNzFdVFze0IVKDEXQutkwk5j7cZkfXCUYCZIDSYg,565
6
+ datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
7
7
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  datahub/_codegen/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  datahub/_codegen/aspect.py,sha256=PJRa-Z4ouXHq3OkulfyWhwZn-fFUBDK_UPvmqaWdbWk,1063
@@ -145,8 +145,8 @@ datahub/ingestion/api/registry.py,sha256=LbdZr89465Lj7ptQRVB4vI1JR1igWABvQFj9-WX
145
145
  datahub/ingestion/api/report.py,sha256=eM_TWWz6iJNd-c_S2_4eg2qKLGYP8vSROb_TMiCwBhY,4644
146
146
  datahub/ingestion/api/report_helpers.py,sha256=WbUC1kQeaKqIagGV3XzfPmPs7slAT1mfNY4og2BH2A8,994
147
147
  datahub/ingestion/api/sink.py,sha256=nfal7nsYY1AT2WQRjqO48uAHitpjax7TsRVzYXnqbeM,4918
148
- datahub/ingestion/api/source.py,sha256=Kz8xo0IY_5O3p5WE1i5dTmSK9IU20nqo4x6fvWcMAYw,19303
149
- datahub/ingestion/api/source_helpers.py,sha256=poP6EvkLsaiPM5fhjS5bcf-JMHMdPNMLv-eXCIVMUzM,19971
148
+ datahub/ingestion/api/source.py,sha256=HrQahSEBeapMDnW8S6wSEyNLLE9RCs2R6eUrVaibuuc,19349
149
+ datahub/ingestion/api/source_helpers.py,sha256=OhgBLdpUIuqF_gl4uV8Y2csp-z97zzXeFj2I5aUypCI,20158
150
150
  datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188mmC8J8,582
151
151
  datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
152
152
  datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -171,11 +171,11 @@ datahub/ingestion/glossary/classifier.py,sha256=daLxnVv_JlfB_jBOxH5LrU_xQRndrsGo
171
171
  datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
172
172
  datahub/ingestion/glossary/datahub_classifier.py,sha256=O7wm6gQT1Jf2QSKdWjJQbS5oSzJwplXzfza26Gdq5Mg,7555
173
173
  datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
174
- datahub/ingestion/graph/client.py,sha256=791U-QMJXG3_RuNiQ4ennQ6NsOPQToSeKELHbncwzIQ,65573
174
+ datahub/ingestion/graph/client.py,sha256=Qtjf5YrQeQzcTb0qxr6-y4MSEKSJm8f0hO6BoeRA_yI,65916
175
175
  datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
176
176
  datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
177
177
  datahub/ingestion/graph/entity_versioning.py,sha256=nrcNz0Qm6kpE6oTu_mrYUQDx14KPspBTc6R9SyFUY6c,6901
178
- datahub/ingestion/graph/filters.py,sha256=VFZKmef7ay1sQ5zRDDC1M_i6T96VzIgs-FzMs5eibiQ,7347
178
+ datahub/ingestion/graph/filters.py,sha256=hZ8YOQRxC0_mbAx_SLkgqyYXr0Fw3O4U2wo2UMuDHJY,8653
179
179
  datahub/ingestion/reporting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
180
180
  datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py,sha256=iEulcZMLBQuUfe9MAYyobMekvMcNm4dqVcS_C_2KfrI,9736
181
181
  datahub/ingestion/reporting/file_reporter.py,sha256=tiWukmMxHrTQI3rOAumsq6lRlw8T6spqpS6XBDYnrZU,1640
@@ -263,7 +263,7 @@ datahub/ingestion/source/bigquery_v2/queries.py,sha256=c1BpeQP8p8y-FOhmiQkkY2IqG
263
263
  datahub/ingestion/source/bigquery_v2/queries_extractor.py,sha256=_5cAXVU8b8T_nAPDsvN2JRd2dmM1t1J1mRylfKiPen4,19530
264
264
  datahub/ingestion/source/bigquery_v2/usage.py,sha256=A9c-ofclaRk0NSnc4IRaqJYqMPv6ecCld_TPy3V2qFs,40748
265
265
  datahub/ingestion/source/cassandra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
266
- datahub/ingestion/source/cassandra/cassandra.py,sha256=eIB2qiE6UaXCDmogWHhEafoPrBu3nlHsgc_1-2udOcc,14834
266
+ datahub/ingestion/source/cassandra/cassandra.py,sha256=lKvPP0Uahi9xw_yh9cArPPtwvAauXolaEk-6f-jhpz4,14558
267
267
  datahub/ingestion/source/cassandra/cassandra_api.py,sha256=UVGQTsk6O57Q6wrWo54bQPLtStTWhw_Fq6fgW3Bjgk8,12515
268
268
  datahub/ingestion/source/cassandra/cassandra_config.py,sha256=vIMUOzazWTGi03B51vI0-YMxaMJHUGmCxJJgd8pKhC8,3791
269
269
  datahub/ingestion/source/cassandra/cassandra_profiling.py,sha256=DkSIryZNwLei5PaKuu9fNEKxEbhIrPI-T9gaVoM87NQ,11063
@@ -906,8 +906,8 @@ datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
906
906
  datahub/sdk/entity_client.py,sha256=Sxe6H6Vr_tqLJu5KW7MJfLWJ6mgh4mbsx7u7MOBpM64,5052
907
907
  datahub/sdk/main_client.py,sha256=h2MKRhR-BO0zGCMhF7z2bTncX4hagKrAYwR3wTNTtzA,3666
908
908
  datahub/sdk/resolver_client.py,sha256=nKMAZJt2tRSGfKSzoREIh43PXqjM3umLiYkYHJjo1io,3243
909
- datahub/sdk/search_client.py,sha256=h9O_rsphkTdpd5hMPay3xSXfJM761cf4PjNCBwCnFzU,1309
910
- datahub/sdk/search_filters.py,sha256=WaJKFUKT9P70NBkh36f44rZrJ7zRJpRwu8mN-rEx5y0,11364
909
+ datahub/sdk/search_client.py,sha256=BJR5t7Ff2oDNOGLcSCp9YHzrGKbgOQr7T8XQKGEpucw,3437
910
+ datahub/sdk/search_filters.py,sha256=BcMhvG5hGYAATtLPLz4WLRjKApX2oLYrrcGn-CG__ek,12901
911
911
  datahub/secret/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
912
912
  datahub/secret/datahub_secret_store.py,sha256=9u9S87-15jwhj4h0EsAVIMdQLgvstKc8voQux2slxgU,2477
913
913
  datahub/secret/datahub_secrets_client.py,sha256=nDmhziKdvseJHlaDVUcAwK8Fv8maeAaG-ktZtWG2b70,1316
@@ -996,7 +996,7 @@ datahub/utilities/sqllineage_patch.py,sha256=0Buh50bmEqJFg1HFRCknCnePo1cecI4JmGx
996
996
  datahub/utilities/stats_collections.py,sha256=CxaTcrF7J6am7iX5jPhFKne535UcyDk_oreVwR013fU,1625
997
997
  datahub/utilities/str_enum.py,sha256=EsqCLPbrqyQ2YU_wt7QP-a6P5fnpIshXJ3AI8gLBlVA,474
998
998
  datahub/utilities/tee_io.py,sha256=jBrsUfTPTk9IICntfGOG0HR-Fjp8BQMde-FPQ4r3kuI,601
999
- datahub/utilities/threaded_iterator_executor.py,sha256=WC4tvJ4TQRkH0VO_FD91GbedcKUqx0lc4tHDNOiF6ps,1770
999
+ datahub/utilities/threaded_iterator_executor.py,sha256=6BpCE0os3d-uMYxHBilPQC-JvEBkU6JQY4bGs06JKYI,2004
1000
1000
  datahub/utilities/threading_timeout.py,sha256=hOzDI55E3onXblHNwGsePJUWMXo5zqaWCnoYdL2-KPM,1316
1001
1001
  datahub/utilities/time.py,sha256=Q7S_Zyom8C2zcl2xFbjNw6K8nZsCub5XGAB4OEmIS34,1847
1002
1002
  datahub/utilities/topological_sort.py,sha256=kcK5zPSR393fgItr-KSLV3bDqfJfBRS8E5kkCpPBgUY,1358
@@ -1043,8 +1043,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1043
1043
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1044
1044
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1045
1045
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1046
- acryl_datahub-1.0.0.1rc6.dist-info/METADATA,sha256=8vR0C58PC28apLdQmFrOElrjYK-ArD0warcqjSc-fD8,176849
1047
- acryl_datahub-1.0.0.1rc6.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
1048
- acryl_datahub-1.0.0.1rc6.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
1049
- acryl_datahub-1.0.0.1rc6.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1050
- acryl_datahub-1.0.0.1rc6.dist-info/RECORD,,
1046
+ acryl_datahub-1.0.0.1rc7.dist-info/METADATA,sha256=QkeMAnAXXez9FFTnJVpxhOJMNDadF0gfVQ3uz3Fh4i8,176849
1047
+ acryl_datahub-1.0.0.1rc7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
1048
+ acryl_datahub-1.0.0.1rc7.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
1049
+ acryl_datahub-1.0.0.1rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1050
+ acryl_datahub-1.0.0.1rc7.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.0.0.1rc6"
3
+ __version__ = "1.0.0.1rc7"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
datahub/errors.py CHANGED
@@ -31,6 +31,10 @@ class MultipleSubtypesWarning(Warning):
31
31
  pass
32
32
 
33
33
 
34
+ class SearchFilterWarning(Warning):
35
+ pass
36
+
37
+
34
38
  class ExperimentalWarning(Warning):
35
39
  pass
36
40
 
@@ -51,6 +51,7 @@ from datahub.ingestion.api.source_helpers import (
51
51
  from datahub.ingestion.api.workunit import MetadataWorkUnit
52
52
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
53
53
  from datahub.metadata.schema_classes import UpstreamLineageClass
54
+ from datahub.sdk.entity import Entity
54
55
  from datahub.utilities.lossy_collections import LossyDict, LossyList
55
56
  from datahub.utilities.type_annotations import get_class_from_annotation
56
57
 
@@ -480,7 +481,7 @@ class Source(Closeable, metaclass=ABCMeta):
480
481
 
481
482
  def get_workunits_internal(
482
483
  self,
483
- ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
484
+ ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
484
485
  raise NotImplementedError(
485
486
  "get_workunits_internal must be implemented if get_workunits is not overriden."
486
487
  )
@@ -35,6 +35,7 @@ from datahub.metadata.schema_classes import (
35
35
  TimeWindowSizeClass,
36
36
  )
37
37
  from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
38
+ from datahub.sdk.entity import Entity
38
39
  from datahub.specific.dataset import DatasetPatchBuilder
39
40
  from datahub.telemetry import telemetry
40
41
  from datahub.utilities.urns.error import InvalidUrnError
@@ -49,7 +50,12 @@ logger = logging.getLogger(__name__)
49
50
 
50
51
  def auto_workunit(
51
52
  stream: Iterable[
52
- Union[MetadataChangeEventClass, MetadataChangeProposalWrapper, MetadataWorkUnit]
53
+ Union[
54
+ MetadataChangeEventClass,
55
+ MetadataChangeProposalWrapper,
56
+ MetadataWorkUnit,
57
+ Entity,
58
+ ]
53
59
  ],
54
60
  ) -> Iterable[MetadataWorkUnit]:
55
61
  """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
@@ -62,6 +68,8 @@ def auto_workunit(
62
68
  )
63
69
  elif isinstance(item, MetadataChangeProposalWrapper):
64
70
  yield item.as_workunit()
71
+ elif isinstance(item, Entity):
72
+ yield from item.as_workunits()
65
73
  else:
66
74
  yield item
67
75
 
@@ -49,6 +49,7 @@ from datahub.ingestion.graph.connections import (
49
49
  )
50
50
  from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
51
51
  from datahub.ingestion.graph.filters import (
52
+ RawSearchFilter,
52
53
  RawSearchFilterRule,
53
54
  RemovedStatusFilter,
54
55
  generate_filter,
@@ -75,10 +76,11 @@ from datahub.metadata.schema_classes import (
75
76
  SystemMetadataClass,
76
77
  TelemetryClientIdClass,
77
78
  )
79
+ from datahub.metadata.urns import CorpUserUrn, Urn
78
80
  from datahub.telemetry.telemetry import telemetry_instance
79
81
  from datahub.utilities.perf_timer import PerfTimer
80
82
  from datahub.utilities.str_enum import StrEnum
81
- from datahub.utilities.urns.urn import Urn, guess_entity_type
83
+ from datahub.utilities.urns.urn import guess_entity_type
82
84
 
83
85
  if TYPE_CHECKING:
84
86
  from datahub.ingestion.sink.datahub_rest import (
@@ -116,7 +118,7 @@ def entity_type_to_graphql(entity_type: str) -> str:
116
118
  """Convert the entity types into GraphQL "EntityType" enum values."""
117
119
 
118
120
  # Hard-coded special cases.
119
- if entity_type == "corpuser":
121
+ if entity_type == CorpUserUrn.ENTITY_TYPE:
120
122
  return "CORP_USER"
121
123
 
122
124
  # Convert camelCase to UPPER_UNDERSCORE.
@@ -133,6 +135,14 @@ def entity_type_to_graphql(entity_type: str) -> str:
133
135
  return entity_type
134
136
 
135
137
 
138
+ def flexible_entity_type_to_graphql(entity_type: str) -> str:
139
+ if entity_type.upper() == entity_type:
140
+ # Assume that we were passed a graphql EntityType enum value,
141
+ # so no conversion is needed.
142
+ return entity_type
143
+ return entity_type_to_graphql(entity_type)
144
+
145
+
136
146
  class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
137
147
  def __init__(self, config: DatahubClientConfig) -> None:
138
148
  self.config = config
@@ -805,7 +815,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
805
815
 
806
816
  :return: An iterable of (urn, schema info) tuple that match the filters.
807
817
  """
808
- types = [entity_type_to_graphql("dataset")]
818
+ types = self._get_types(["dataset"])
809
819
 
810
820
  # Add the query default of * if no query is specified.
811
821
  query = query or "*"
@@ -873,10 +883,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
873
883
  env: Optional[str] = None,
874
884
  query: Optional[str] = None,
875
885
  container: Optional[str] = None,
876
- status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
886
+ status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
877
887
  batch_size: int = 10000,
878
888
  extraFilters: Optional[List[RawSearchFilterRule]] = None,
879
- extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
889
+ extra_or_filters: Optional[RawSearchFilter] = None,
880
890
  ) -> Iterable[str]:
881
891
  """Fetch all urns that match all of the given filters.
882
892
 
@@ -968,7 +978,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
968
978
  status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
969
979
  batch_size: int = 10000,
970
980
  extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
971
- extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
981
+ extra_or_filters: Optional[RawSearchFilter] = None,
972
982
  extra_source_fields: Optional[List[str]] = None,
973
983
  skip_cache: bool = False,
974
984
  ) -> Iterable[dict]:
@@ -1121,7 +1131,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1121
1131
  )
1122
1132
 
1123
1133
  types = [
1124
- entity_type_to_graphql(entity_type) for entity_type in entity_types
1134
+ flexible_entity_type_to_graphql(entity_type)
1135
+ for entity_type in entity_types
1125
1136
  ]
1126
1137
  return types
1127
1138
 
@@ -1,6 +1,7 @@
1
1
  import dataclasses
2
2
  import enum
3
- from typing import Any, Dict, List, Literal, Optional
3
+ import warnings
4
+ from typing import Dict, List, Literal, Optional, Union
4
5
 
5
6
  from typing_extensions import TypeAlias
6
7
 
@@ -8,9 +9,14 @@ from datahub.emitter.mce_builder import (
8
9
  make_data_platform_urn,
9
10
  make_dataplatform_instance_urn,
10
11
  )
12
+ from datahub.errors import SearchFilterWarning
11
13
  from datahub.utilities.urns.urn import guess_entity_type
12
14
 
13
- RawSearchFilterRule = Dict[str, Any]
15
+ RawSearchFilterRule: TypeAlias = Dict[str, Union[str, bool, List[str]]]
16
+
17
+ # This is a list of OR filters, each of which is a list of AND filters.
18
+ # This can be put directly into the orFilters parameter in GraphQL.
19
+ RawSearchFilter: TypeAlias = List[Dict[Literal["and"], List[RawSearchFilterRule]]]
14
20
 
15
21
  # Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
16
22
  FilterOperator: TypeAlias = Literal[
@@ -39,12 +45,14 @@ class SearchFilterRule:
39
45
  negated: bool = False
40
46
 
41
47
  def to_raw(self) -> RawSearchFilterRule:
42
- return {
48
+ rule: RawSearchFilterRule = {
43
49
  "field": self.field,
44
50
  "condition": self.condition,
45
51
  "values": self.values,
46
- "negated": self.negated,
47
52
  }
53
+ if self.negated:
54
+ rule["negated"] = True
55
+ return rule
48
56
 
49
57
  def negate(self) -> "SearchFilterRule":
50
58
  return SearchFilterRule(
@@ -73,10 +81,10 @@ def generate_filter(
73
81
  platform_instance: Optional[str],
74
82
  env: Optional[str],
75
83
  container: Optional[str],
76
- status: RemovedStatusFilter,
84
+ status: Optional[RemovedStatusFilter],
77
85
  extra_filters: Optional[List[RawSearchFilterRule]],
78
- extra_or_filters: Optional[List[RawSearchFilterRule]] = None,
79
- ) -> List[Dict[str, List[RawSearchFilterRule]]]:
86
+ extra_or_filters: Optional[RawSearchFilter] = None,
87
+ ) -> RawSearchFilter:
80
88
  """
81
89
  Generate a search filter based on the provided parameters.
82
90
  :param platform: The platform to filter by.
@@ -105,15 +113,16 @@ def generate_filter(
105
113
  and_filters.append(_get_container_filter(container).to_raw())
106
114
 
107
115
  # Status filter.
108
- status_filter = _get_status_filter(status)
109
- if status_filter:
110
- and_filters.append(status_filter.to_raw())
116
+ if status:
117
+ status_filter = _get_status_filter(status)
118
+ if status_filter:
119
+ and_filters.append(status_filter.to_raw())
111
120
 
112
121
  # Extra filters.
113
122
  if extra_filters:
114
123
  and_filters += extra_filters
115
124
 
116
- or_filters: List[Dict[str, List[RawSearchFilterRule]]] = [{"and": and_filters}]
125
+ or_filters: RawSearchFilter = [{"and": and_filters}]
117
126
 
118
127
  # Env filter
119
128
  if env:
@@ -127,11 +136,27 @@ def generate_filter(
127
136
 
128
137
  # Extra OR filters are distributed across the top level and lists.
129
138
  if extra_or_filters:
130
- or_filters = [
131
- {"and": and_filter["and"] + [extra_or_filter]}
132
- for extra_or_filter in extra_or_filters
133
- for and_filter in or_filters
134
- ]
139
+ new_or_filters: RawSearchFilter = []
140
+ for and_filter in or_filters:
141
+ for extra_or_filter in extra_or_filters:
142
+ if isinstance(extra_or_filter, dict) and "and" in extra_or_filter:
143
+ new_or_filters.append(
144
+ {"and": and_filter["and"] + extra_or_filter["and"]}
145
+ )
146
+ else:
147
+ # Hack for backwards compatibility.
148
+ # We have some code that erroneously passed a List[RawSearchFilterRule]
149
+ # instead of a List[Dict["and", List[RawSearchFilterRule]]].
150
+ warnings.warn(
151
+ "Passing a List[RawSearchFilterRule] to extra_or_filters is deprecated. "
152
+ "Please pass a List[Dict[str, List[RawSearchFilterRule]]] instead.",
153
+ SearchFilterWarning,
154
+ stacklevel=3,
155
+ )
156
+ new_or_filters.append(
157
+ {"and": and_filter["and"] + [extra_or_filter]} # type: ignore
158
+ )
159
+ or_filters = new_or_filters
135
160
 
136
161
  return or_filters
137
162
 
@@ -123,16 +123,7 @@ class CassandraSource(StatefulIngestionSourceBase):
123
123
  ).workunit_processor,
124
124
  ]
125
125
 
126
- def get_workunits_internal(
127
- self,
128
- ) -> Iterable[MetadataWorkUnit]:
129
- for metadata in self._get_metadata():
130
- if isinstance(metadata, MetadataWorkUnit):
131
- yield metadata
132
- else:
133
- yield from metadata.as_workunits()
134
-
135
- def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
126
+ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
136
127
  if not self.cassandra_api.authenticate():
137
128
  return
138
129
  keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
@@ -2,36 +2,106 @@ from __future__ import annotations
2
2
 
3
3
  from typing import (
4
4
  TYPE_CHECKING,
5
- Dict,
6
5
  Iterable,
7
6
  List,
8
7
  Optional,
8
+ Tuple,
9
+ Type,
10
+ TypeVar,
9
11
  )
10
12
 
11
- from datahub.ingestion.graph.filters import RawSearchFilterRule
13
+ from datahub.ingestion.graph.filters import RawSearchFilter, RemovedStatusFilter
12
14
  from datahub.metadata.urns import Urn
13
- from datahub.sdk.search_filters import Filter
15
+ from datahub.sdk.search_filters import (
16
+ Filter,
17
+ FilterDsl,
18
+ _EntityTypeFilter,
19
+ _OrFilters,
20
+ _StatusFilter,
21
+ )
14
22
 
15
23
  if TYPE_CHECKING:
16
24
  from datahub.sdk.main_client import DataHubClient
17
25
 
18
26
 
27
+ _FilterType = TypeVar("_FilterType", bound=Filter)
28
+
29
+
30
+ def _typed_dfs(
31
+ filter: Optional[_FilterType], type: Type[_FilterType]
32
+ ) -> Optional[List[_FilterType]]:
33
+ if filter is None:
34
+ return None
35
+
36
+ found: Optional[List[_FilterType]] = None
37
+ for f in filter.dfs():
38
+ if isinstance(f, type):
39
+ if found is None:
40
+ found = []
41
+ found.append(f)
42
+ return found
43
+
44
+
19
45
  def compile_filters(
20
46
  filter: Optional[Filter],
21
- ) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
47
+ ) -> Tuple[Optional[List[str]], RawSearchFilter]:
22
48
  # TODO: Not every filter type is supported for every entity type.
23
49
  # If we can detect issues with the filters at compile time, we should
24
50
  # raise an error.
25
51
 
26
- if filter is None:
27
- return None
52
+ existing_soft_deleted_filter = _typed_dfs(filter, _StatusFilter)
53
+ if existing_soft_deleted_filter is None:
54
+ soft_deleted_filter = FilterDsl.soft_deleted(
55
+ RemovedStatusFilter.NOT_SOFT_DELETED
56
+ )
57
+ if filter is None:
58
+ filter = soft_deleted_filter
59
+ else:
60
+ filter = FilterDsl.and_(filter, soft_deleted_filter)
61
+
62
+ # This should be safe - if filter were None coming in, then we would replace it
63
+ # with the soft-deleted filter.
64
+ assert filter is not None
28
65
 
29
66
  initial_filters = filter.compile()
30
- return [
67
+
68
+ compiled_filters: RawSearchFilter = [
31
69
  {"and": [rule.to_raw() for rule in andClause["and"]]}
32
70
  for andClause in initial_filters
33
71
  ]
34
72
 
73
+ entity_types = compute_entity_types(initial_filters)
74
+
75
+ return entity_types, compiled_filters
76
+
77
+
78
+ def compute_entity_types(
79
+ filters: _OrFilters,
80
+ ) -> Optional[List[str]]:
81
+ found_filters = False
82
+ found_positive_filters = False
83
+ entity_types: List[str] = []
84
+ for ands in filters:
85
+ for clause in ands["and"]:
86
+ if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:
87
+ found_filters = True
88
+ if not clause.negated:
89
+ found_positive_filters = True
90
+
91
+ entity_types.extend(clause.values)
92
+
93
+ if not found_filters:
94
+ # If we didn't find any filters, use None so we use the default set.
95
+ return None
96
+
97
+ if not found_positive_filters:
98
+ # If we only found negated filters, then it's probably a query like
99
+ # "find me all entities except for dashboards". In that case, we
100
+ # still want to use the default set.
101
+ return None
102
+
103
+ return entity_types
104
+
35
105
 
36
106
  class SearchClient:
37
107
  def __init__(self, client: DataHubClient):
@@ -43,8 +113,11 @@ class SearchClient:
43
113
  filter: Optional[Filter] = None,
44
114
  ) -> Iterable[Urn]:
45
115
  # TODO: Add better limit / pagination support.
116
+ types, compiled_filters = compile_filters(filter)
46
117
  for urn in self._client._graph.get_urns_by_filter(
47
118
  query=query,
48
- extra_or_filters=compile_filters(filter),
119
+ status=None,
120
+ extra_or_filters=compiled_filters,
121
+ entity_types=types,
49
122
  ):
50
123
  yield Urn.from_string(urn)
@@ -3,7 +3,10 @@ from __future__ import annotations
3
3
  import abc
4
4
  from typing import (
5
5
  Any,
6
+ ClassVar,
7
+ Iterator,
6
8
  List,
9
+ Optional,
7
10
  Sequence,
8
11
  TypedDict,
9
12
  Union,
@@ -13,8 +16,13 @@ import pydantic
13
16
 
14
17
  from datahub.configuration.common import ConfigModel
15
18
  from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
16
- from datahub.ingestion.graph.client import entity_type_to_graphql
17
- from datahub.ingestion.graph.filters import FilterOperator, SearchFilterRule
19
+ from datahub.ingestion.graph.client import flexible_entity_type_to_graphql
20
+ from datahub.ingestion.graph.filters import (
21
+ FilterOperator,
22
+ RemovedStatusFilter,
23
+ SearchFilterRule,
24
+ _get_status_filter,
25
+ )
18
26
  from datahub.metadata.schema_classes import EntityTypeName
19
27
  from datahub.metadata.urns import DataPlatformUrn, DomainUrn
20
28
 
@@ -37,25 +45,28 @@ class _BaseFilter(ConfigModel):
37
45
  def compile(self) -> _OrFilters:
38
46
  pass
39
47
 
40
-
41
- def _flexible_entity_type_to_graphql(entity_type: str) -> str:
42
- if entity_type.upper() == entity_type:
43
- # Assume that we were passed a graphql EntityType enum value,
44
- # so no conversion is needed.
45
- return entity_type
46
- return entity_type_to_graphql(entity_type)
48
+ def dfs(self) -> Iterator[_BaseFilter]:
49
+ yield self
47
50
 
48
51
 
49
52
  class _EntityTypeFilter(_BaseFilter):
53
+ """Filter for specific entity types.
54
+
55
+ If no entity type filter is specified, we will search all entity types in the
56
+ default search set, mirroring the behavior of the DataHub UI.
57
+ """
58
+
59
+ ENTITY_TYPE_FIELD: ClassVar[str] = "_entityType"
60
+
50
61
  entity_type: List[str] = pydantic.Field(
51
62
  description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
52
63
  )
53
64
 
54
65
  def _build_rule(self) -> SearchFilterRule:
55
66
  return SearchFilterRule(
56
- field="_entityType",
67
+ field=self.ENTITY_TYPE_FIELD,
57
68
  condition="EQUAL",
58
- values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
69
+ values=[flexible_entity_type_to_graphql(t) for t in self.entity_type],
59
70
  )
60
71
 
61
72
  def compile(self) -> _OrFilters:
@@ -78,6 +89,26 @@ class _EntitySubtypeFilter(_BaseFilter):
78
89
  return [{"and": [self._build_rule()]}]
79
90
 
80
91
 
92
+ class _StatusFilter(_BaseFilter):
93
+ """Filter for the status of entities during search.
94
+
95
+ If not explicitly specified, the NOT_SOFT_DELETED status filter will be applied.
96
+ """
97
+
98
+ status: RemovedStatusFilter
99
+
100
+ def _build_rule(self) -> Optional[SearchFilterRule]:
101
+ return _get_status_filter(self.status)
102
+
103
+ def compile(self) -> _OrFilters:
104
+ rule = self._build_rule()
105
+ if rule:
106
+ return [{"and": [rule]}]
107
+ else:
108
+ # Our boolean algebra logic requires something here - returning [] would cause errors.
109
+ return FilterDsl.true().compile()
110
+
111
+
81
112
  class _PlatformFilter(_BaseFilter):
82
113
  platform: List[str]
83
114
  # TODO: Add validator to convert string -> list of strings
@@ -213,6 +244,11 @@ class _And(_BaseFilter):
213
244
  ]
214
245
  }
215
246
 
247
+ def dfs(self) -> Iterator[_BaseFilter]:
248
+ yield self
249
+ for filter in self.and_:
250
+ yield from filter.dfs()
251
+
216
252
 
217
253
  class _Or(_BaseFilter):
218
254
  """Represents an OR conjunction of filters."""
@@ -226,6 +262,11 @@ class _Or(_BaseFilter):
226
262
  merged_filter.extend(filter.compile())
227
263
  return merged_filter
228
264
 
265
+ def dfs(self) -> Iterator[_BaseFilter]:
266
+ yield self
267
+ for filter in self.or_:
268
+ yield from filter.dfs()
269
+
229
270
 
230
271
  class _Not(_BaseFilter):
231
272
  """Represents a NOT filter."""
@@ -256,6 +297,10 @@ class _Not(_BaseFilter):
256
297
 
257
298
  return final_filters
258
299
 
300
+ def dfs(self) -> Iterator[_BaseFilter]:
301
+ yield self
302
+ yield from self.not_.dfs()
303
+
259
304
 
260
305
  # TODO: With pydantic 2, we can use a RootModel with a
261
306
  # discriminated union to make the error messages more informative.
@@ -265,6 +310,7 @@ Filter = Union[
265
310
  _Not,
266
311
  _EntityTypeFilter,
267
312
  _EntitySubtypeFilter,
313
+ _StatusFilter,
268
314
  _PlatformFilter,
269
315
  _DomainFilter,
270
316
  _EnvFilter,
@@ -312,6 +358,18 @@ class FilterDsl:
312
358
  def not_(arg: "Filter") -> _Not:
313
359
  return _Not(not_=arg)
314
360
 
361
+ @staticmethod
362
+ def true() -> "Filter":
363
+ return _CustomCondition(
364
+ field="urn",
365
+ condition="EXISTS",
366
+ values=[],
367
+ )
368
+
369
+ @staticmethod
370
+ def false() -> "Filter":
371
+ return FilterDsl.not_(FilterDsl.true())
372
+
315
373
  @staticmethod
316
374
  def entity_type(
317
375
  entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
@@ -354,6 +412,10 @@ class FilterDsl:
354
412
  values=[f"{key}={value}"],
355
413
  )
356
414
 
415
+ @staticmethod
416
+ def soft_deleted(status: RemovedStatusFilter) -> _StatusFilter:
417
+ return _StatusFilter(status=status)
418
+
357
419
  # TODO: Add a soft-deletion status filter
358
420
  # TODO: add a container / browse path filter
359
421
  # TODO add shortcut for custom filters