acryl-datahub 1.0.0rc8__py3-none-any.whl → 1.0.0rc10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (53) hide show
  1. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/METADATA +2623 -2624
  2. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/RECORD +53 -49
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +731 -42
  5. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  6. datahub/cli/specific/dataset_cli.py +128 -14
  7. datahub/emitter/mce_builder.py +28 -13
  8. datahub/ingestion/graph/client.py +15 -11
  9. datahub/ingestion/graph/filters.py +64 -37
  10. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  11. datahub/ingestion/source/common/subtypes.py +7 -0
  12. datahub/ingestion/source/identity/okta.py +22 -0
  13. datahub/ingestion/source/metabase.py +3 -3
  14. datahub/ingestion/source/mode.py +1 -1
  15. datahub/ingestion/source/preset.py +7 -4
  16. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  17. datahub/ingestion/source/sql/mssql/source.py +10 -4
  18. datahub/ingestion/source/superset.py +158 -24
  19. datahub/metadata/_schema_classes.py +157 -14
  20. datahub/metadata/_urns/urn_defs.py +82 -58
  21. datahub/metadata/schema.avsc +23 -10
  22. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  23. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  24. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  25. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  26. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  27. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  28. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  29. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  30. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  31. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  32. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  33. datahub/metadata/schemas/PostKey.avsc +2 -1
  34. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  35. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  36. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  37. datahub/pydantic/__init__.py +0 -0
  38. datahub/pydantic/compat.py +58 -0
  39. datahub/sdk/__init__.py +1 -0
  40. datahub/sdk/_all_entities.py +1 -1
  41. datahub/sdk/_shared.py +88 -3
  42. datahub/sdk/container.py +7 -1
  43. datahub/sdk/dataset.py +7 -1
  44. datahub/sdk/{_entity.py → entity.py} +4 -0
  45. datahub/sdk/entity_client.py +1 -1
  46. datahub/sdk/main_client.py +7 -1
  47. datahub/sdk/resolver_client.py +17 -29
  48. datahub/sdk/search_client.py +50 -0
  49. datahub/sdk/search_filters.py +374 -0
  50. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/LICENSE +0 -0
  51. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/WHEEL +0 -0
  52. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/entry_points.txt +0 -0
  53. {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,374 @@
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ from typing import (
5
+ Any,
6
+ List,
7
+ Sequence,
8
+ TypedDict,
9
+ Union,
10
+ )
11
+
12
+ import pydantic
13
+
14
+ from datahub.configuration.common import ConfigModel
15
+ from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
16
+ from datahub.ingestion.graph.client import entity_type_to_graphql
17
+ from datahub.ingestion.graph.filters import SearchFilterRule
18
+ from datahub.metadata.schema_classes import EntityTypeName
19
+ from datahub.metadata.urns import DataPlatformUrn, DomainUrn
20
+
21
# A single compiled AND clause: a dict keyed by "and" whose value is the list
# of rules that must all hold. The functional TypedDict syntax is used because
# "and" is a Python keyword and cannot be written as a class attribute.
_AndSearchFilterRule = TypedDict(
    "_AndSearchFilterRule",
    {"and": List[SearchFilterRule]},
)

# A fully compiled filter: a list of AND clauses that are OR-ed together.
_OrFilters = List[_AndSearchFilterRule]
25
+
26
+
27
class _BaseFilter(ConfigModel):
    """Common base class for every filter model defined in this module.

    Each concrete filter implements compile() to turn itself into the
    OR-of-ANDs structure described by _OrFilters.
    """

    class Config:
        # We can't wrap this in a TYPE_CHECKING block because the pydantic plugin
        # doesn't recognize it properly. So unfortunately we'll need to live
        # with the deprecation warning w/ pydantic v2.
        allow_population_by_field_name = True
        if PYDANTIC_VERSION_2:
            populate_by_name = True

    @abc.abstractmethod
    def compile(self) -> _OrFilters:
        """Return this filter as a list of AND clauses to be OR-ed together."""
39
+
40
+
41
+ def _flexible_entity_type_to_graphql(entity_type: str) -> str:
42
+ if entity_type.upper() == entity_type:
43
+ # Assume that we were passed a graphql EntityType enum value,
44
+ # so no conversion is needed.
45
+ return entity_type
46
+ return entity_type_to_graphql(entity_type)
47
+
48
+
49
class _EntityTypeFilter(_BaseFilter):
    """Filter that matches entities whose type is one of the given values."""

    entity_type: List[str] = pydantic.Field(
        description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
    )

    def _build_rule(self) -> SearchFilterRule:
        """Build the single `_entityType` EQUAL rule for all configured types."""
        graphql_types = [
            _flexible_entity_type_to_graphql(name) for name in self.entity_type
        ]
        return SearchFilterRule(
            field="_entityType",
            condition="EQUAL",
            values=graphql_types,
        )

    def compile(self) -> _OrFilters:
        """Compile to one AND clause holding the entity-type rule."""
        rule = self._build_rule()
        return [{"and": [rule]}]
63
+
64
+
65
class _EntitySubtypeFilter(_BaseFilter):
    """Filter on an entity type together with one of its subtypes."""

    entity_type: str
    entity_subtype: str = pydantic.Field(
        description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
    )

    def compile(self) -> _OrFilters:
        """Compile to one AND clause: entity type rule plus `typeNames` rule."""
        type_rule = SearchFilterRule(
            field="_entityType",
            condition="EQUAL",
            values=[_flexible_entity_type_to_graphql(self.entity_type)],
        )
        subtype_rule = SearchFilterRule(
            field="typeNames",
            condition="EQUAL",
            values=[self.entity_subtype],
        )
        return [{"and": [type_rule, subtype_rule]}]
85
+
86
+
87
class _PlatformFilter(_BaseFilter):
    """Filter that matches entities belonging to any of the given platforms."""

    platform: List[str]
    # TODO: Add validator to convert string -> list of strings

    @pydantic.validator("platform", each_item=True)
    def validate_platform(cls, v: str) -> str:
        """Normalize each platform entry to its data platform URN string."""
        # Subtle - we use the constructor instead of the from_string method
        # because coercion is acceptable here.
        platform_urn = DataPlatformUrn(v)
        return str(platform_urn)

    def _build_rule(self) -> SearchFilterRule:
        """Build the `platform.keyword` EQUAL rule over the platform URNs."""
        return SearchFilterRule(
            field="platform.keyword",
            condition="EQUAL",
            values=self.platform,
        )

    def compile(self) -> _OrFilters:
        """Compile to one AND clause holding the platform rule."""
        rule = self._build_rule()
        return [{"and": [rule]}]
106
+
107
+
108
class _DomainFilter(_BaseFilter):
    """Filter that matches entities belonging to any of the given domains."""

    domain: List[str]

    @pydantic.validator("domain", each_item=True)
    def validate_domain(cls, v: str) -> str:
        """Parse each entry with DomainUrn.from_string and store its string form."""
        domain_urn = DomainUrn.from_string(v)
        return str(domain_urn)

    def _build_rule(self) -> SearchFilterRule:
        """Build the `domains` EQUAL rule over the domain URNs."""
        return SearchFilterRule(
            field="domains",
            condition="EQUAL",
            values=self.domain,
        )

    def compile(self) -> _OrFilters:
        """Compile to one AND clause holding the domain rule."""
        rule = self._build_rule()
        return [{"and": [rule]}]
124
+
125
+
126
class _EnvFilter(_BaseFilter):
    """Filter that matches entities in any of the given environments."""

    # Note that not all entity types have an env (e.g. dashboards / charts).
    # If the env filter is specified, these will be excluded.
    env: List[str]

    def compile(self) -> _OrFilters:
        """Compile to two OR-ed clauses, one per field that can hold the env."""
        # For most entity types, we look at the origin field.
        # For containers, we now have an "env" property as of
        # https://github.com/datahub-project/datahub/pull/11214
        # Prior to this, we put "env" in the customProperties. But we're
        # not bothering with that here.
        env_fields = ("origin", "env")
        return [
            {
                "and": [
                    SearchFilterRule(
                        field=field_name,
                        condition="EQUAL",
                        values=self.env,
                    ),
                ]
            }
            for field_name in env_fields
        ]
157
+
158
+
159
class _CustomCondition(_BaseFilter):
    """Represents a single field condition"""

    field: str
    condition: str
    values: List[str]

    def compile(self) -> _OrFilters:
        """Compile to one AND clause containing the user-supplied rule as-is."""
        return [
            {
                "and": [
                    SearchFilterRule(
                        field=self.field,
                        condition=self.condition,
                        values=self.values,
                    )
                ]
            }
        ]
173
+
174
+
175
class _And(_BaseFilter):
    """Represents an AND conjunction of filters"""

    and_: Sequence["Filter"] = pydantic.Field(alias="and")

    @pydantic.validator("and_", pre=False)
    def validate_and(cls, v: Sequence["Filter"]) -> Sequence["Filter"]:
        """Reject an empty conjunction.

        compile() seeds its result from the first sub-filter, so an empty
        "and" list would otherwise fail later with a bare IndexError.

        Raises:
            ValueError: if no sub-filters were provided.
        """
        if not v:
            raise ValueError('The "and" filter must contain at least one filter')
        return v

    def compile(self) -> _OrFilters:
        """Compile the conjunction into a list of OR-ed AND clauses.

        Returns:
            One AND clause for every combination that picks a single clause
            from each sub-filter's compiled output.
        """
        # The "and" operator must be implemented by doing a Cartesian product
        # of the OR clauses.
        # Example 1:
        #   (A or B) and (C or D) ->
        #   (A and C) or (A and D) or (B and C) or (B and D)
        # Example 2:
        #   (A or B) and (C or D) and (E or F) ->
        #   (A and C and E) or (A and C and F) or (A and D and E) or (A and D and F) or
        #   (B and C and E) or (B and C and F) or (B and D and E) or (B and D and F)

        # Start with the first filter's OR clauses
        result = self.and_[0].compile()

        # Fold each subsequent filter into the running product.
        for sub_filter in self.and_[1:]:
            other_clauses = sub_filter.compile()
            result = [
                # Merge the AND conditions from both clauses
                self._merge_ands(existing_clause, other_clause)
                for existing_clause in result
                for other_clause in other_clauses
            ]

        return result

    @classmethod
    def _merge_ands(
        cls, a: _AndSearchFilterRule, b: _AndSearchFilterRule
    ) -> _AndSearchFilterRule:
        """Return a new AND clause holding the rules of *a* followed by *b*."""
        return {
            "and": [
                *a["and"],
                *b["and"],
            ]
        }
221
+
222
+
223
class _Or(_BaseFilter):
    """Represents an OR conjunction of filters"""

    or_: Sequence["Filter"] = pydantic.Field(alias="or")
    # TODO: Add validator to ensure that the "or" field is not empty

    def compile(self) -> _OrFilters:
        """Concatenate the compiled OR clauses of every sub-filter, in order."""
        return [clause for sub_filter in self.or_ for clause in sub_filter.compile()]
234
+
235
+
236
class _Not(_BaseFilter):
    """Represents a NOT filter"""

    not_: "Filter" = pydantic.Field(alias="not")

    @pydantic.validator("not_", pre=False)
    def validate_not(cls, v: "Filter") -> "Filter":
        """Accept only inner filters that compile to exactly one AND clause."""
        clause_count = len(v.compile())
        if clause_count == 1:
            return v
        raise ValueError(
            "Cannot negate a filter with multiple OR clauses [not yet supported]"
        )

    def compile(self) -> _OrFilters:
        """Negate the inner filter's single AND clause rule by rule."""
        # TODO: Eventually we'll want to implement a full DNF normalizer.
        # https://en.wikipedia.org/wiki/Disjunctive_normal_form#Conversion_to_DNF

        compiled_inner = self.not_.compile()
        assert len(compiled_inner) == 1  # validated above

        # ¬(A and B) -> (¬A) OR (¬B)
        inner_rules = compiled_inner[0]["and"]
        negated: _OrFilters = [{"and": [rule.negate()]} for rule in inner_rules]
        return negated
264
+
265
+
266
# Every model that may appear where a filter is expected. _And, _Or and _Not
# refer back to this alias by name via forward references.
# TODO: With pydantic 2, we can use a RootModel with a
# discriminated union to make the error messages more informative.
Filter = Union[
    _And,
    _Or,
    _Not,
    _EntityTypeFilter,
    _EntitySubtypeFilter,
    _PlatformFilter,
    _DomainFilter,
    _EnvFilter,
    _CustomCondition,
]
279
+
280
+
281
# Required to resolve forward references to "Filter"
# The recursive models are processed in the same order under both pydantic
# major versions; only the method name differs.
for _recursive_model in (_And, _Or, _Not):
    if PYDANTIC_VERSION_2:
        _recursive_model.model_rebuild()  # type: ignore
    else:
        _recursive_model.update_forward_refs()
# Don't leave the loop variable behind in the module namespace.
del _recursive_model
290
+
291
+
292
def load_filters(obj: Any) -> Filter:
    """Validate *obj* against the Filter union and return the parsed filter.

    Uses TypeAdapter under pydantic v2 and parse_obj_as under pydantic v1.
    """
    if not PYDANTIC_VERSION_2:
        return pydantic.parse_obj_as(Filter, obj)  # type: ignore
    adapter = pydantic.TypeAdapter(Filter)  # type: ignore
    return adapter.validate_python(obj)  # type: ignore
297
+
298
+
299
# We need FilterDsl for two reasons:
# 1. To provide wrapper methods around lots of filters while avoid bloating the
#    yaml spec.
# 2. Pydantic models in general don't support positional arguments, making the
#    calls feel repetitive (e.g. Platform(platform=...)).
#    See https://github.com/pydantic/pydantic/issues/6792
#    We also considered using dataclasses / pydantic dataclasses, but
#    ultimately decided that they didn't quite suit our requirements,
#    particularly with regards to the field aliases for and/or/not.
class FilterDsl:
    """Static factory helpers that build the filter models in this module."""

    @staticmethod
    def and_(*args: "Filter") -> _And:
        """Build an AND filter over the given filters."""
        members = list(args)
        return _And(and_=members)

    @staticmethod
    def or_(*args: "Filter") -> _Or:
        """Build an OR filter over the given filters."""
        members = list(args)
        return _Or(or_=members)

    @staticmethod
    def not_(arg: "Filter") -> _Not:
        """Build a NOT filter wrapping the given filter."""
        return _Not(not_=arg)

    @staticmethod
    def entity_type(
        entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
    ) -> _EntityTypeFilter:
        """Build an entity type filter from one type name or a sequence of them."""
        if isinstance(entity_type, str):
            type_names = [entity_type]
        else:
            type_names = list(entity_type)
        return _EntityTypeFilter(entity_type=type_names)

    @staticmethod
    def entity_subtype(entity_type: str, subtype: str) -> _EntitySubtypeFilter:
        """Build a filter on an entity type plus one of its subtypes."""
        return _EntitySubtypeFilter(entity_type=entity_type, entity_subtype=subtype)

    @staticmethod
    def platform(platform: Union[str, List[str]], /) -> _PlatformFilter:
        """Build a platform filter from one platform or a list of platforms."""
        platforms = [platform] if isinstance(platform, str) else platform
        return _PlatformFilter(platform=platforms)

    # TODO: Add a platform_instance filter

    @staticmethod
    def domain(domain: Union[str, List[str]], /) -> _DomainFilter:
        """Build a domain filter from one domain or a list of domains."""
        domains = [domain] if isinstance(domain, str) else domain
        return _DomainFilter(domain=domains)

    @staticmethod
    def env(env: Union[str, List[str]], /) -> _EnvFilter:
        """Build an env filter from one env or a list of envs."""
        envs = [env] if isinstance(env, str) else env
        return _EnvFilter(env=envs)

    @staticmethod
    def has_custom_property(key: str, value: str) -> _CustomCondition:
        """Build an EQUAL condition on `customProperties` for "key=value"."""
        encoded_property = f"{key}={value}"
        return _CustomCondition(
            field="customProperties",
            condition="EQUAL",
            values=[encoded_property],
        )

    # TODO: Add a soft-deletion status filter
    # TODO: add a container / browse path filter
    # TODO add shortcut for custom filters

    @staticmethod
    def custom_filter(
        field: str, condition: str, values: List[str]
    ) -> _CustomCondition:
        """Build an arbitrary field/condition/values filter."""
        return _CustomCondition(field=field, condition=condition, values=values)