acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

Files changed (88)
  1. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/METADATA +2487 -2487
  2. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/RECORD +88 -84
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +731 -42
  5. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  6. datahub/cli/specific/dataset_cli.py +128 -14
  7. datahub/configuration/git.py +1 -3
  8. datahub/ingestion/glossary/classification_mixin.py +1 -1
  9. datahub/ingestion/graph/client.py +16 -12
  10. datahub/ingestion/graph/filters.py +64 -37
  11. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  12. datahub/ingestion/source/abs/config.py +2 -4
  13. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  14. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
  15. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  16. datahub/ingestion/source/csv_enricher.py +1 -1
  17. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  18. datahub/ingestion/source/file.py +5 -2
  19. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  20. datahub/ingestion/source/ge_data_profiler.py +11 -14
  21. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  22. datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
  23. datahub/ingestion/source/identity/okta.py +1 -3
  24. datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
  25. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  26. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  27. datahub/ingestion/source/looker/lookml_source.py +2 -1
  28. datahub/ingestion/source/metadata/lineage.py +2 -2
  29. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  30. datahub/ingestion/source/nifi.py +6 -3
  31. datahub/ingestion/source/openapi_parser.py +2 -2
  32. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  33. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  34. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  35. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  36. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  37. datahub/ingestion/source/preset.py +7 -4
  38. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  39. datahub/ingestion/source/redash.py +2 -1
  40. datahub/ingestion/source/s3/config.py +2 -4
  41. datahub/ingestion/source/s3/source.py +20 -41
  42. datahub/ingestion/source/salesforce.py +1 -1
  43. datahub/ingestion/source/schema_inference/object.py +1 -1
  44. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  46. datahub/ingestion/source/sql/athena.py +2 -2
  47. datahub/ingestion/source/sql/sql_common.py +2 -2
  48. datahub/ingestion/source/sql/sql_types.py +2 -2
  49. datahub/ingestion/source/sql/teradata.py +4 -2
  50. datahub/ingestion/source/sql/trino.py +2 -2
  51. datahub/ingestion/source/superset.py +218 -56
  52. datahub/ingestion/source/tableau/tableau.py +1 -5
  53. datahub/lite/duckdb_lite.py +3 -9
  54. datahub/metadata/_schema_classes.py +157 -14
  55. datahub/metadata/_urns/urn_defs.py +58 -58
  56. datahub/metadata/schema.avsc +23 -10
  57. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  58. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  59. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  60. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  61. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  62. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  63. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  64. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  65. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  66. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  67. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  68. datahub/metadata/schemas/PostKey.avsc +2 -1
  69. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  70. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  71. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  72. datahub/pydantic/__init__.py +0 -0
  73. datahub/pydantic/compat.py +58 -0
  74. datahub/sdk/__init__.py +1 -0
  75. datahub/sdk/_all_entities.py +1 -1
  76. datahub/sdk/_shared.py +88 -3
  77. datahub/sdk/container.py +7 -1
  78. datahub/sdk/dataset.py +10 -4
  79. datahub/sdk/{_entity.py → entity.py} +4 -0
  80. datahub/sdk/entity_client.py +1 -1
  81. datahub/sdk/main_client.py +7 -1
  82. datahub/sdk/resolver_client.py +17 -29
  83. datahub/sdk/search_client.py +50 -0
  84. datahub/sdk/search_filters.py +374 -0
  85. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/LICENSE +0 -0
  86. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/WHEEL +0 -0
  87. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/entry_points.txt +0 -0
  88. {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/top_level.txt +0 -0
datahub/metadata/schemas/MLModelKey.avsc CHANGED
@@ -30,7 +30,8 @@
  "structuredProperties",
  "forms",
  "testResults",
- "versionProperties"
+ "versionProperties",
+ "subTypes"
  ]
  },
  "name": "MLModelKey",
datahub/metadata/schemas/MLPrimaryKeyKey.avsc CHANGED
@@ -17,7 +17,8 @@
  "dataPlatformInstance",
  "structuredProperties",
  "forms",
- "testResults"
+ "testResults",
+ "subTypes"
  ]
  },
  "name": "MLPrimaryKeyKey",
datahub/metadata/schemas/PostKey.avsc CHANGED
@@ -5,7 +5,8 @@
  "keyForEntity": "post",
  "entityCategory": "core",
  "entityAspects": [
- "postInfo"
+ "postInfo",
+ "subTypes"
  ]
  },
  "name": "PostKey",
datahub/metadata/schemas/SchemaFieldKey.avsc CHANGED
@@ -13,7 +13,8 @@
  "schemaFieldAliases",
  "documentation",
  "testResults",
- "deprecation"
+ "deprecation",
+ "subTypes"
  ]
  },
  "name": "SchemaFieldKey",
datahub/metadata/schemas/VersionProperties.avsc CHANGED
@@ -137,6 +137,24 @@
  "name": "sortId",
  "doc": "Sort identifier that determines where a version lives in the order of the Version Set.\nWhat this looks like depends on the Version Scheme. For sort ids generated by DataHub we use an 8 character string representation."
  },
+ {
+ "type": {
+ "type": "enum",
+ "symbolDocs": {
+ "ALPHANUMERIC_GENERATED_BY_DATAHUB": "String managed by DataHub. Currently, an 8 character alphabetical string.",
+ "LEXICOGRAPHIC_STRING": "String sorted lexicographically."
+ },
+ "name": "VersioningScheme",
+ "namespace": "com.linkedin.pegasus2avro.versionset",
+ "symbols": [
+ "LEXICOGRAPHIC_STRING",
+ "ALPHANUMERIC_GENERATED_BY_DATAHUB"
+ ]
+ },
+ "name": "versioningScheme",
+ "default": "LEXICOGRAPHIC_STRING",
+ "doc": "What versioning scheme `sortId` belongs to.\nDefaults to a plain string that is lexicographically sorted."
+ },
  {
  "type": [
  "null",
datahub/metadata/schemas/VersionSetProperties.avsc CHANGED
@@ -36,9 +36,14 @@
  {
  "type": {
  "type": "enum",
+ "symbolDocs": {
+ "ALPHANUMERIC_GENERATED_BY_DATAHUB": "String managed by DataHub. Currently, an 8 character alphabetical string.",
+ "LEXICOGRAPHIC_STRING": "String sorted lexicographically."
+ },
  "name": "VersioningScheme",
  "namespace": "com.linkedin.pegasus2avro.versionset",
  "symbols": [
+ "LEXICOGRAPHIC_STRING",
  "ALPHANUMERIC_GENERATED_BY_DATAHUB"
  ]
  },
datahub/pydantic/__init__.py ADDED — File without changes (new, empty file)
datahub/pydantic/compat.py ADDED
@@ -0,0 +1,58 @@
+ import functools
+ from typing import Any, Callable, Optional, TypeVar, cast
+
+ # Define a type variable for the decorator
+ F = TypeVar("F", bound=Callable[..., Any])
+
+
+ # Check which Pydantic version is installed
+ def get_pydantic_version() -> int:
+     """Determine if Pydantic v1 or v2 is installed."""
+     try:
+         import pydantic
+
+         version = pydantic.__version__
+         return 1 if version.startswith("1.") else 2
+     except (ImportError, AttributeError):
+         # Default to v1 if we can't determine version
+         return 1
+
+
+ PYDANTIC_VERSION = get_pydantic_version()
+
+
+ # Create compatibility layer for dict-like methods
+ def compat_dict_method(v1_method: Optional[Callable] = None) -> Callable:
+     """
+     Decorator to make a dict method work with both Pydantic v1 and v2.
+
+     In v1: Uses the decorated method (typically dict)
+     In v2: Redirects to model_dump with appropriate parameter mapping
+     """
+
+     def decorator(func: F) -> F:
+         @functools.wraps(func)
+         def wrapper(self, *args, **kwargs):
+             if PYDANTIC_VERSION >= 2:
+                 # Map v1 parameters to v2 parameters
+                 # exclude -> exclude
+                 # exclude_unset -> exclude_unset
+                 # exclude_defaults -> exclude_defaults
+                 # exclude_none -> exclude_none
+                 # by_alias -> by_alias
+                 model_dump_kwargs = kwargs.copy()
+
+                 # Handle the 'exclude' parameter differently between versions
+                 exclude = kwargs.get("exclude", set())
+                 if isinstance(exclude, (set, dict)):
+                     model_dump_kwargs["exclude"] = exclude
+
+                 return self.model_dump(**model_dump_kwargs)
+             return func(self, *args, **kwargs)
+
+         return cast(F, wrapper)
+
+     # Allow use as both @compat_dict_method and @compat_dict_method()
+     if v1_method is None:
+         return decorator
+     return decorator(v1_method)
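The compat_dict_method decorator above supports both bare and parenthesized usage. A minimal usage sketch, assuming a hypothetical pydantic model (not part of this package), where a single dict() override keeps working on pydantic v1 and transparently delegates to model_dump() on v2:

from pydantic import BaseModel

from datahub.pydantic.compat import PYDANTIC_VERSION, compat_dict_method


class ExampleModel(BaseModel):  # hypothetical model, for illustration only
    name: str
    description: str = ""

    @compat_dict_method
    def dict(self, **kwargs):
        # Only runs on pydantic v1; on v2 the decorator redirects to model_dump().
        return super().dict(**kwargs)


model = ExampleModel(name="example")
print(PYDANTIC_VERSION, model.dict(exclude_none=True))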
datahub/sdk/__init__.py CHANGED
@@ -20,6 +20,7 @@ from datahub.metadata.urns import (
  from datahub.sdk.container import Container
  from datahub.sdk.dataset import Dataset
  from datahub.sdk.main_client import DataHubClient
+ from datahub.sdk.search_filters import Filter, FilterDsl

  # We want to print out the warning if people do `from datahub.sdk import X`.
  # But we don't want to print out warnings if they're doing a more direct
datahub/sdk/_all_entities.py CHANGED
@@ -1,8 +1,8 @@
  from typing import Dict, List, Type

- from datahub.sdk._entity import Entity
  from datahub.sdk.container import Container
  from datahub.sdk.dataset import Dataset
+ from datahub.sdk.entity import Entity

  # TODO: Is there a better way to declare this?
  ENTITY_CLASSES_LIST: List[Type[Entity]] = [
datahub/sdk/_shared.py CHANGED
@@ -7,6 +7,7 @@ from typing import (
      Callable,
      List,
      Optional,
+     Sequence,
      Tuple,
      Union,
  )
@@ -36,8 +37,8 @@ from datahub.metadata.urns import (
      TagUrn,
      Urn,
  )
- from datahub.sdk._entity import Entity
  from datahub.sdk._utils import add_list_unique, remove_list_unique
+ from datahub.sdk.entity import Entity
  from datahub.utilities.urns.error import InvalidUrnError

  if TYPE_CHECKING:
@@ -49,6 +50,8 @@ DatajobUrnOrStr: TypeAlias = Union[str, DataJobUrn]

  ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn]

+ _DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn()
+

  def make_time_stamp(ts: Optional[datetime]) -> Optional[models.TimeStampClass]:
      if ts is None:
@@ -438,8 +441,7 @@ class HasTerms(Entity):
      def _terms_audit_stamp(self) -> models.AuditStampClass:
          return models.AuditStampClass(
              time=0,
-             # TODO figure out what to put here
-             actor=CorpUserUrn("__ingestion").urn(),
+             actor=_DEFAULT_ACTOR_URN,
          )

      def set_terms(self, terms: TermsInputType) -> None:
@@ -493,3 +495,86 @@ class HasDomain(Entity):
      def set_domain(self, domain: DomainInputType) -> None:
          domain_urn = DomainUrn.from_string(domain)  # basically a type assertion
          self._set_aspect(models.DomainsClass(domains=[str(domain_urn)]))
+
+
+ LinkInputType: TypeAlias = Union[
+     str,
+     Tuple[str, str],  # url, description
+     models.InstitutionalMemoryMetadataClass,
+ ]
+ LinksInputType: TypeAlias = Sequence[LinkInputType]
+
+
+ class HasInstitutionalMemory(Entity):
+     __slots__ = ()
+
+     # Internally the aspect is called institutionalMemory, and so much of the code
+     # uses that name. However, the public-facing API is called "links", since
+     # that's what we call these in the UI.
+
+     def _ensure_institutional_memory(
+         self,
+     ) -> List[models.InstitutionalMemoryMetadataClass]:
+         return self._setdefault_aspect(
+             models.InstitutionalMemoryClass(elements=[])
+         ).elements
+
+     @property
+     def links(self) -> Optional[List[models.InstitutionalMemoryMetadataClass]]:
+         if institutional_memory := self._get_aspect(models.InstitutionalMemoryClass):
+             return institutional_memory.elements
+         return None
+
+     @classmethod
+     def _institutional_memory_audit_stamp(self) -> models.AuditStampClass:
+         return models.AuditStampClass(
+             time=0,
+             actor=_DEFAULT_ACTOR_URN,
+         )
+
+     @classmethod
+     def _parse_link_association_class(
+         cls, link: LinkInputType
+     ) -> models.InstitutionalMemoryMetadataClass:
+         if isinstance(link, models.InstitutionalMemoryMetadataClass):
+             return link
+         elif isinstance(link, str):
+             return models.InstitutionalMemoryMetadataClass(
+                 url=link,
+                 description=link,
+                 createStamp=cls._institutional_memory_audit_stamp(),
+             )
+         elif isinstance(link, tuple) and len(link) == 2:
+             url, description = link
+             return models.InstitutionalMemoryMetadataClass(
+                 url=url,
+                 description=description,
+                 createStamp=cls._institutional_memory_audit_stamp(),
+             )
+         else:
+             assert_never(link)
+
+     def set_links(self, links: LinksInputType) -> None:
+         self._set_aspect(
+             models.InstitutionalMemoryClass(
+                 elements=[self._parse_link_association_class(link) for link in links]
+             )
+         )
+
+     @classmethod
+     def _link_key(self, link: models.InstitutionalMemoryMetadataClass) -> str:
+         return link.url
+
+     def add_link(self, link: LinkInputType) -> None:
+         add_list_unique(
+             self._ensure_institutional_memory(),
+             self._link_key,
+             self._parse_link_association_class(link),
+         )
+
+     def remove_link(self, link: LinkInputType) -> None:
+         remove_list_unique(
+             self._ensure_institutional_memory(),
+             self._link_key,
+             self._parse_link_association_class(link),
+         )
datahub/sdk/container.py CHANGED
@@ -16,16 +16,17 @@ from datahub.metadata.urns import (
      ContainerUrn,
      Urn,
  )
- from datahub.sdk._entity import Entity, ExtraAspectsType
  from datahub.sdk._shared import (
      DomainInputType,
      HasContainer,
      HasDomain,
+     HasInstitutionalMemory,
      HasOwnership,
      HasPlatformInstance,
      HasSubtype,
      HasTags,
      HasTerms,
+     LinksInputType,
      OwnersInputType,
      ParentContainerInputType,
      TagsInputType,
@@ -33,6 +34,7 @@ from datahub.sdk._shared import (
      make_time_stamp,
      parse_time_stamp,
  )
+ from datahub.sdk.entity import Entity, ExtraAspectsType
  from datahub.utilities.sentinels import Auto, auto


@@ -41,6 +43,7 @@ class Container(
      HasSubtype,
      HasContainer,
      HasOwnership,
+     HasInstitutionalMemory,
      HasTags,
      HasTerms,
      HasDomain,
@@ -71,6 +74,7 @@
          parent_container: Auto | ParentContainerInputType | None = auto,
          subtype: Optional[str] = None,
          owners: Optional[OwnersInputType] = None,
+         links: Optional[LinksInputType] = None,
          tags: Optional[TagsInputType] = None,
          terms: Optional[TermsInputType] = None,
          domain: Optional[DomainInputType] = None,
@@ -133,6 +137,8 @@
              self.set_subtype(subtype)
          if owners is not None:
              self.set_owners(owners)
+         if links is not None:
+             self.set_links(links)
          if tags is not None:
              self.set_tags(tags)
          if terms is not None:
datahub/sdk/dataset.py CHANGED
@@ -18,17 +18,18 @@ from datahub.errors import (
  from datahub.ingestion.source.sql.sql_types import resolve_sql_type
  from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
  from datahub.sdk._attribution import is_ingestion_attribution
- from datahub.sdk._entity import Entity, ExtraAspectsType
  from datahub.sdk._shared import (
      DatasetUrnOrStr,
      DomainInputType,
      HasContainer,
      HasDomain,
+     HasInstitutionalMemory,
      HasOwnership,
      HasPlatformInstance,
      HasSubtype,
      HasTags,
      HasTerms,
+     LinksInputType,
      OwnersInputType,
      ParentContainerInputType,
      TagInputType,
@@ -39,6 +40,7 @@ from datahub.sdk._shared import (
      parse_time_stamp,
  )
  from datahub.sdk._utils import add_list_unique, remove_list_unique
+ from datahub.sdk.entity import Entity, ExtraAspectsType
  from datahub.utilities.sentinels import Unset, unset

  SchemaFieldInputType: TypeAlias = Union[
@@ -72,9 +74,9 @@ UpstreamLineageInputType: TypeAlias = Union[
  def _parse_upstream_input(
      upstream_input: UpstreamInputType,
  ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
-     if isinstance(upstream_input, models.UpstreamClass):
-         return upstream_input
-     elif isinstance(upstream_input, models.FineGrainedLineageClass):
+     if isinstance(upstream_input, models.UpstreamClass) or isinstance(
+         upstream_input, models.FineGrainedLineageClass
+     ):
          return upstream_input
      elif isinstance(upstream_input, (str, DatasetUrn)):
          return models.UpstreamClass(
@@ -422,6 +424,7 @@ class Dataset(
      HasSubtype,
      HasContainer,
      HasOwnership,
+     HasInstitutionalMemory,
      HasTags,
      HasTerms,
      HasDomain,
@@ -453,6 +456,7 @@
          parent_container: ParentContainerInputType | Unset = unset,
          subtype: Optional[str] = None,
          owners: Optional[OwnersInputType] = None,
+         links: Optional[LinksInputType] = None,
          tags: Optional[TagsInputType] = None,
          terms: Optional[TermsInputType] = None,
          # TODO structured_properties
@@ -499,6 +503,8 @@
              self.set_subtype(subtype)
          if owners is not None:
              self.set_owners(owners)
+         if links is not None:
+             self.set_links(links)
          if tags is not None:
              self.set_tags(tags)
          if terms is not None:
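Since Dataset (and Container) now mix in HasInstitutionalMemory, links can be supplied at construction time or added afterwards. A short usage sketch with illustrative platform, name, and URL values; a bare string link reuses the URL as its description, while a (url, description) tuple sets both:

from datahub.sdk import Dataset

dataset = Dataset(
    platform="snowflake",            # illustrative values
    name="prod_db.public.orders",
    links=[
        "https://wiki.example.com/orders",                       # url only
        ("https://runbooks.example.com/orders", "Ops runbook"),  # (url, description)
    ],
)
dataset.add_link(("https://dashboards.example.com/orders", "Usage dashboard"))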
datahub/sdk/{_entity.py → entity.py} RENAMED
@@ -56,6 +56,10 @@ class Entity:
      @abc.abstractmethod
      def get_urn_type(cls) -> Type[_SpecificUrn]: ...

+     @classmethod
+     def entity_type_name(cls) -> str:
+         return cls.get_urn_type().ENTITY_TYPE
+
      @property
      def urn(self) -> _SpecificUrn:
          return self._urn
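A quick illustration of the new entity_type_name() helper, assuming (as elsewhere in the SDK) that Dataset's urn class is DatasetUrn, whose ENTITY_TYPE is "dataset":

from datahub.sdk import Dataset

# entity_type_name() just surfaces the ENTITY_TYPE of the entity's urn class.
assert Dataset.entity_type_name() == "dataset"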
datahub/sdk/entity_client.py CHANGED
@@ -14,10 +14,10 @@ from datahub.metadata.urns import (
      Urn,
  )
  from datahub.sdk._all_entities import ENTITY_CLASSES
- from datahub.sdk._entity import Entity
  from datahub.sdk._shared import UrnOrStr
  from datahub.sdk.container import Container
  from datahub.sdk.dataset import Dataset
+ from datahub.sdk.entity import Entity

  if TYPE_CHECKING:
      from datahub.sdk.main_client import DataHubClient
datahub/sdk/main_client.py CHANGED
@@ -7,6 +7,7 @@ from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
  from datahub.ingestion.graph.config import DatahubClientConfig
  from datahub.sdk.entity_client import EntityClient
  from datahub.sdk.resolver_client import ResolverClient
+ from datahub.sdk.search_client import SearchClient


  class DataHubClient:
@@ -39,6 +40,8 @@

          self._graph = graph

+         # TODO: test connection
+
      @classmethod
      def from_env(cls) -> "DataHubClient":
          """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
@@ -69,5 +72,8 @@
      def resolve(self) -> ResolverClient:
          return ResolverClient(self)

-     # TODO: search client
+     @property
+     def search(self) -> SearchClient:
+         return SearchClient(self)
+
      # TODO: lineage client
datahub/sdk/resolver_client.py CHANGED
@@ -9,6 +9,7 @@ from datahub.metadata.urns import (
      DomainUrn,
      GlossaryTermUrn,
  )
+ from datahub.sdk.search_filters import Filter, FilterDsl as F

  if TYPE_CHECKING:
      from datahub.sdk.main_client import DataHubClient
@@ -38,37 +39,28 @@ class ResolverClient:
          self, *, name: Optional[str] = None, email: Optional[str] = None
      ) -> CorpUserUrn:
          filter_explanation: str
-         filters = []
+         filter: Filter
          if name is not None:
              if email is not None:
                  raise SdkUsageError("Cannot specify both name and email for auto_user")
-             # TODO: do we filter on displayName or fullName?
+             # We're filtering on both fullName and displayName. It's not clear
+             # what the right behavior is here.
              filter_explanation = f"with name {name}"
-             filters.append(
-                 {
-                     "field": "fullName",
-                     "values": [name],
-                     "condition": "EQUAL",
-                 }
+             filter = F.or_(
+                 F.custom_filter("fullName", "EQUAL", [name]),
+                 F.custom_filter("displayName", "EQUAL", [name]),
              )
          elif email is not None:
              filter_explanation = f"with email {email}"
-             filters.append(
-                 {
-                     "field": "email",
-                     "values": [email],
-                     "condition": "EQUAL",
-                 }
-             )
+             filter = F.custom_filter("email", "EQUAL", [email])
          else:
              raise SdkUsageError("Must specify either name or email for auto_user")

-         users = list(
-             self._graph.get_urns_by_filter(
-                 entity_types=[CorpUserUrn.ENTITY_TYPE],
-                 extraFilters=filters,
-             )
+         filter = F.and_(
+             F.entity_type(CorpUserUrn.ENTITY_TYPE),
+             filter,
          )
+         users = list(self._client.search.get_urns(filter=filter))
          if len(users) == 0:
              # TODO: In auto methods, should we just create the user/domain/etc if it doesn't exist?
              raise ItemNotFoundError(f"User {filter_explanation} not found")
@@ -82,15 +74,11 @@
      def term(self, *, name: str) -> GlossaryTermUrn:
          # TODO: Add some limits on the graph fetch
          terms = list(
-             self._graph.get_urns_by_filter(
-                 entity_types=[GlossaryTermUrn.ENTITY_TYPE],
-                 extraFilters=[
-                     {
-                         "field": "id",
-                         "values": [name],
-                         "condition": "EQUAL",
-                     }
-                 ],
+             self._client.search.get_urns(
+                 filter=F.and_(
+                     F.entity_type(GlossaryTermUrn.ENTITY_TYPE),
+                     F.custom_filter("name", "EQUAL", [name]),
+                 ),
              )
          )
          if len(terms) == 0:
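The resolver now builds its queries with FilterDsl and routes them through the search client instead of calling get_urns_by_filter directly; the public behavior is unchanged. A hedged usage sketch with placeholder name and email values:

from datahub.sdk import DataHubClient

client = DataHubClient.from_env()
user_urn = client.resolve.user(email="jane.doe@example.com")  # placeholder email
term_urn = client.resolve.term(name="PII")                    # placeholder term name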
datahub/sdk/search_client.py ADDED
@@ -0,0 +1,50 @@
+ from __future__ import annotations
+
+ from typing import (
+     TYPE_CHECKING,
+     Dict,
+     Iterable,
+     List,
+     Optional,
+ )
+
+ from datahub.ingestion.graph.filters import RawSearchFilterRule
+ from datahub.metadata.urns import Urn
+ from datahub.sdk.search_filters import Filter
+
+ if TYPE_CHECKING:
+     from datahub.sdk.main_client import DataHubClient
+
+
+ def compile_filters(
+     filter: Optional[Filter],
+ ) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+     # TODO: Not every filter type is supported for every entity type.
+     # If we can detect issues with the filters at compile time, we should
+     # raise an error.
+
+     if filter is None:
+         return None
+
+     initial_filters = filter.compile()
+     return [
+         {"and": [rule.to_raw() for rule in andClause["and"]]}
+         for andClause in initial_filters
+     ]
+
+
+ class SearchClient:
+     def __init__(self, client: DataHubClient):
+         self._client = client
+
+     def get_urns(
+         self,
+         query: Optional[str] = None,
+         filter: Optional[Filter] = None,
+     ) -> Iterable[Urn]:
+         # TODO: Add better limit / pagination support.
+         for urn in self._client._graph.get_urns_by_filter(
+             query=query,
+             extra_or_filters=compile_filters(filter),
+         ):
+             yield Urn.from_string(urn)
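Putting the new pieces together, a sketch of the search surface added in this release: DataHubClient.search.get_urns() with a Filter assembled via FilterDsl (the field and value passed to custom_filter are illustrative):

from datahub.metadata.urns import DatasetUrn
from datahub.sdk import DataHubClient, FilterDsl as F

client = DataHubClient.from_env()
dataset_filter = F.and_(
    F.entity_type(DatasetUrn.ENTITY_TYPE),
    F.custom_filter("name", "EQUAL", ["orders"]),  # illustrative field/value
)
# get_urns yields parsed Urn objects for every matching entity.
for urn in client.search.get_urns(filter=dataset_filter):
    print(urn)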