acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

This version of acryl-datahub has been flagged as potentially problematic.

Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/sdk/dataset.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import warnings
 from datetime import datetime
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Type, Union
 
 from typing_extensions import Self, TypeAlias, assert_never
 
@@ -13,37 +13,43 @@ from datahub.errors import (
     IngestionAttributionWarning,
     ItemNotFoundError,
     SchemaFieldKeyError,
+    SdkUsageError,
 )
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
 from datahub.sdk._attribution import is_ingestion_attribution
-from datahub.sdk._entity import Entity
 from datahub.sdk._shared import (
-    ContainerInputType,
     DatasetUrnOrStr,
     DomainInputType,
     HasContainer,
     HasDomain,
+    HasInstitutionalMemory,
     HasOwnership,
     HasPlatformInstance,
     HasSubtype,
     HasTags,
     HasTerms,
+    LinksInputType,
     OwnersInputType,
+    ParentContainerInputType,
+    TagInputType,
     TagsInputType,
+    TermInputType,
     TermsInputType,
     make_time_stamp,
     parse_time_stamp,
 )
+from datahub.sdk._utils import add_list_unique, remove_list_unique
+from datahub.sdk.entity import Entity, ExtraAspectsType
+from datahub.utilities.sentinels import Unset, unset
 
 SchemaFieldInputType: TypeAlias = Union[
-    str,
     Tuple[str, str],  # (name, type)
     Tuple[str, str, str],  # (name, type, description)
     models.SchemaFieldClass,
 ]
 SchemaFieldsInputType: TypeAlias = Union[
-    List[SchemaFieldInputType],
+    Sequence[SchemaFieldInputType],
     models.SchemaMetadataClass,
 ]
 
@@ -68,9 +74,9 @@ UpstreamLineageInputType: TypeAlias = Union[
 def _parse_upstream_input(
     upstream_input: UpstreamInputType,
 ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
-    if isinstance(upstream_input, models.UpstreamClass):
-        return upstream_input
-    elif isinstance(upstream_input, models.FineGrainedLineageClass):
+    if isinstance(
+        upstream_input, (models.UpstreamClass, models.FineGrainedLineageClass)
+    ):
         return upstream_input
     elif isinstance(upstream_input, (str, DatasetUrn)):
         return models.UpstreamClass(
@@ -271,6 +277,51 @@ class SchemaField:
                 tags=parsed_tags
             )
 
+    def add_tag(self, tag: TagInputType) -> None:
+        parsed_tag = self._parent._parse_tag_association_class(tag)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Adding field tags in ingestion mode is not yet supported. "
+                "Use set_tags instead."
+            )
+        else:
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.globalTags is None:
+                editable_field.globalTags = models.GlobalTagsClass(tags=[])
+
+            add_list_unique(
+                editable_field.globalTags.tags,
+                key=self._parent._tag_key,
+                item=parsed_tag,
+            )
+
+    def remove_tag(self, tag: TagInputType) -> None:
+        parsed_tag = self._parent._parse_tag_association_class(tag)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Removing field tags in ingestion mode is not yet supported. "
+                "Use set_tags instead."
+            )
+        else:
+            base_field = self._base_schema_field()
+            if base_field.globalTags is not None:
+                remove_list_unique(
+                    base_field.globalTags.tags,
+                    key=self._parent._tag_key,
+                    item=parsed_tag,
+                    missing_ok=True,
+                )
+
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.globalTags is not None:
+                remove_list_unique(
+                    editable_field.globalTags.tags,
+                    key=self._parent._tag_key,
+                    item=parsed_tag,
+                )
+
     @property
     def terms(self) -> Optional[List[models.GlossaryTermAssociationClass]]:
         # TODO: Basically the same implementation as tags - can we share code?
@@ -287,7 +338,7 @@ class SchemaField:
 
         return terms
 
-    def set_terms(self, terms: List[models.GlossaryTermAssociationClass]) -> None:
+    def set_terms(self, terms: TermsInputType) -> None:
        parsed_terms = [
             self._parent._parse_glossary_term_association_class(term) for term in terms
         ]
@@ -318,12 +369,62 @@ class SchemaField:
             )
         )
 
+    def add_term(self, term: TermInputType) -> None:
+        parsed_term = self._parent._parse_glossary_term_association_class(term)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Adding field terms in ingestion mode is not yet supported. "
+                "Use set_terms instead."
+            )
+        else:
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.glossaryTerms is None:
+                editable_field.glossaryTerms = models.GlossaryTermsClass(
+                    terms=[],
+                    auditStamp=self._parent._terms_audit_stamp(),
+                )
+
+            add_list_unique(
+                editable_field.glossaryTerms.terms,
+                key=self._parent._terms_key,
+                item=parsed_term,
+            )
+
+    def remove_term(self, term: TermInputType) -> None:
+        parsed_term = self._parent._parse_glossary_term_association_class(term)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Removing field terms in ingestion mode is not yet supported. "
+                "Use set_terms instead."
+            )
+        else:
+            base_field = self._base_schema_field()
+            if base_field.glossaryTerms is not None:
+                remove_list_unique(
+                    base_field.glossaryTerms.terms,
+                    key=self._parent._terms_key,
+                    item=parsed_term,
+                    missing_ok=True,
+                )
+
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.glossaryTerms is not None:
+                remove_list_unique(
+                    editable_field.glossaryTerms.terms,
+                    key=self._parent._terms_key,
+                    item=parsed_term,
+                    missing_ok=True,
+                )
+
 
 class Dataset(
     HasPlatformInstance,
     HasSubtype,
     HasContainer,
     HasOwnership,
+    HasInstitutionalMemory,
     HasTags,
     HasTerms,
     HasDomain,
@@ -352,13 +453,15 @@ class Dataset(
         created: Optional[datetime] = None,
         last_modified: Optional[datetime] = None,
         # Standard aspects.
+        parent_container: ParentContainerInputType | Unset = unset,
         subtype: Optional[str] = None,
-        container: Optional[ContainerInputType] = None,
         owners: Optional[OwnersInputType] = None,
+        links: Optional[LinksInputType] = None,
         tags: Optional[TagsInputType] = None,
         terms: Optional[TermsInputType] = None,
         # TODO structured_properties
         domain: Optional[DomainInputType] = None,
+        extra_aspects: ExtraAspectsType = None,
         # Dataset-specific aspects.
         schema: Optional[SchemaFieldsInputType] = None,
         upstreams: Optional[models.UpstreamLineageClass] = None,
@@ -370,6 +473,7 @@ class Dataset(
             env=env,
         )
         super().__init__(urn)
+        self._set_extra_aspects(extra_aspects)
 
         self._set_platform_instance(urn.platform, platform_instance)
 
@@ -393,12 +497,14 @@ class Dataset(
         if last_modified is not None:
             self.set_last_modified(last_modified)
 
+        if parent_container is not unset:
+            self._set_container(parent_container)
         if subtype is not None:
             self.set_subtype(subtype)
-        if container is not None:
-            self._set_container(container)
         if owners is not None:
             self.set_owners(owners)
+        if links is not None:
+            self.set_links(links)
         if tags is not None:
             self.set_tags(tags)
         if terms is not None:
@@ -537,14 +643,6 @@ class Dataset(
                 nativeDataType=field_type,
                 description=description,
             )
-        elif isinstance(schema_field_input, str):
-            # TODO: Not sure this branch makes sense - we should probably just require types?
-            return models.SchemaFieldClass(
-                fieldPath=schema_field_input,
-                type=models.SchemaFieldDataTypeClass(models.NullTypeClass()),
-                nativeDataType="unknown",
-                description=None,
-            )
         else:
             assert_never(schema_field_input)
 
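Taken together, the dataset.py changes replace the container= constructor argument with parent_container=, add links and extra_aspects arguments, drop bare-string schema fields from SchemaFieldInputType, and give SchemaField add/remove helpers for tags and terms. A minimal usage sketch follows; it is not part of the diff. The platform and dataset names are invented, and the dataset[...] field accessor (implied by the SchemaFieldKeyError import above) is an assumption rather than something shown in this diff.

from datahub.sdk.dataset import Dataset

dataset = Dataset(
    platform="snowflake",                       # illustrative, not from the diff
    name="analytics.public.orders",             # illustrative
    links=["https://wiki.example.com/orders"],  # new `links` argument
    schema=[
        # Bare field-name strings were removed from SchemaFieldInputType;
        # pass (name, type) or (name, type, description) tuples instead.
        ("order_id", "string", "Primary key"),
        ("amount", "number"),
    ],
)

# New per-field helpers. Outside ingestion mode they write to the editable
# schema metadata; remove_tag/remove_term also scrub the base schema field.
field = dataset["order_id"]  # assumed accessor returning a SchemaField
field.add_tag("pii")         # plain names or tag URNs assumed accepted
field.add_term("urn:li:glossaryTerm:OrderId")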
datahub/sdk/{_entity.py → entity.py} CHANGED
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import abc
-from typing import List, Optional, Type, Union
+from typing import TYPE_CHECKING, List, Optional, Type, Union
 
 from typing_extensions import Self
 
@@ -10,6 +12,12 @@ from datahub.errors import SdkUsageError
 from datahub.metadata.urns import Urn
 from datahub.utilities.urns._urn_base import _SpecificUrn
 
+if TYPE_CHECKING:
+    from datahub.ingestion.api.workunit import MetadataWorkUnit
+
+
+ExtraAspectsType = Union[None, List[AspectTypeVar]]
+
 
 class Entity:
     __slots__ = ("_urn", "_prev_aspects", "_aspects")
@@ -36,6 +44,8 @@ class Entity:
 
     def _init_from_graph(self, current_aspects: models.AspectBag) -> Self:
         self._prev_aspects = current_aspects
+
+        self._aspects = {}
         aspect: models._Aspect
         for aspect_name, aspect in (current_aspects or {}).items():  # type: ignore
             aspect_copy = type(aspect).from_obj(aspect.to_obj())
@@ -46,6 +56,10 @@ class Entity:
     @abc.abstractmethod
     def get_urn_type(cls) -> Type[_SpecificUrn]: ...
 
+    @classmethod
+    def entity_type_name(cls) -> str:
+        return cls.get_urn_type().ENTITY_TYPE
+
     @property
     def urn(self) -> _SpecificUrn:
         return self._urn
@@ -85,5 +99,14 @@ class Entity:
         )
         return mcps
 
+    def as_workunits(self) -> List[MetadataWorkUnit]:
+        return [mcp.as_workunit() for mcp in self._as_mcps()]
+
+    def _set_extra_aspects(self, extra_aspects: ExtraAspectsType) -> None:
+        # TODO: Add validation to ensure that an "extra aspect" does not conflict
+        # with / get overridden by a standard aspect.
+        for aspect in extra_aspects or []:
+            self._set_aspect(aspect)
+
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}('{self.urn}')"
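The Entity base class gains as_workunits() plus the _set_extra_aspects() hook behind the new extra_aspects constructor argument. Below is a sketch of how the two compose, using Dataset as the concrete subclass; it is not from the diff, and the values are illustrative.

import datahub.metadata.schema_classes as models
from datahub.sdk.dataset import Dataset

dataset = Dataset(
    platform="snowflake",  # illustrative
    name="analytics.public.orders",
    # Extra aspects are attached alongside the standard ones; per the TODO
    # above, conflicts with standard aspects are not yet validated.
    extra_aspects=[models.StatusClass(removed=False)],
)

# as_workunits() wraps the entity's pending MCPs for use in ingestion code.
for wu in dataset.as_workunits():
    print(wu.id)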
datahub/sdk/entity_client.py CHANGED
@@ -14,10 +14,10 @@ from datahub.metadata.urns import (
     Urn,
 )
 from datahub.sdk._all_entities import ENTITY_CLASSES
-from datahub.sdk._entity import Entity
 from datahub.sdk._shared import UrnOrStr
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
datahub/sdk/main_client.py CHANGED
@@ -7,6 +7,7 @@ from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.sdk.entity_client import EntityClient
 from datahub.sdk.resolver_client import ResolverClient
+from datahub.sdk.search_client import SearchClient
 
 
 class DataHubClient:
@@ -39,12 +40,28 @@ class DataHubClient:
 
         self._graph = graph
 
+        # TODO: test connection
+
     @classmethod
     def from_env(cls) -> "DataHubClient":
+        """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
+
+        This will first check DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN. If not present,
+        it will read credentials from ~/.datahubenv. That file can be created using
+        the `datahub init` command.
+
+        If you're looking to specify the server/token in code, use the
+        DataHubClient(server=..., token=...) constructor instead.
+
+        Returns:
+            A DataHubClient instance.
+        """
+
         # Inspired by the DockerClient.from_env() method.
         # TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
         # That file is part of the "environment", but is not a traditional "env variable".
         graph = get_default_graph()
+
         return cls(graph=graph)
 
     @property
@@ -54,3 +71,9 @@ class DataHubClient:
     @property
     def resolve(self) -> ResolverClient:
         return ResolverClient(self)
+
+    @property
+    def search(self) -> SearchClient:
+        return SearchClient(self)
+
+    # TODO: lineage client
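A short sketch of the client entry points touched here; from_env() resolves credentials as described in the new docstring, and search is the new property alongside resolve. This example is not part of the diff.

from datahub.sdk.main_client import DataHubClient

# Reads DATAHUB_GMS_URL / DATAHUB_GMS_TOKEN, falling back to ~/.datahubenv
# (written by `datahub init`).
client = DataHubClient.from_env()

resolver = client.resolve  # ResolverClient (existing)
search = client.search     # SearchClient (new in this release)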
datahub/sdk/resolver_client.py CHANGED
@@ -9,6 +9,7 @@ from datahub.metadata.urns import (
     DomainUrn,
     GlossaryTermUrn,
 )
+from datahub.sdk.search_filters import Filter, FilterDsl as F
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
@@ -38,37 +39,28 @@ class ResolverClient:
         self, *, name: Optional[str] = None, email: Optional[str] = None
     ) -> CorpUserUrn:
         filter_explanation: str
-        filters = []
+        filter: Filter
         if name is not None:
             if email is not None:
                 raise SdkUsageError("Cannot specify both name and email for auto_user")
-            # TODO: do we filter on displayName or fullName?
+            # We're filtering on both fullName and displayName. It's not clear
+            # what the right behavior is here.
             filter_explanation = f"with name {name}"
-            filters.append(
-                {
-                    "field": "fullName",
-                    "values": [name],
-                    "condition": "EQUAL",
-                }
+            filter = F.or_(
+                F.custom_filter("fullName", "EQUAL", [name]),
+                F.custom_filter("displayName", "EQUAL", [name]),
             )
         elif email is not None:
             filter_explanation = f"with email {email}"
-            filters.append(
-                {
-                    "field": "email",
-                    "values": [email],
-                    "condition": "EQUAL",
-                }
-            )
+            filter = F.custom_filter("email", "EQUAL", [email])
         else:
             raise SdkUsageError("Must specify either name or email for auto_user")
 
-        users = list(
-            self._graph.get_urns_by_filter(
-                entity_types=[CorpUserUrn.ENTITY_TYPE],
-                extraFilters=filters,
-            )
+        filter = F.and_(
+            F.entity_type(CorpUserUrn.ENTITY_TYPE),
+            filter,
         )
+        users = list(self._client.search.get_urns(filter=filter))
         if len(users) == 0:
             # TODO: In auto methods, should we just create the user/domain/etc if it doesn't exist?
             raise ItemNotFoundError(f"User {filter_explanation} not found")
@@ -82,15 +74,11 @@ class ResolverClient:
     def term(self, *, name: str) -> GlossaryTermUrn:
         # TODO: Add some limits on the graph fetch
         terms = list(
-            self._graph.get_urns_by_filter(
-                entity_types=[GlossaryTermUrn.ENTITY_TYPE],
-                extraFilters=[
-                    {
-                        "field": "id",
-                        "values": [name],
-                        "condition": "EQUAL",
-                    }
-                ],
+            self._client.search.get_urns(
+                filter=F.and_(
+                    F.entity_type(GlossaryTermUrn.ENTITY_TYPE),
+                    F.custom_filter("name", "EQUAL", [name]),
+                ),
             )
         )
         if len(terms) == 0:
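The resolver now builds queries with the new FilterDsl rather than raw filter dicts, and the same pattern is available to SDK users. This sketch mirrors auto_user's name lookup; it is not part of the diff, and the name value is illustrative.

from datahub.metadata.urns import CorpUserUrn
from datahub.sdk.main_client import DataHubClient
from datahub.sdk.search_filters import FilterDsl as F

client = DataHubClient.from_env()

name = "Jane Doe"  # illustrative
user_filter = F.and_(
    F.entity_type(CorpUserUrn.ENTITY_TYPE),
    # OR over fullName and displayName, matching the resolver's behavior.
    F.or_(
        F.custom_filter("fullName", "EQUAL", [name]),
        F.custom_filter("displayName", "EQUAL", [name]),
    ),
)
urns = list(client.search.get_urns(filter=user_filter))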
datahub/sdk/search_client.py ADDED
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+)
+
+from datahub.ingestion.graph.filters import RawSearchFilterRule
+from datahub.metadata.urns import Urn
+from datahub.sdk.search_filters import Filter
+
+if TYPE_CHECKING:
+    from datahub.sdk.main_client import DataHubClient
+
+
+def compile_filters(
+    filter: Optional[Filter],
+) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+    # TODO: Not every filter type is supported for every entity type.
+    # If we can detect issues with the filters at compile time, we should
+    # raise an error.
+
+    if filter is None:
+        return None
+
+    initial_filters = filter.compile()
+    return [
+        {"and": [rule.to_raw() for rule in andClause["and"]]}
+        for andClause in initial_filters
+    ]
+
+
+class SearchClient:
+    def __init__(self, client: DataHubClient):
+        self._client = client
+
+    def get_urns(
+        self,
+        query: Optional[str] = None,
+        filter: Optional[Filter] = None,
+    ) -> Iterable[Urn]:
+        # TODO: Add better limit / pagination support.
+        for urn in self._client._graph.get_urns_by_filter(
+            query=query,
+            extra_or_filters=compile_filters(filter),
+        ):
+            yield Urn.from_string(urn)
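For reference, compile_filters() lowers a Filter into the "OR of ANDs" structure that get_urns_by_filter() expects, and get_urns() is the public entry point. A sketch follows; it is not part of the diff, the query string and filter are illustrative, and the exact raw-rule dict shape comes from RawSearchFilterRule.to_raw().

from datahub.metadata.urns import DatasetUrn
from datahub.sdk.main_client import DataHubClient
from datahub.sdk.search_filters import FilterDsl as F

client = DataHubClient.from_env()

# Compiled form is roughly [{"and": [<raw rule>, ...]}, ...], with each raw
# rule produced by RawSearchFilterRule.to_raw().
for urn in client.search.get_urns(
    query="orders",  # optional free-text query
    filter=F.entity_type(DatasetUrn.ENTITY_TYPE),
):
    print(urn)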