acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +3 -5
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +3 -3
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +6 -12
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +7 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +251 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +29 -5
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +20 -13
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/sdk/dataset.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import warnings
 from datetime import datetime
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Type, Union
 
 from typing_extensions import Self, TypeAlias, assert_never
 
@@ -13,37 +13,43 @@ from datahub.errors import (
     IngestionAttributionWarning,
     ItemNotFoundError,
     SchemaFieldKeyError,
+    SdkUsageError,
 )
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn
 from datahub.sdk._attribution import is_ingestion_attribution
-from datahub.sdk._entity import Entity
 from datahub.sdk._shared import (
-    ContainerInputType,
     DatasetUrnOrStr,
     DomainInputType,
     HasContainer,
     HasDomain,
+    HasInstitutionalMemory,
     HasOwnership,
     HasPlatformInstance,
     HasSubtype,
     HasTags,
     HasTerms,
+    LinksInputType,
     OwnersInputType,
+    ParentContainerInputType,
+    TagInputType,
     TagsInputType,
+    TermInputType,
     TermsInputType,
     make_time_stamp,
     parse_time_stamp,
 )
+from datahub.sdk._utils import add_list_unique, remove_list_unique
+from datahub.sdk.entity import Entity, ExtraAspectsType
+from datahub.utilities.sentinels import Unset, unset
 
 SchemaFieldInputType: TypeAlias = Union[
-    str,
     Tuple[str, str],  # (name, type)
     Tuple[str, str, str],  # (name, type, description)
     models.SchemaFieldClass,
 ]
 SchemaFieldsInputType: TypeAlias = Union[
-    List[SchemaFieldInputType],
+    Sequence[SchemaFieldInputType],
     models.SchemaMetadataClass,
 ]
 
@@ -68,9 +74,9 @@ UpstreamLineageInputType: TypeAlias = Union[
 def _parse_upstream_input(
     upstream_input: UpstreamInputType,
 ) -> Union[models.UpstreamClass, models.FineGrainedLineageClass]:
-    if isinstance(
-        …
-        …
+    if isinstance(
+        upstream_input, (models.UpstreamClass, models.FineGrainedLineageClass)
+    ):
         return upstream_input
     elif isinstance(upstream_input, (str, DatasetUrn)):
         return models.UpstreamClass(
@@ -271,6 +277,51 @@ class SchemaField:
             tags=parsed_tags
         )
 
+    def add_tag(self, tag: TagInputType) -> None:
+        parsed_tag = self._parent._parse_tag_association_class(tag)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Adding field tags in ingestion mode is not yet supported. "
+                "Use set_tags instead."
+            )
+        else:
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.globalTags is None:
+                editable_field.globalTags = models.GlobalTagsClass(tags=[])
+
+            add_list_unique(
+                editable_field.globalTags.tags,
+                key=self._parent._tag_key,
+                item=parsed_tag,
+            )
+
+    def remove_tag(self, tag: TagInputType) -> None:
+        parsed_tag = self._parent._parse_tag_association_class(tag)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Adding field tags in ingestion mode is not yet supported. "
+                "Use set_tags instead."
+            )
+        else:
+            base_field = self._base_schema_field()
+            if base_field.globalTags is not None:
+                remove_list_unique(
+                    base_field.globalTags.tags,
+                    key=self._parent._tag_key,
+                    item=parsed_tag,
+                    missing_ok=True,
+                )
+
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.globalTags is not None:
+                remove_list_unique(
+                    editable_field.globalTags.tags,
+                    key=self._parent._tag_key,
+                    item=parsed_tag,
+                )
+
     @property
     def terms(self) -> Optional[List[models.GlossaryTermAssociationClass]]:
         # TODO: Basically the same implementation as tags - can we share code?
@@ -287,7 +338,7 @@ class SchemaField:
 
         return terms
 
-    def set_terms(self, terms: …
+    def set_terms(self, terms: TermsInputType) -> None:
         parsed_terms = [
             self._parent._parse_glossary_term_association_class(term) for term in terms
         ]
@@ -318,12 +369,62 @@ class SchemaField:
             )
         )
 
+    def add_term(self, term: TermInputType) -> None:
+        parsed_term = self._parent._parse_glossary_term_association_class(term)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Adding field terms in ingestion mode is not yet supported. "
+                "Use set_terms instead."
+            )
+        else:
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.glossaryTerms is None:
+                editable_field.glossaryTerms = models.GlossaryTermsClass(
+                    terms=[],
+                    auditStamp=self._parent._terms_audit_stamp(),
+                )
+
+            add_list_unique(
+                editable_field.glossaryTerms.terms,
+                key=self._parent._terms_key,
+                item=parsed_term,
+            )
+
+    def remove_term(self, term: TermInputType) -> None:
+        parsed_term = self._parent._parse_glossary_term_association_class(term)
+
+        if is_ingestion_attribution():
+            raise SdkUsageError(
+                "Removing field terms in ingestion mode is not yet supported. "
+                "Use set_terms instead."
+            )
+        else:
+            base_field = self._base_schema_field()
+            if base_field.glossaryTerms is not None:
+                remove_list_unique(
+                    base_field.glossaryTerms.terms,
+                    key=self._parent._terms_key,
+                    item=parsed_term,
+                    missing_ok=True,
+                )
+
+            editable_field = self._ensure_editable_schema_field()
+            if editable_field.glossaryTerms is not None:
+                remove_list_unique(
+                    editable_field.glossaryTerms.terms,
+                    key=self._parent._terms_key,
+                    item=parsed_term,
+                    missing_ok=True,
+                )
+
 
 class Dataset(
     HasPlatformInstance,
     HasSubtype,
     HasContainer,
     HasOwnership,
+    HasInstitutionalMemory,
     HasTags,
     HasTerms,
     HasDomain,
@@ -352,13 +453,15 @@ class Dataset(
         created: Optional[datetime] = None,
         last_modified: Optional[datetime] = None,
         # Standard aspects.
+        parent_container: ParentContainerInputType | Unset = unset,
         subtype: Optional[str] = None,
-        container: Optional[ContainerInputType] = None,
         owners: Optional[OwnersInputType] = None,
+        links: Optional[LinksInputType] = None,
         tags: Optional[TagsInputType] = None,
         terms: Optional[TermsInputType] = None,
         # TODO structured_properties
         domain: Optional[DomainInputType] = None,
+        extra_aspects: ExtraAspectsType = None,
         # Dataset-specific aspects.
         schema: Optional[SchemaFieldsInputType] = None,
         upstreams: Optional[models.UpstreamLineageClass] = None,
@@ -370,6 +473,7 @@ class Dataset(
             env=env,
         )
         super().__init__(urn)
+        self._set_extra_aspects(extra_aspects)
 
         self._set_platform_instance(urn.platform, platform_instance)
 
@@ -393,12 +497,14 @@ class Dataset(
         if last_modified is not None:
             self.set_last_modified(last_modified)
 
+        if parent_container is not unset:
+            self._set_container(parent_container)
         if subtype is not None:
             self.set_subtype(subtype)
-        if container is not None:
-            self._set_container(container)
         if owners is not None:
             self.set_owners(owners)
+        if links is not None:
+            self.set_links(links)
         if tags is not None:
             self.set_tags(tags)
         if terms is not None:
@@ -537,14 +643,6 @@ class Dataset(
                 nativeDataType=field_type,
                 description=description,
             )
-        elif isinstance(schema_field_input, str):
-            # TODO: Not sure this branch makes sense - we should probably just require types?
-            return models.SchemaFieldClass(
-                fieldPath=schema_field_input,
-                type=models.SchemaFieldDataTypeClass(models.NullTypeClass()),
-                nativeDataType="unknown",
-                description=None,
-            )
         else:
             assert_never(schema_field_input)
 
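The new SchemaField.add_tag/remove_tag/add_term/remove_term mutators pair with the reworked Dataset constructor (parent_container, links, and extra_aspects replacing container). A minimal usage sketch follows; the platform/name constructor arguments and the dataset["email"] field accessor are assumptions, since they sit outside the hunks shown above.

from datahub.metadata.urns import GlossaryTermUrn, TagUrn
from datahub.sdk.dataset import Dataset

dataset = Dataset(
    platform="snowflake",  # assumed constructor args; not shown in this diff
    name="analytics.public.users",
    schema=[
        # Plain-string fields were dropped from SchemaFieldInputType;
        # a field is now at least a (name, type) tuple.
        ("user_id", "string"),
        ("email", "string", "Primary contact email"),
    ],
)

field = dataset["email"]  # hypothetical field accessor
field.add_tag(TagUrn("PII"))  # de-duplicated via add_list_unique
field.add_term(GlossaryTermUrn("Classification.Sensitive"))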
datahub/sdk/{_entity.py → entity.py}
RENAMED
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import abc
-from typing import List, Optional, Type, Union
+from typing import TYPE_CHECKING, List, Optional, Type, Union
 
 from typing_extensions import Self
 
@@ -10,6 +12,12 @@ from datahub.errors import SdkUsageError
 from datahub.metadata.urns import Urn
 from datahub.utilities.urns._urn_base import _SpecificUrn
 
+if TYPE_CHECKING:
+    from datahub.ingestion.api.workunit import MetadataWorkUnit
+
+
+ExtraAspectsType = Union[None, List[AspectTypeVar]]
+
 
 class Entity:
     __slots__ = ("_urn", "_prev_aspects", "_aspects")
@@ -36,6 +44,8 @@ class Entity:
 
     def _init_from_graph(self, current_aspects: models.AspectBag) -> Self:
         self._prev_aspects = current_aspects
+
+        self._aspects = {}
         aspect: models._Aspect
         for aspect_name, aspect in (current_aspects or {}).items():  # type: ignore
             aspect_copy = type(aspect).from_obj(aspect.to_obj())
@@ -46,6 +56,10 @@ class Entity:
     @abc.abstractmethod
     def get_urn_type(cls) -> Type[_SpecificUrn]: ...
 
+    @classmethod
+    def entity_type_name(cls) -> str:
+        return cls.get_urn_type().ENTITY_TYPE
+
     @property
     def urn(self) -> _SpecificUrn:
         return self._urn
@@ -85,5 +99,14 @@ class Entity:
         )
         return mcps
 
+    def as_workunits(self) -> List[MetadataWorkUnit]:
+        return [mcp.as_workunit() for mcp in self._as_mcps()]
+
+    def _set_extra_aspects(self, extra_aspects: ExtraAspectsType) -> None:
+        # TODO: Add validation to ensure that an "extra aspect" does not conflict
+        # with / get overridden by a standard aspect.
+        for aspect in extra_aspects or []:
+            self._set_aspect(aspect)
+
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}('{self.urn}')"
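A short sketch of the two Entity additions, reusing the Dataset constructor shape assumed above: extra_aspects attaches arbitrary prebuilt aspects at construction time, and as_workunits() converts the entity's pending MCPs into ingestion workunits.

from datahub.metadata import schema_classes as models
from datahub.sdk.dataset import Dataset

dataset = Dataset(
    platform="hive",  # assumed constructor args
    name="db.table",
    # Applied via _set_extra_aspects(); the Status aspect here is illustrative.
    extra_aspects=[models.StatusClass(removed=False)],
)

# Each pending MCP is wrapped in a MetadataWorkUnit, so an SDK-built
# entity can be yielded directly from an ingestion source.
for wu in dataset.as_workunits():
    print(wu.id)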
datahub/sdk/entity_client.py
CHANGED
@@ -14,10 +14,10 @@ from datahub.metadata.urns import (
     Urn,
 )
 from datahub.sdk._all_entities import ENTITY_CLASSES
-from datahub.sdk._entity import Entity
 from datahub.sdk._shared import UrnOrStr
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
datahub/sdk/main_client.py
CHANGED
@@ -7,6 +7,7 @@ from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.sdk.entity_client import EntityClient
 from datahub.sdk.resolver_client import ResolverClient
+from datahub.sdk.search_client import SearchClient
 
 
 class DataHubClient:
@@ -39,12 +40,28 @@ class DataHubClient:
 
         self._graph = graph
 
+        # TODO: test connection
+
     @classmethod
     def from_env(cls) -> "DataHubClient":
+        """Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
+
+        This will first check DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN. If not present,
+        it will read credentials from ~/.datahubenv. That file can be created using
+        the `datahub init` command.
+
+        If you're looking to specify the server/token in code, use the
+        DataHubClient(server=..., token=...) constructor instead.
+
+        Returns:
+            A DataHubClient instance.
+        """
+
        # Inspired by the DockerClient.from_env() method.
         # TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
         # That file is part of the "environment", but is not a traditional "env variable".
         graph = get_default_graph()
+
         return cls(graph=graph)
 
     @property
@@ -54,3 +71,9 @@ class DataHubClient:
     @property
     def resolve(self) -> ResolverClient:
         return ResolverClient(self)
+
+    @property
+    def search(self) -> SearchClient:
+        return SearchClient(self)
+
+    # TODO: lineage client
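A quick sketch of the entry points described in the new docstring:

from datahub.sdk.main_client import DataHubClient

# Resolves credentials from DATAHUB_GMS_URL / DATAHUB_GMS_TOKEN, falling
# back to ~/.datahubenv (created by `datahub init`).
client = DataHubClient.from_env()

# Or, per the docstring, pin the server/token in code:
# client = DataHubClient(server="http://localhost:8080", token="<token>")

search_client = client.search  # new property added in this change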
datahub/sdk/resolver_client.py
CHANGED
@@ -9,6 +9,7 @@ from datahub.metadata.urns import (
     DomainUrn,
     GlossaryTermUrn,
 )
+from datahub.sdk.search_filters import Filter, FilterDsl as F
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
@@ -38,37 +39,28 @@ class ResolverClient:
         self, *, name: Optional[str] = None, email: Optional[str] = None
     ) -> CorpUserUrn:
         filter_explanation: str
-        …
+        filter: Filter
         if name is not None:
             if email is not None:
                 raise SdkUsageError("Cannot specify both name and email for auto_user")
-            # …
+            # We're filtering on both fullName and displayName. It's not clear
+            # what the right behavior is here.
             filter_explanation = f"with name {name}"
-            …
-            …
-            …
-                    "values": [name],
-                    "condition": "EQUAL",
-                }
+            filter = F.or_(
+                F.custom_filter("fullName", "EQUAL", [name]),
+                F.custom_filter("displayName", "EQUAL", [name]),
             )
         elif email is not None:
             filter_explanation = f"with email {email}"
-            …
-                {
-                    "field": "email",
-                    "values": [email],
-                    "condition": "EQUAL",
-                }
-            )
+            filter = F.custom_filter("email", "EQUAL", [email])
         else:
             raise SdkUsageError("Must specify either name or email for auto_user")
 
-        …
-        …
-        …
-            extraFilters=filters,
-        )
+        filter = F.and_(
+            F.entity_type(CorpUserUrn.ENTITY_TYPE),
+            filter,
         )
+        users = list(self._client.search.get_urns(filter=filter))
         if len(users) == 0:
             # TODO: In auto methods, should we just create the user/domain/etc if it doesn't exist?
             raise ItemNotFoundError(f"User {filter_explanation} not found")
@@ -82,15 +74,11 @@ class ResolverClient:
     def term(self, *, name: str) -> GlossaryTermUrn:
         # TODO: Add some limits on the graph fetch
         terms = list(
-            self.…
-            …
-            …
-            …
-            …
-                    "values": [name],
-                    "condition": "EQUAL",
-                }
-            ],
+            self._client.search.get_urns(
+                filter=F.and_(
+                    F.entity_type(GlossaryTermUrn.ENTITY_TYPE),
+                    F.custom_filter("name", "EQUAL", [name]),
+                ),
             )
         )
         if len(terms) == 0:
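Both resolver methods now delegate to client.search.get_urns() with FilterDsl-built filters; the calling convention is unchanged. A brief sketch (the lookup values are illustrative):

from datahub.sdk.main_client import DataHubClient

client = DataHubClient.from_env()

# Exactly one match is expected; ItemNotFoundError is raised on zero hits.
user_urn = client.resolve.user(email="jdoe@example.com")
term_urn = client.resolve.term(name="Sensitive")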
datahub/sdk/search_client.py
ADDED
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+)
+
+from datahub.ingestion.graph.filters import RawSearchFilterRule
+from datahub.metadata.urns import Urn
+from datahub.sdk.search_filters import Filter
+
+if TYPE_CHECKING:
+    from datahub.sdk.main_client import DataHubClient
+
+
+def compile_filters(
+    filter: Optional[Filter],
+) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+    # TODO: Not every filter type is supported for every entity type.
+    # If we can detect issues with the filters at compile time, we should
+    # raise an error.
+
+    if filter is None:
+        return None
+
+    initial_filters = filter.compile()
+    return [
+        {"and": [rule.to_raw() for rule in andClause["and"]]}
+        for andClause in initial_filters
+    ]
+
+
+class SearchClient:
+    def __init__(self, client: DataHubClient):
+        self._client = client
+
+    def get_urns(
+        self,
+        query: Optional[str] = None,
+        filter: Optional[Filter] = None,
+    ) -> Iterable[Urn]:
+        # TODO: Add better limit / pagination support.
+        for urn in self._client._graph.get_urns_by_filter(
+            query=query,
+            extra_or_filters=compile_filters(filter),
+        ):
+            yield Urn.from_string(urn)
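A hedged usage sketch for the new SearchClient. FilterDsl is aliased to F as in resolver_client.py; the platform field name and value are illustrative, since only entity_type and custom_filter appear in this diff.

from datahub.metadata.urns import DatasetUrn
from datahub.sdk.main_client import DataHubClient
from datahub.sdk.search_filters import FilterDsl as F

client = DataHubClient.from_env()

# and_/or_ combinators compile down to the raw filter rules that
# get_urns_by_filter() expects (see compile_filters above).
dataset_filter = F.and_(
    F.entity_type(DatasetUrn.ENTITY_TYPE),
    F.custom_filter("platform", "EQUAL", ["urn:li:dataPlatform:snowflake"]),
)

for urn in client.search.get_urns(query="users", filter=dataset_filter):
    print(urn)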