acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
|
@@ -36,8 +36,10 @@ csv-enricher = datahub.ingestion.source.csv_enricher:CSVEnricherSource
|
|
|
36
36
|
datahub = datahub.ingestion.source.datahub.datahub_source:DataHubSource
|
|
37
37
|
datahub-apply = datahub.ingestion.source.apply.datahub_apply:DataHubApplySource
|
|
38
38
|
datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:BusinessGlossaryFileSource
|
|
39
|
+
datahub-debug = datahub.ingestion.source.debug.datahub_debug:DataHubDebugSource
|
|
39
40
|
datahub-gc = datahub.ingestion.source.gc.datahub_gc:DataHubGcSource
|
|
40
41
|
datahub-lineage-file = datahub.ingestion.source.metadata.lineage:LineageFileSource
|
|
42
|
+
datahub-mock-data = datahub.ingestion.source.mock_data.datahub_mock_data:DataHubMockDataSource
|
|
41
43
|
dbt = datahub.ingestion.source.dbt.dbt_core:DBTCoreSource
|
|
42
44
|
dbt-cloud = datahub.ingestion.source.dbt.dbt_cloud:DBTCloudSource
|
|
43
45
|
delta-lake = datahub.ingestion.source.delta_lake:DeltaLakeSource
|
datahub/_version.py
CHANGED
|
@@ -383,7 +383,7 @@ class Dataset(StrictModel):
|
|
|
383
383
|
urn: Optional[str] = None
|
|
384
384
|
description: Optional[str] = None
|
|
385
385
|
name: Optional[str] = None
|
|
386
|
-
schema_metadata: Optional[SchemaSpecification] = Field(alias="schema")
|
|
386
|
+
schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
|
|
387
387
|
downstreams: Optional[List[str]] = None
|
|
388
388
|
properties: Optional[Dict[str, str]] = None
|
|
389
389
|
subtype: Optional[str] = None
|
|
File without changes
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import abstractmethod
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Iterable, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
import cachetools
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from datahub.api.entities.platformresource.platform_resource import (
|
|
11
|
+
PlatformResource,
|
|
12
|
+
PlatformResourceKey,
|
|
13
|
+
)
|
|
14
|
+
from datahub.ingestion.graph.client import DataHubGraph
|
|
15
|
+
from datahub.metadata.urns import PlatformResourceUrn, Urn
|
|
16
|
+
from datahub.utilities.search_utils import ElasticDocumentQuery
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PlatformResourceRepository:
    """Access layer for PlatformResource entities stored in a DataHub graph.

    Resources that pass through the repository are kept in an in-memory TTL
    cache (at most 1000 entries, 5 minutes each), keyed by resource id.
    """

    def __init__(self, graph: DataHubGraph):
        self.graph = graph
        self.cache: cachetools.TTLCache = cachetools.TTLCache(maxsize=1000, ttl=60 * 5)

    def search_by_filter(
        self, filter: ElasticDocumentQuery, add_to_cache: bool = True
    ) -> Iterable[PlatformResource]:
        """Yield the platform resources matching ``filter``.

        This is a generator: nothing is cached until the caller iterates.
        Each yielded resource is stored in the cache under its id unless
        ``add_to_cache`` is False.
        """
        results = PlatformResource.search_by_filters(self.graph, filter)
        for platform_resource in results:
            if add_to_cache:
                self.cache[platform_resource.id] = platform_resource
            yield platform_resource

    def create(self, platform_resource: PlatformResource) -> None:
        """Write the resource to DataHub, then cache it under its id."""
        platform_resource.to_datahub(self.graph)
        self.cache[platform_resource.id] = platform_resource

    def get(self, key: PlatformResourceKey) -> Optional[PlatformResource]:
        """Return the cached resource for ``key``, or None if it is not cached.

        Only the local cache is consulted; the graph is not queried here.
        """
        return self.cache.get(key.id)

    def delete(self, key: PlatformResourceKey) -> None:
        """Hard-delete the resource from DataHub and drop it from the cache."""
        self.graph.delete_entity(urn=PlatformResourceUrn(key.id).urn(), hard=True)
        # The entry may never have been cached, or its TTL may have expired.
        # A plain `del` would raise KeyError here even though the delete in
        # DataHub already succeeded, so evict tolerantly instead.
        self.cache.pop(key.id, None)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ExternalEntityId:
    """Unique identifier of an ExternalEntity.

    NOTE(review): this class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` marker below is not enforced at instantiation time.
    """

    @abstractmethod
    def to_platform_resource_key(self) -> PlatformResourceKey:
        """Return the PlatformResourceKey that corresponds to this id."""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class CaseSensitivity(Enum):
    """Letter-case classification of a string or of a group of strings."""

    UPPER = "upper"
    LOWER = "lower"
    MIXED = "mixed"

    @staticmethod
    def detect_case_sensitivity(value: str) -> "CaseSensitivity":
        """Classify a single string as UPPER, LOWER or MIXED.

        Relies on ``str.isupper`` / ``str.islower``, which are both False for
        strings without any cased character (e.g. ``""`` or ``"123"``); such
        strings are therefore reported as MIXED.
        """
        if value.isupper():
            return CaseSensitivity.UPPER
        if value.islower():
            return CaseSensitivity.LOWER
        return CaseSensitivity.MIXED

    @staticmethod
    def detect_for_many(values: List[str]) -> "CaseSensitivity":
        """Classify a collection of strings.

        Returns UPPER (or LOWER) only when every value is individually UPPER
        (or LOWER). Returns MIXED for an empty collection and whenever the
        values do not all share one of those two classes.
        """
        # Classify each value exactly once; the set collapses to a single
        # member precisely when all values agree.
        detected = {CaseSensitivity.detect_case_sensitivity(value) for value in values}
        if detected == {CaseSensitivity.UPPER}:
            return CaseSensitivity.UPPER
        if detected == {CaseSensitivity.LOWER}:
            return CaseSensitivity.LOWER
        # Empty input, disagreement, or all-MIXED values.
        return CaseSensitivity.MIXED
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class LinkedResourceSet(BaseModel):
    """A set of DataHub URNs that are linked to one ExternalEntity."""

    # URN strings; `add` keeps them free of duplicates.
    urns: List[str]

    def _has_conflict(self, urn: Urn) -> bool:
        """Report whether ``urn`` is unsafe to add to the set.

        A conflict is reported (True) when:
        - an existing entry cannot be parsed as a URN,
        - the existing entries do not all share one entity type, or
        - ``urn`` has a different entity type than the existing entries
          (e.g. mixing tags and terms in the same set).

        A URN that is already present is not a conflict (False).
        """
        if urn.urn() in self.urns:
            return False

        # Determine the single entity type shared by the existing entries.
        detected_entity_type = None
        for existing_urn in self.urns:
            try:
                entity_type = Urn.from_string(existing_urn).entity_type
            except ValueError:
                # Not a valid URN
                logger.warning(f"Invalid URN {existing_urn} in LinkedResourceSet")
                return True
            if detected_entity_type is None:
                detected_entity_type = entity_type
            elif detected_entity_type != entity_type:
                logger.warning(
                    f"Detected entity_type {detected_entity_type} is not equals to {entity_type}"
                )
                return True

        # `urn` is already a parsed Urn, so no parse error can occur here.
        if detected_entity_type is not None and urn.entity_type != detected_entity_type:
            logger.warning(
                f"Detected entity_type {detected_entity_type} is not equals to parsed_urn's entity_type: {urn.entity_type}"
            )
            return True
        return False

    def add(self, urn: Union[str, Urn]) -> bool:
        """Add a URN to the set.

        Returns True if the URN was added, False if it was already present.
        Raises ValueError if the URN conflicts with the existing entries
        (see ``_has_conflict``) or if a string ``urn`` cannot be parsed.
        """
        # Deduplicate in case concurrent runs left duplicate items behind.
        # dict.fromkeys keeps first-seen order, so the stored list stays
        # stable between runs (a plain set() would reorder it arbitrarily).
        self.urns = list(dict.fromkeys(self.urns))
        if isinstance(urn, str):
            urn = Urn.from_string(urn)
        if self._has_conflict(urn):
            raise ValueError(f"Conflict detected when adding URN {urn} to the set")
        if urn.urn() not in self.urns:
            self.urns.append(urn.urn())
            return True
        return False
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class ExternalEntity:
    """An entity that is external to DataHub but may be linked to one or
    more DataHub entities.

    NOTE(review): this class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` markers below are not enforced at instantiation time.
    """

    @abstractmethod
    def is_managed_by_datahub(self) -> bool:
        """Return whether the entity is managed by DataHub."""

    @abstractmethod
    def datahub_linked_resources(self) -> LinkedResourceSet:
        """Return the URNs of the DataHub entities linked to this entity.

        The set is empty when nothing is linked.
        """

    @abstractmethod
    def as_platform_resource(self) -> PlatformResource:
        """Return this entity converted to a PlatformResource."""

    @abstractmethod
    def get_id(self) -> ExternalEntityId:
        """Return the ExternalEntityId identifying this entity."""
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
@dataclass
class MissingExternalEntity(ExternalEntity):
    """ExternalEntity implementation that carries only an id.

    It reports itself as not managed by DataHub, has no linked resources and
    has no PlatformResource representation.
    """

    # Identifier this placeholder stands for.
    id: ExternalEntityId

    def is_managed_by_datahub(self) -> bool:
        """Always False."""
        return False

    def datahub_linked_resources(self) -> LinkedResourceSet:
        """Return a new, empty LinkedResourceSet."""
        return LinkedResourceSet(urns=[])

    def as_platform_resource(self) -> Optional[PlatformResource]:  # type: ignore[override]
        """Always None; the return type is deliberately widened from the base class."""
        return None

    def get_id(self) -> ExternalEntityId:
        """Return the id this instance was created with."""
        return self.id
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class ExternalSystem:
    """Interface for looking up entities in an external system.

    NOTE(review): this class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` markers below are not enforced at instantiation time.
    """

    @abstractmethod
    def exists(self, external_entity_id: ExternalEntityId) -> bool:
        """Return whether the given id exists in the external system."""

    @abstractmethod
    def get(
        self,
        external_entity_id: ExternalEntityId,
        platform_resource_repository: PlatformResourceRepository,
    ) -> Optional[ExternalEntity]:
        """Return the ExternalEntity for the given id.

        The platform resource repository is used to enrich the returned
        entity with DataHub URNs.
        """
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""
|
|
2
|
+
External Tags Module
|
|
3
|
+
|
|
4
|
+
This module provides tag types that integrate with external systems like DataHub and Unity Catalog.
|
|
5
|
+
It builds on top of RestrictedText to provide sanitized, truncated tag handling with original value preservation.
|
|
6
|
+
|
|
7
|
+
Classes:
|
|
8
|
+
- ExternalTag: DataHub-compatible tag with key/value parsing from URNs
|
|
9
|
+
|
|
10
|
+
Example Usage:
|
|
11
|
+
# DataHub Tags
|
|
12
|
+
tag = ExternalTag.from_urn("urn:li:tag:environment:production")
|
|
13
|
+
datahub_urn = tag.to_datahub_tag_urn()  # Returns a TagUrn object
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from typing import Any, Optional, Tuple, Union
|
|
20
|
+
|
|
21
|
+
from pydantic import BaseModel
|
|
22
|
+
|
|
23
|
+
from datahub.api.entities.external.restricted_text import RestrictedText
|
|
24
|
+
from datahub.metadata.urns import TagUrn
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExternalTag(BaseModel):
    """Key/value tag whose parts are RestrictedText; can be built from a DataHub Tag URN."""

    key: RestrictedText
    value: Optional[RestrictedText] = None

    def __init__(
        self,
        key: Optional[Union[str, RestrictedText]] = None,
        value: Optional[Union[str, RestrictedText]] = None,
        **data: Any,
    ) -> None:
        """
        Initialize ExternalTag from an explicit key/value or from Pydantic field data.

        Args:
            key: Tag key; plain strings are wrapped in RestrictedText.
                When omitted, initialization is delegated to Pydantic with ``data`` only.
            value: Optional tag value; plain strings are wrapped in RestrictedText.
            **data: Additional Pydantic data
        """
        if key is None:
            # Standard pydantic initialization; `value` is not forwarded here.
            super().__init__(**data)
            return

        wrapped_key = key if isinstance(key, RestrictedText) else RestrictedText(key)
        wrapped_value = None
        if value is not None:
            wrapped_value = (
                value if isinstance(value, RestrictedText) else RestrictedText(value)
            )
        super().__init__(key=wrapped_key, value=wrapped_value, **data)

    @staticmethod
    def _parse_tag_name(tag_name: str) -> Tuple[str, Optional[str]]:
        """
        Split a tag name into a key and an optional value.

        The name is split on its first ':' only; without a ':' the whole
        name is the key and the value is None.

        Args:
            tag_name: The tag name portion from the URN

        Returns:
            Tuple of (key, value) where value may be None
        """
        head, separator, tail = tag_name.partition(":")
        if separator:
            return head, tail
        return tag_name, None

    def to_datahub_tag_urn(self) -> TagUrn:
        """
        Build a DataHub TagUrn from the original (unprocessed) key and value.

        Returns:
            TagUrn named 'key:value' if a value exists, otherwise 'key'
        """
        tag_name = self.key.original
        if self.value is not None:
            tag_name = f"{tag_name}:{self.value.original}"
        return TagUrn(name=tag_name)

    @classmethod
    def from_urn(cls, tag_urn: Union[str, "TagUrn"]) -> "ExternalTag":
        """
        Create an ExternalTag from a DataHub Tag URN.

        Args:
            tag_urn: DataHub Tag URN string or TagUrn object

        Returns:
            ExternalTag instance
        """
        parsed = TagUrn.from_string(tag_urn) if isinstance(tag_urn, str) else tag_urn
        key, value = cls._parse_tag_name(parsed.name)
        return cls(key=key, value=value)

    @classmethod
    def from_key_value(cls, key: str, value: Optional[str] = None) -> "ExternalTag":
        """
        Create an ExternalTag from explicit key and value.

        Args:
            key: Tag key
            value: Optional tag value

        Returns:
            ExternalTag instance
        """
        return cls(key=key, value=value)

    def __str__(self) -> str:
        """Return 'key:value', or just 'key' when there is no value."""
        if self.value is None:
            return str(self.key)
        return f"{self.key}:{self.value}"

    def __repr__(self) -> str:
        if self.value is None:
            return f"ExternalTag(key={self.key!r})"
        return f"ExternalTag(key={self.key!r}, value={self.value!r})"
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Import RestrictedText from your existing module
|
|
2
|
+
# Uncomment and adjust the import path as needed:
|
|
3
|
+
# from your_restricted_text_module import RestrictedText
|
|
4
|
+
# The following is a list of Databricks Unity Catalog tag constraints (see the link below), kept here for reference:
|
|
5
|
+
# You can assign a maximum of 50 tags to a single securable object.
|
|
6
|
+
# The maximum length of a tag key is 255 characters.
|
|
7
|
+
# The maximum length of a tag value is 1000 characters.
|
|
8
|
+
# The following characters are not allowed in tag keys:
|
|
9
|
+
# . , - = / :
|
|
10
|
+
# Tag search using the workspace search UI is supported only for tables, views, and table columns.
|
|
11
|
+
# Tag search requires exact term matching.
|
|
12
|
+
# https://learn.microsoft.com/en-us/azure/databricks/database-objects/tags#constraint
|
|
13
|
+
from typing import Any, Dict, Optional, Union
|
|
14
|
+
|
|
15
|
+
from typing_extensions import ClassVar
|
|
16
|
+
|
|
17
|
+
from datahub.api.entities.external.external_tag import ExternalTag
|
|
18
|
+
from datahub.api.entities.external.restricted_text import RestrictedText
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class LakeFormationTagKeyText(RestrictedText):
    """RestrictedText configured for Lake Formation tag keys."""

    # NOTE(review): the LakeFormationTag docstring in this file states a key
    # limit of 127 characters, while this default is 50 — confirm which is intended.
    _default_max_length: ClassVar[int] = 50
    # Per the LakeFormationTag docstring, keys allow alphanumerics, hyphens,
    # underscores and periods; presumably RestrictedText substitutes this
    # character for anything else — confirm against RestrictedText.
    _default_replacement_char: ClassVar[str] = "_"
    _default_truncation_suffix: ClassVar[str] = ""  # No suffix for clean identifiers
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class LakeFormationTagValueText(RestrictedText):
    """RestrictedText configured for Lake Formation tag values."""

    # NOTE(review): the LakeFormationTag docstring in this file states a value
    # limit of 256 characters, while this default is 50 — confirm which is intended.
    _default_max_length: ClassVar[int] = 50
    # Tag values are more permissive than keys but still have some restrictions
    _default_replacement_char: ClassVar[str] = " "
    _default_truncation_suffix: ClassVar[str] = "..."
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class LakeFormationTag(ExternalTag):
    """
    A tag type specifically designed for LakeFormation tag restrictions.

    LakeFormation Tag Restrictions:
    - Key: Max 127 characters, alphanumeric + hyphens, underscores, periods only
    - Value: Max 256 characters, more permissive but no control characters

    Equality and hashing consider only the key and value, not ``catalog``.
    """

    key: LakeFormationTagKeyText
    value: Optional[LakeFormationTagValueText] = None
    catalog: Optional[str] = None

    def __init__(
        self,
        key: Optional[Union[str, LakeFormationTagKeyText]] = None,
        value: Optional[Union[str, LakeFormationTagValueText]] = None,
        **data: Any,
    ) -> None:
        """
        Initialize a LakeFormationTag from an explicit key/value or from Pydantic field data.

        Args:
            key: Tag key; anything that is not already a LakeFormationTagKeyText is wrapped in one.
                When omitted, initialization is delegated to the parent with ``data`` only.
            value: Optional tag value, wrapped in LakeFormationTagValueText the same way.
            **data: Additional Pydantic data
        """
        if key is None:
            # Standard pydantic initialization; `value` is not forwarded here.
            super().__init__(**data)
            return

        wrapped_key = (
            key if isinstance(key, LakeFormationTagKeyText) else LakeFormationTagKeyText(key)
        )
        wrapped_value = None
        if value is not None:
            wrapped_value = (
                value
                if isinstance(value, LakeFormationTagValueText)
                else LakeFormationTagValueText(value)
            )
        super().__init__(key=wrapped_key, value=wrapped_value, **data)

    def __eq__(self, other: object) -> bool:
        """Compare by the string forms of key and value; a falsy value counts as None."""
        if not isinstance(other, LakeFormationTag):
            return False
        if str(self.key) != str(other.key):
            return False
        own_value = str(self.value) if self.value else None
        other_value = str(other.value) if other.value else None
        return own_value == other_value

    def __hash__(self) -> int:
        """Hash on the same (key, value) pair that ``__eq__`` compares."""
        hashed_value = str(self.value) if self.value else None
        return hash((str(self.key), hashed_value))

    @classmethod
    def from_dict(cls, tag_dict: Dict[str, Any]) -> "LakeFormationTag":
        """
        Create a LakeFormationTag from a dictionary with 'key' and optional 'value'.

        Args:
            tag_dict: Dictionary with 'key' and optional 'value' keys

        Returns:
            LakeFormationTag instance
        """
        return cls(key=tag_dict["key"], value=tag_dict.get("value"))

    @classmethod
    def from_key_value(
        cls, key: str, value: Optional[str] = None
    ) -> "LakeFormationTag":
        """
        Create a LakeFormationTag from explicit key and value.

        Overrides the parent method to return the correct type.

        Args:
            key: Tag key
            value: Optional tag value

        Returns:
            LakeFormationTag instance
        """
        return cls(key=key, value=value)

    def to_dict(self) -> Dict[str, str]:
        """
        Convert to a dictionary built from the original (unprocessed) key and value.

        Returns:
            Dictionary with 'key' and optionally 'value'
        """
        as_dict: Dict[str, str] = {"key": self.key.original}
        if self.value is not None:
            as_dict["value"] = self.value.original
        return as_dict

    def to_display_dict(self) -> Dict[str, str]:
        """
        Convert to a dictionary built from the processed (string) key and value.

        Returns:
            Dictionary with processed 'key' and optional 'value'
        """
        as_dict: Dict[str, str] = {"key": str(self.key)}
        if self.value is not None:
            as_dict["value"] = str(self.value)
        return as_dict

    def __repr__(self) -> str:
        if not self.value:
            return f"LakeFormationTag(key={self.key!r})"
        return f"LakeFormationTag(key={self.key!r}, value={self.value!r})"
|