acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (226)
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.8.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
{acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt CHANGED
@@ -36,8 +36,10 @@ csv-enricher = datahub.ingestion.source.csv_enricher:CSVEnricherSource
  datahub = datahub.ingestion.source.datahub.datahub_source:DataHubSource
  datahub-apply = datahub.ingestion.source.apply.datahub_apply:DataHubApplySource
  datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:BusinessGlossaryFileSource
+ datahub-debug = datahub.ingestion.source.debug.datahub_debug:DataHubDebugSource
  datahub-gc = datahub.ingestion.source.gc.datahub_gc:DataHubGcSource
  datahub-lineage-file = datahub.ingestion.source.metadata.lineage:LineageFileSource
+ datahub-mock-data = datahub.ingestion.source.mock_data.datahub_mock_data:DataHubMockDataSource
  dbt = datahub.ingestion.source.dbt.dbt_core:DBTCoreSource
  dbt-cloud = datahub.ingestion.source.dbt.dbt_cloud:DBTCloudSource
  delta-lake = datahub.ingestion.source.delta_lake:DeltaLakeSource
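
The two new entry points above register the datahub-debug and datahub-mock-data ingestion sources. As a rough sketch of what that registration provides, the snippet below resolves the new source classes from the installed wheel's metadata the way a plugin loader would (Python 3.10+ selection API); the group name "datahub.ingestion.source.plugins" is an assumption about how DataHub names its source plugin group, not something shown in this diff.

# Sketch: resolve the newly registered sources via package entry points.
from importlib.metadata import entry_points

sources = entry_points(group="datahub.ingestion.source.plugins")  # assumed group name
for name in ("datahub-debug", "datahub-mock-data"):
    ep = sources[name]
    print(name, "->", ep.value)  # e.g. datahub.ingestion.source.debug.datahub_debug:DataHubDebugSource
    source_cls = ep.load()       # imports the module and returns the source class
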
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.1.1rc3"
+ __version__ = "1.2.0"
 
 
  def is_dev_mode() -> bool:
datahub/api/entities/dataset/dataset.py CHANGED
@@ -383,7 +383,7 @@ class Dataset(StrictModel):
      urn: Optional[str] = None
      description: Optional[str] = None
      name: Optional[str] = None
-     schema_metadata: Optional[SchemaSpecification] = Field(alias="schema")
+     schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
      downstreams: Optional[List[str]] = None
      properties: Optional[Dict[str, str]] = None
      subtype: Optional[str] = None
@@ -786,6 +786,7 @@ class Dataset(StrictModel):
          if schema_metadata:
              # If the schema is built off of an avro schema, we only extract the fields if they have structured properties
              # Otherwise, we extract all fields
+             schema_fields = []
              if (
                  schema_metadata.platformSchema
                  and isinstance(schema_metadata.platformSchema, models.OtherSchemaClass)
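
The Field(default=None, alias="schema") change matters under Pydantic v2, where an Optional field declared through Field() without a default is still treated as required. A minimal sketch in plain pydantic (not DataHub code), assuming Pydantic v2 behavior:

from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class WithoutDefault(BaseModel):
    schema_metadata: Optional[str] = Field(alias="schema")


class WithDefault(BaseModel):
    schema_metadata: Optional[str] = Field(default=None, alias="schema")


try:
    WithoutDefault()  # Pydantic v2 still treats "schema" as required here
except ValidationError as err:
    print("missing field:", err.error_count(), "error")

print(WithDefault())  # schema_metadata=None -- the field can now be omitted
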
datahub/api/entities/external/__init__.py ADDED
File without changes
datahub/api/entities/external/external_entities.py ADDED
@@ -0,0 +1,239 @@
+ import logging
+ from abc import abstractmethod
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Iterable, List, Optional, Union
+
+ import cachetools
+ from pydantic import BaseModel
+
+ from datahub.api.entities.platformresource.platform_resource import (
+     PlatformResource,
+     PlatformResourceKey,
+ )
+ from datahub.ingestion.graph.client import DataHubGraph
+ from datahub.metadata.urns import PlatformResourceUrn, Urn
+ from datahub.utilities.search_utils import ElasticDocumentQuery
+
+ logger = logging.getLogger(__name__)
+
+
+ class PlatformResourceRepository:
+     def __init__(self, graph: DataHubGraph):
+         self.graph = graph
+         self.cache: cachetools.TTLCache = cachetools.TTLCache(maxsize=1000, ttl=60 * 5)
+
+     def search_by_filter(
+         self, filter: ElasticDocumentQuery, add_to_cache: bool = True
+     ) -> Iterable[PlatformResource]:
+         results = PlatformResource.search_by_filters(self.graph, filter)
+         for platform_resource in results:
+             if add_to_cache:
+                 self.cache[platform_resource.id] = platform_resource
+             yield platform_resource
+
+     def create(self, platform_resource: PlatformResource) -> None:
+         platform_resource.to_datahub(self.graph)
+         self.cache[platform_resource.id] = platform_resource
+
+     def get(self, key: PlatformResourceKey) -> Optional[PlatformResource]:
+         return self.cache.get(key.id)
+
+     def delete(self, key: PlatformResourceKey) -> None:
+         self.graph.delete_entity(urn=PlatformResourceUrn(key.id).urn(), hard=True)
+         del self.cache[key.id]
+
+
+ class ExternalEntityId:
+     """
+     ExternalEntityId is a unique
+     identifier for an ExternalEntity.
+     """
+
+     @abstractmethod
+     def to_platform_resource_key(self) -> PlatformResourceKey:
+         """
+         Converts the ExternalEntityId to a PlatformResourceKey.
+         """
+         pass
+
+
+ class CaseSensitivity(Enum):
+     UPPER = "upper"
+     LOWER = "lower"
+     MIXED = "mixed"
+
+     @staticmethod
+     def detect_case_sensitivity(value: str) -> "CaseSensitivity":
+         if value.isupper():
+             return CaseSensitivity.UPPER
+         elif value.islower():
+             return CaseSensitivity.LOWER
+         return CaseSensitivity.MIXED
+
+     @staticmethod
+     def detect_for_many(values: List[str]) -> "CaseSensitivity":
+         """
+         Detects the case sensitivity for a list of strings.
+         Returns CaseSensitivity.MIXED if the case sensitivity is mixed.
+         """
+         if len(values) == 0:
+             return CaseSensitivity.MIXED
+
+         if all(
+             CaseSensitivity.detect_case_sensitivity(value) == CaseSensitivity.UPPER
+             for value in values
+         ):
+             return CaseSensitivity.UPPER
+         elif all(
+             CaseSensitivity.detect_case_sensitivity(value) == CaseSensitivity.LOWER
+             for value in values
+         ):
+             return CaseSensitivity.LOWER
+         return CaseSensitivity.MIXED
+
+
+ class LinkedResourceSet(BaseModel):
+     """
+     A LinkedResourceSet is a set of DataHub URNs that are linked to an ExternalEntity.
+     """
+
+     urns: List[str]
+
+     def _has_conflict(self, urn: Urn) -> bool:
+         """
+         Detects if the urn is safe to add into the set
+         This is used to detect conflicts between DataHub URNs that are linked to
+         the same ExternalEntity.
+         e.g. Case sensitivity of URNs
+         Mixing tags and terms in the same set etc.
+         Return True if the urn is not safe to add into the set, else False.
+         If the urn is already in the set, we don't need to add it again, but
+         that is not a conflict.
+         """
+         if urn.urn() in self.urns:
+             return False
+
+         # Detect the entity_type of the urns in the existing set
+         detected_entity_type = None
+         for existing_urn in self.urns:
+             try:
+                 parsed_urn = Urn.from_string(existing_urn)
+                 entity_type = parsed_urn.entity_type
+                 if detected_entity_type is None:
+                     detected_entity_type = entity_type
+                 elif detected_entity_type != entity_type:
+                     logger.warning(
+                         f"Detected entity_type {detected_entity_type} is not equals to {entity_type}"
+                     )
+                     return True
+             except ValueError:
+                 # Not a valid URN
+                 logger.warning(f"Invalid URN {existing_urn} in LinkedResourceSet")
+                 return True
+         try:
+             parsed_urn = urn
+             if (
+                 detected_entity_type is not None
+                 and parsed_urn.entity_type != detected_entity_type
+             ):
+                 logger.warning(
+                     f"Detected entity_type {detected_entity_type} is not equals to parsed_urn's entity_type: {parsed_urn.entity_type}"
+                 )
+                 return True
+         except ValueError:
+             # Not a valid URN
+             logger.warning(f"Invalid URN: {urn} in LinkedResourceSet")
+             return True
+         return False
+
+     def add(self, urn: Union[str, Urn]) -> bool:
+         """
+         Adds a URN to the set.
+         Returns True if the URN was added, False if it was already in the set.
+         Raises a ValueError if the URN is in conflict with the existing set.
+         """
+         # Deduplicate the URNs if we have somehow duplicate items from concurrent runs
+         self.urns = list(set(self.urns))
+         if isinstance(urn, str):
+             urn = Urn.from_string(urn)
+         if self._has_conflict(urn):
+             raise ValueError(f"Conflict detected when adding URN {urn} to the set")
+         if urn.urn() not in self.urns:
+             self.urns.append(urn.urn())
+             return True
+         return False
+
+
+ class ExternalEntity:
+     """
+     An ExternalEntity is a representation of an entity that external to DataHub
+     but could be linked to one or more DataHub entities.
+     """
+
+     @abstractmethod
+     def is_managed_by_datahub(self) -> bool:
+         """
+         Returns whether the entity is managed by DataHub.
+         """
+         pass
+
+     @abstractmethod
+     def datahub_linked_resources(self) -> LinkedResourceSet:
+         """
+         Returns the URNs of the DataHub entities linked to the external entity.
+         Empty list if no linked entities.
+         """
+         pass
+
+     @abstractmethod
+     def as_platform_resource(self) -> PlatformResource:
+         """
+         Converts the ExternalEntity to a PlatformResource.
+         """
+         pass
+
+     @abstractmethod
+     def get_id(self) -> ExternalEntityId:
+         """
+         Returns the ExternalEntityId for the ExternalEntity.
+         """
+         pass
+
+
+ @dataclass
+ class MissingExternalEntity(ExternalEntity):
+     id: ExternalEntityId
+
+     def is_managed_by_datahub(self) -> bool:
+         return False
+
+     def datahub_linked_resources(self) -> LinkedResourceSet:
+         return LinkedResourceSet(urns=[])
+
+     def as_platform_resource(self) -> Optional[PlatformResource]:  # type: ignore[override]
+         return None
+
+     def get_id(self) -> ExternalEntityId:
+         return self.id
+
+
+ class ExternalSystem:
+     @abstractmethod
+     def exists(self, external_entity_id: ExternalEntityId) -> bool:
+         """
+         Returns whether the ExternalEntityId exists in the external system.
+         """
+         pass
+
+     @abstractmethod
+     def get(
+         self,
+         external_entity_id: ExternalEntityId,
+         platform_resource_repository: PlatformResourceRepository,
+     ) -> Optional[ExternalEntity]:
+         """
+         Returns the ExternalEntity for the ExternalEntityId.
+         Uses the platform resource repository to enrich the ExternalEntity with DataHub URNs.
+         """
+         pass
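
The LinkedResourceSet above deduplicates URNs and refuses to mix entity types (for example, tags and glossary terms) in one set. A small usage sketch, assuming the module path shown in this diff is importable from an installed 1.2.0 wheel:

from datahub.api.entities.external.external_entities import LinkedResourceSet

linked = LinkedResourceSet(urns=["urn:li:tag:pii"])

print(linked.add("urn:li:tag:pii"))          # False: already present, not a conflict
print(linked.add("urn:li:tag:environment"))  # True: same entity type, appended

try:
    linked.add("urn:li:glossaryTerm:Sensitive")  # mixes tag and glossaryTerm URNs
except ValueError as err:
    print(err)  # "Conflict detected when adding URN ... to the set"
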
datahub/api/entities/external/external_tag.py ADDED
@@ -0,0 +1,145 @@
+ """
+ External Tags Module
+
+ This module provides tag types that integrate with external systems like DataHub and Unity Catalog.
+ It builds on top of RestrictedText to provide sanitized, truncated tag handling with original value preservation.
+
+ Classes:
+     - ExternalTag: DataHub-compatible tag with key/value parsing from URNs
+
+ Example Usage:
+     # DataHub Tags
+     tag = ExternalTag.from_urn("urn:li:tag:environment:production")
+     datahub_urn = tag.get_datahub_tag  # Returns TagUrn object or string
+
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Optional, Tuple, Union
+
+ from pydantic import BaseModel
+
+ from datahub.api.entities.external.restricted_text import RestrictedText
+ from datahub.metadata.urns import TagUrn
+
+
+ class ExternalTag(BaseModel):
+     """A tag type that parses DataHub Tag URNs into key-value pairs with RestrictedText properties."""
+
+     key: RestrictedText
+     value: Optional[RestrictedText] = None
+
+     def __init__(
+         self,
+         key: Optional[Union[str, RestrictedText]] = None,
+         value: Optional[Union[str, RestrictedText]] = None,
+         **data: Any,
+     ) -> None:
+         """
+         Initialize ExternalTag from either a DataHub Tag URN or explicit key/value.
+
+         Args:
+             key: Explicit key value (optional for Pydantic initialization)
+             value: Explicit value (optional)
+             **data: Additional Pydantic data
+         """
+         if key is not None:
+             # Direct initialization with key/value
+             processed_key = (
+                 RestrictedText(key) if not isinstance(key, RestrictedText) else key
+             )
+             processed_value = None
+             if value is not None:
+                 processed_value = (
+                     RestrictedText(value)
+                     if not isinstance(value, RestrictedText)
+                     else value
+                 )
+
+             super().__init__(
+                 key=processed_key,
+                 value=processed_value,
+                 **data,
+             )
+         else:
+             # Standard pydantic initialization
+             super().__init__(**data)
+
+     @staticmethod
+     def _parse_tag_name(tag_name: str) -> Tuple[str, Optional[str]]:
+         """
+         Parse tag name into key and optional value.
+
+         If tag_name contains ':', split on first ':' into key:value
+         Otherwise, use entire tag_name as key with no value.
+
+         Args:
+             tag_name: The tag name portion from the URN
+
+         Returns:
+             Tuple of (key, value) where value may be None
+         """
+         if ":" in tag_name:
+             parts = tag_name.split(":", 1)  # Split on first ':' only
+             return parts[0], parts[1]
+         else:
+             return tag_name, None
+
+     def to_datahub_tag_urn(self) -> TagUrn:
+         """
+         Generate a DataHub Tag URN from the key and value.
+         This method creates the URN using the original (unprocessed) values.
+
+         Returns:
+             'urn:li:tag:key:value' if value exists, otherwise 'urn:li:tag:key'
+         """
+         if self.value is not None:
+             tag_name = f"{self.key.original}:{self.value.original}"
+         else:
+             tag_name = self.key.original
+
+         return TagUrn(name=tag_name)
+
+     @classmethod
+     def from_urn(cls, tag_urn: Union[str, "TagUrn"]) -> "ExternalTag":
+         """
+         Create an ExternalTag from a DataHub Tag URN.
+
+         Args:
+             tag_urn: DataHub Tag URN string or TagUrn object
+
+         Returns:
+             ExternalTag instance
+         """
+         if isinstance(tag_urn, str):
+             tag_urn = TagUrn.from_string(tag_urn)
+         key, value = cls._parse_tag_name(tag_urn.name)
+         return cls(key=key, value=value)
+
+     @classmethod
+     def from_key_value(cls, key: str, value: Optional[str] = None) -> "ExternalTag":
+         """
+         Create an ExternalTag from explicit key and value.
+
+         Args:
+             key: Tag key
+             value: Optional tag value
+
+         Returns:
+             ExternalTag instance
+         """
+         return cls(key=key, value=value)
+
+     def __str__(self) -> str:
+         """String representation of the tag."""
+         if self.value is not None:
+             return f"{self.key}:{self.value}"
+         else:
+             return str(self.key)
+
+     def __repr__(self) -> str:
+         if self.value is not None:
+             return f"ExternalTag(key={self.key!r}, value={self.value!r})"
+         else:
+             return f"ExternalTag(key={self.key!r})"
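
ExternalTag splits the tag name on the first ":" into a key/value pair and rebuilds the URN from the preserved originals. A usage sketch; the printed values assume RestrictedText leaves short, clean strings unchanged (its rules live in restricted_text.py, which is not shown here):

from datahub.api.entities.external.external_tag import ExternalTag

tag = ExternalTag.from_urn("urn:li:tag:environment:production")
print(tag.key, tag.value, sep=" / ")   # expected: environment / production
print(tag.to_datahub_tag_urn().urn())  # expected: urn:li:tag:environment:production

plain = ExternalTag.from_key_value("pii")
print(plain.to_datahub_tag_urn().urn())  # expected: urn:li:tag:pii (key only, no value)
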
datahub/api/entities/external/lake_formation_external_entites.py ADDED
@@ -0,0 +1,161 @@
+ # Import RestrictedText from your existing module
+ # Uncomment and adjust the import path as needed:
+ # from your_restricted_text_module import RestrictedText
+ # The following is a list of tag constraints:
+ # You can assign a maximum of 50 tags to a single securable object.
+ # The maximum length of a tag key is 255 characters.
+ # The maximum length of a tag value is 1000 characters.
+ # The following characters are not allowed in tag keys:
+ # . , - = / :
+ # Tag search using the workspace search UI is supported only for tables, views, and table columns.
+ # Tag search requires exact term matching.
+ # https://learn.microsoft.com/en-us/azure/databricks/database-objects/tags#constraint
+ from typing import Any, Dict, Optional, Union
+
+ from typing_extensions import ClassVar
+
+ from datahub.api.entities.external.external_tag import ExternalTag
+ from datahub.api.entities.external.restricted_text import RestrictedText
+
+
+ class LakeFormationTagKeyText(RestrictedText):
+     """RestrictedText configured for Unity Catalog tag keys."""
+
+     _default_max_length: ClassVar[int] = 50
+     # Unity Catalog tag keys: alphanumeric, hyphens, underscores, periods only
+     _default_replacement_char: ClassVar[str] = "_"
+     _default_truncation_suffix: ClassVar[str] = ""  # No suffix for clean identifiers
+
+
+ class LakeFormationTagValueText(RestrictedText):
+     """RestrictedText configured for Unity Catalog tag values."""
+
+     _default_max_length: ClassVar[int] = 50
+     # Unity Catalog tag values are more permissive but still have some restrictions
+     _default_replacement_char: ClassVar[str] = " "
+     _default_truncation_suffix: ClassVar[str] = "..."
+
+
+ class LakeFormationTag(ExternalTag):
+     """
+     A tag type specifically designed for LakeFormation tag restrictions.
+
+     LakeFormation Tag Restrictions:
+     - Key: Max 127 characters, alphanumeric + hyphens, underscores, periods only
+     - Value: Max 256 characters, more permissive but no control characters
+     """
+
+     key: LakeFormationTagKeyText
+     value: Optional[LakeFormationTagValueText] = None
+     catalog: Optional[str] = None
+
+     def __init__(
+         self,
+         key: Optional[Union[str, LakeFormationTagKeyText]] = None,
+         value: Optional[Union[str, LakeFormationTagValueText]] = None,
+         **data: Any,
+     ) -> None:
+         """
+         Initialize LakeFormation Tag from either a DataHub Tag URN or explicit key/value.
+
+         Args:
+             key: Explicit key value (optional for Pydantic initialization)
+             value: Explicit value (optional)
+             **data: Additional Pydantic data
+         """
+         if key is not None:
+             # Direct initialization with key/value
+             processed_key = (
+                 LakeFormationTagKeyText(key)
+                 if not isinstance(key, LakeFormationTagKeyText)
+                 else key
+             )
+             processed_value = None
+             if value is not None:
+                 processed_value = (
+                     LakeFormationTagValueText(value)
+                     if not isinstance(value, LakeFormationTagValueText)
+                     else value
+                 )
+
+             super().__init__(
+                 key=processed_key,
+                 value=processed_value,
+                 **data,
+             )
+         else:
+             # Standard pydantic initialization
+             super().__init__(**data)
+
+     def __eq__(self, other: object) -> bool:
+         """Check equality based on key and value."""
+         if not isinstance(other, LakeFormationTag):
+             return False
+         return str(self.key) == str(other.key) and (
+             str(self.value) if self.value else None
+         ) == (str(other.value) if other.value else None)
+
+     def __hash__(self) -> int:
+         """Make LakeFormationTag hashable based on key and value."""
+         return hash((str(self.key), str(self.value) if self.value else None))
+
+     @classmethod
+     def from_dict(cls, tag_dict: Dict[str, Any]) -> "LakeFormationTag":
+         """
+         Create a LakeFormationTag from a dictionary with 'key' and optional 'value'.
+
+         Args:
+             tag_dict: Dictionary with 'key' and optional 'value' keys
+
+         Returns:
+             LakeFormationTag instance
+         """
+         return cls(key=tag_dict["key"], value=tag_dict.get("value"))
+
+     @classmethod
+     def from_key_value(
+         cls, key: str, value: Optional[str] = None
+     ) -> "LakeFormationTag":
+         """
+         Create a LakeFormationTagPlatformResource from explicit key and value.
+
+         Overrides the parent method to return the correct type.
+
+         Args:
+             key: Tag key
+             value: Optional tag value
+
+         Returns:
+             LakeFormationTag instance
+         """
+         return cls(key=key, value=value)
+
+     def to_dict(self) -> Dict[str, str]:
+         """
+         Convert to dictionary format suitable for LakeFormation tag.
+
+         Returns:
+             Dictionary with 'key' and optionally 'value'
+         """
+         result: Dict[str, str] = {"key": self.key.original}
+         if self.value is not None:
+             result["value"] = self.value.original
+         return result
+
+     def to_display_dict(self) -> Dict[str, str]:
+         """
+         Convert to dictionary format showing processed values.
+
+         Returns:
+             Dictionary with processed 'key' and optional 'value'
+         """
+         result: Dict[str, str] = {"key": str(self.key)}
+         if self.value is not None:
+             result["value"] = str(self.value)
+         return result
+
+     def __repr__(self) -> str:
+         if self.value:
+             return f"LakeFormationTag(key={self.key!r}, value={self.value!r})"
+         else:
+             return f"LakeFormationTag(key={self.key!r})"
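
LakeFormationTag keeps the original key/value for round-tripping (to_dict and the inherited to_datahub_tag_urn), while equality and hashing use the processed text, so equivalent tags collapse in sets. A usage sketch under the same assumption about RestrictedText processing:

from datahub.api.entities.external.lake_formation_external_entites import (
    LakeFormationTag,
)

tag = LakeFormationTag.from_dict({"key": "team", "value": "data-platform"})

print(tag.to_dict())                   # {'key': 'team', 'value': 'data-platform'} (originals)
print(tag.to_datahub_tag_urn().urn())  # expected: urn:li:tag:team:data-platform

# __eq__/__hash__ compare processed key/value, so equivalent tags collapse in a set
assert len({tag, LakeFormationTag.from_key_value("team", "data-platform")}) == 1
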