acryl-datahub 1.1.0rc4__py3-none-any.whl → 1.1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (87) hide show
  1. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2414 -2412
  2. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
  3. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +9 -8
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/restricted_text.py +247 -0
  10. datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
  11. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  12. datahub/cli/delete_cli.py +4 -4
  13. datahub/cli/ingest_cli.py +9 -1
  14. datahub/emitter/mce_builder.py +3 -1
  15. datahub/emitter/response_helper.py +86 -1
  16. datahub/emitter/rest_emitter.py +1 -1
  17. datahub/ingestion/graph/client.py +3 -3
  18. datahub/ingestion/source/apply/datahub_apply.py +4 -4
  19. datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
  20. datahub/ingestion/source/data_lake_common/object_store.py +644 -0
  21. datahub/ingestion/source/datahub/config.py +11 -0
  22. datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
  23. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  24. datahub/ingestion/source/dbt/dbt_common.py +30 -11
  25. datahub/ingestion/source/gcs/gcs_source.py +22 -7
  26. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  27. datahub/ingestion/source/hex/query_fetcher.py +9 -3
  28. datahub/ingestion/source/openapi.py +12 -0
  29. datahub/ingestion/source/openapi_parser.py +56 -37
  30. datahub/ingestion/source/s3/source.py +65 -6
  31. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
  33. datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
  34. datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
  35. datahub/ingestion/source/sql/athena.py +1 -0
  36. datahub/ingestion/source/sql/hive.py +2 -3
  37. datahub/ingestion/source/sql/sql_common.py +98 -34
  38. datahub/ingestion/source/sql/sql_types.py +5 -2
  39. datahub/ingestion/source/unity/config.py +5 -0
  40. datahub/ingestion/source/unity/proxy.py +117 -0
  41. datahub/ingestion/source/unity/source.py +167 -15
  42. datahub/ingestion/source/unity/tag_entities.py +295 -0
  43. datahub/metadata/_internal_schema_classes.py +667 -522
  44. datahub/metadata/_urns/urn_defs.py +1804 -1748
  45. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  46. datahub/metadata/schema.avsc +17358 -17584
  47. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  48. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  49. datahub/metadata/schemas/Applications.avsc +38 -0
  50. datahub/metadata/schemas/ChartKey.avsc +1 -0
  51. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  52. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  53. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  54. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  55. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  56. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  57. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  58. datahub/metadata/schemas/DatasetKey.avsc +1 -0
  59. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  60. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  61. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  62. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  63. datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
  64. datahub/metadata/schemas/MLModelKey.avsc +1 -0
  65. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  66. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  67. datahub/metadata/schemas/__init__.py +3 -3
  68. datahub/sdk/__init__.py +6 -0
  69. datahub/sdk/_all_entities.py +11 -0
  70. datahub/sdk/_shared.py +118 -1
  71. datahub/sdk/chart.py +315 -0
  72. datahub/sdk/container.py +7 -0
  73. datahub/sdk/dashboard.py +432 -0
  74. datahub/sdk/dataflow.py +309 -0
  75. datahub/sdk/datajob.py +342 -0
  76. datahub/sdk/dataset.py +8 -2
  77. datahub/sdk/entity_client.py +90 -2
  78. datahub/sdk/lineage_client.py +681 -82
  79. datahub/sdk/main_client.py +27 -8
  80. datahub/sdk/mlmodel.py +101 -38
  81. datahub/sdk/mlmodelgroup.py +7 -0
  82. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  83. datahub/testing/mce_helpers.py +421 -0
  84. datahub/testing/sdk_v2_helpers.py +18 -0
  85. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
  86. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
  87. {acryl_datahub-1.1.0rc4.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,12 @@
1
1
  import logging
2
2
  import re
3
+ import time
3
4
  from concurrent.futures import ThreadPoolExecutor
4
5
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
5
6
  from urllib.parse import urljoin
6
7
 
8
+ from datahub.api.entities.external.external_entities import PlatformResourceRepository
9
+ from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
7
10
  from datahub.emitter.mce_builder import (
8
11
  make_data_platform_urn,
9
12
  make_dataplatform_instance_urn,
@@ -78,6 +81,7 @@ from datahub.ingestion.source.unity.proxy_types import (
78
81
  Catalog,
79
82
  Column,
80
83
  CustomCatalogType,
84
+ HiveTableType,
81
85
  Metastore,
82
86
  Notebook,
83
87
  NotebookId,
@@ -87,8 +91,17 @@ from datahub.ingestion.source.unity.proxy_types import (
87
91
  TableReference,
88
92
  )
89
93
  from datahub.ingestion.source.unity.report import UnityCatalogReport
94
+ from datahub.ingestion.source.unity.tag_entities import (
95
+ UnityCatalogTagPlatformResource,
96
+ UnityCatalogTagPlatformResourceId,
97
+ )
90
98
  from datahub.ingestion.source.unity.usage import UnityCatalogUsageExtractor
91
- from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
99
+ from datahub.metadata.com.linkedin.pegasus2avro.common import (
100
+ GlobalTags,
101
+ MetadataAttribution,
102
+ Siblings,
103
+ TagAssociation,
104
+ )
92
105
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
93
106
  DatasetLineageType,
94
107
  FineGrainedLineage,
@@ -116,6 +129,7 @@ from datahub.metadata.schema_classes import (
116
129
  UpstreamClass,
117
130
  UpstreamLineageClass,
118
131
  )
132
+ from datahub.metadata.urns import TagUrn
119
133
  from datahub.sql_parsing.schema_resolver import SchemaResolver
120
134
  from datahub.sql_parsing.sqlglot_lineage import (
121
135
  SqlParsingResult,
@@ -162,6 +176,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
162
176
  platform: str = "databricks"
163
177
  platform_instance_name: Optional[str]
164
178
  sql_parser_schema_resolver: Optional[SchemaResolver] = None
179
+ platform_resource_repository: Optional[PlatformResourceRepository] = None
165
180
 
166
181
  def get_report(self) -> UnityCatalogReport:
167
182
  return self.report
@@ -211,6 +226,10 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
211
226
 
212
227
  # Global map of tables, for profiling
213
228
  self.tables: FileBackedDict[Table] = FileBackedDict()
229
+ if self.ctx.graph:
230
+ self.platform_resource_repository = PlatformResourceRepository(
231
+ self.ctx.graph
232
+ )
214
233
 
215
234
  def init_hive_metastore_proxy(self):
216
235
  self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
@@ -506,13 +525,42 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
506
525
  yield from self.add_table_to_dataset_container(dataset_urn, schema)
507
526
 
508
527
  table_props = self._create_table_property_aspect(table)
528
+ tags = None
529
+ if not isinstance(table.table_type, HiveTableType) and self.config.include_tags:
530
+ try:
531
+ table_tags = self._get_table_tags(
532
+ table.ref.catalog, table.ref.schema, table.ref.table
533
+ )
534
+ if table_tags:
535
+ logger.debug(f"Table tags for {table.ref}: {table_tags}")
536
+ attribution = MetadataAttribution(
537
+ # source="unity-catalog",
538
+ actor="urn:li:corpuser:datahub",
539
+ time=int(time.time() * 1000),
540
+ )
541
+ tags = GlobalTags(
542
+ tags=[
543
+ TagAssociation(
544
+ tag=tag.to_datahub_tag_urn().urn(),
545
+ attribution=attribution,
546
+ )
547
+ for tag in table_tags
548
+ ]
549
+ )
550
+
551
+ yield from self.gen_platform_resources(table_tags)
552
+
553
+ except Exception as e:
554
+ logger.exception(f"Error fetching table {table.ref} tags", exc_info=e)
509
555
 
510
556
  view_props = None
511
557
  if table.view_definition:
512
558
  view_props = self._create_view_property_aspect(table)
513
559
 
514
560
  sub_type = self._create_table_sub_type_aspect(table)
515
- schema_metadata = self._create_schema_metadata_aspect(table)
561
+ schema_metadata, platform_resources = self._create_schema_metadata_aspect(table)
562
+ yield from platform_resources
563
+
516
564
  domain = self._get_domain_aspect(dataset_name=table.ref.qualified_table_name)
517
565
  ownership = self._create_table_ownership_aspect(table)
518
566
  data_platform_instance = self._create_data_platform_instance_aspect()
@@ -585,6 +633,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
585
633
  domain,
586
634
  data_platform_instance,
587
635
  lineage,
636
+ tags,
588
637
  ],
589
638
  )
590
639
  ]
@@ -718,6 +767,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
718
767
 
719
768
  def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
720
769
  domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
770
+ schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
771
+ schema.catalog.name
772
+ ).get(f"{schema.catalog.name}.{schema.name}", [])
773
+ if schema_tags:
774
+ logger.debug(f"Schema tags for {schema.name}: {schema_tags}")
775
+ # Generate platform resources for schema tags
776
+ yield from self.gen_platform_resources(schema_tags)
721
777
 
722
778
  schema_container_key = self.gen_schema_key(schema)
723
779
  yield from gen_containers(
@@ -729,6 +785,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
729
785
  description=schema.comment,
730
786
  owner_urn=self.get_owner_urn(schema.owner),
731
787
  external_url=f"{self.external_url_base}/{schema.catalog.name}/{schema.name}",
788
+ tags=[tag.to_datahub_tag_urn().urn() for tag in schema_tags]
789
+ if schema_tags
790
+ else None,
732
791
  )
733
792
 
734
793
  def gen_metastore_containers(
@@ -749,6 +808,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
749
808
 
750
809
  def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
751
810
  domain_urn = self._gen_domain_urn(catalog.name)
811
+ catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog.name).get(
812
+ catalog.name, []
813
+ )
814
+ if catalog_tags:
815
+ logger.debug(f"Schema tags for {catalog.name}: {catalog_tags}")
816
+ # Generate platform resources for schema tags
817
+ yield from self.gen_platform_resources(catalog_tags)
752
818
 
753
819
  catalog_container_key = self.gen_catalog_key(catalog)
754
820
  yield from gen_containers(
@@ -764,6 +830,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
764
830
  description=catalog.comment,
765
831
  owner_urn=self.get_owner_urn(catalog.owner),
766
832
  external_url=f"{self.external_url_base}/{catalog.name}",
833
+ tags=[tag.to_datahub_tag_urn().urn() for tag in catalog_tags]
834
+ if catalog_tags
835
+ else None,
767
836
  )
768
837
 
769
838
  def gen_schema_key(self, schema: Schema) -> ContainerKey:
@@ -832,6 +901,30 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
832
901
  dataset_urn=dataset_urn,
833
902
  )
834
903
 
904
+ def _get_catalog_tags(
905
+ self, catalog: str, schema: str, table: str
906
+ ) -> List[UnityCatalogTag]:
907
+ all_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog)
908
+ return all_tags.get(f"{catalog}", [])
909
+
910
+ def _get_schema_tags(
911
+ self, catalog: str, schema: str, table: str
912
+ ) -> List[UnityCatalogTag]:
913
+ all_tags = self.unity_catalog_api_proxy.get_schema_tags(catalog)
914
+ return all_tags.get(f"{catalog}.{schema}", [])
915
+
916
+ def _get_table_tags(
917
+ self, catalog: str, schema: str, table: str
918
+ ) -> List[UnityCatalogTag]:
919
+ all_tags = self.unity_catalog_api_proxy.get_table_tags(catalog)
920
+ return all_tags.get(f"{catalog}.{schema}.{table}", [])
921
+
922
+ def _get_column_tags(
923
+ self, catalog: str, schema: str, table: str, column: str
924
+ ) -> List[UnityCatalogTag]:
925
+ all_tags = self.unity_catalog_api_proxy.get_column_tags(catalog)
926
+ return all_tags.get(f"{catalog}.{schema}.{table}.{column}", [])
927
+
835
928
  def _create_table_property_aspect(self, table: Table) -> DatasetPropertiesClass:
836
929
  custom_properties: dict = {}
837
930
  if table.storage_location is not None:
@@ -921,30 +1014,88 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
921
1014
  materialized=False, viewLanguage="SQL", viewLogic=table.view_definition
922
1015
  )
923
1016
 
924
- def _create_schema_metadata_aspect(self, table: Table) -> SchemaMetadataClass:
925
- schema_fields: List[SchemaFieldClass] = []
1017
+ def gen_platform_resources(
1018
+ self, tags: List[UnityCatalogTag]
1019
+ ) -> Iterable[MetadataWorkUnit]:
1020
+ if self.ctx.graph and self.platform_resource_repository:
1021
+ for tag in tags:
1022
+ platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
1023
+ platform_instance=self.platform_instance_name,
1024
+ platform_resource_repository=self.platform_resource_repository,
1025
+ tag=tag,
1026
+ )
1027
+ logger.debug(f"Created platform resource {platform_resource_id}")
926
1028
 
1029
+ unity_catalog_tag = UnityCatalogTagPlatformResource.get_from_datahub(
1030
+ platform_resource_id, self.platform_resource_repository, False
1031
+ )
1032
+ if (
1033
+ tag.to_datahub_tag_urn().urn()
1034
+ not in unity_catalog_tag.datahub_linked_resources().urns
1035
+ ):
1036
+ unity_catalog_tag.datahub_linked_resources().add(
1037
+ tag.to_datahub_tag_urn().urn()
1038
+ )
1039
+ platform_resource = unity_catalog_tag.as_platform_resource()
1040
+ for mcp in platform_resource.to_mcps():
1041
+ yield MetadataWorkUnit(
1042
+ id=f"platform_resource-{platform_resource.id}",
1043
+ mcp=mcp,
1044
+ )
1045
+
1046
+ def _create_schema_metadata_aspect(
1047
+ self, table: Table
1048
+ ) -> Tuple[SchemaMetadataClass, Iterable[MetadataWorkUnit]]:
1049
+ schema_fields: List[SchemaFieldClass] = []
1050
+ unique_tags: Set[UnityCatalogTag] = set()
927
1051
  for column in table.columns:
928
- schema_fields.extend(self._create_schema_field(column))
929
-
930
- return SchemaMetadataClass(
931
- schemaName=table.id,
932
- platform=make_data_platform_urn(self.platform),
933
- fields=schema_fields,
934
- hash="",
935
- version=0,
936
- platformSchema=MySqlDDLClass(tableSchema=""),
1052
+ tag_urns: Optional[List[TagUrn]] = None
1053
+ if self.config.include_tags:
1054
+ column_tags = self._get_column_tags(
1055
+ table.ref.catalog, table.ref.schema, table.ref.table, column.name
1056
+ )
1057
+ unique_tags.update(column_tags)
1058
+ tag_urns = [tag.to_datahub_tag_urn() for tag in column_tags]
1059
+ schema_fields.extend(self._create_schema_field(column, tag_urns))
1060
+
1061
+ platform_resources = self.gen_platform_resources(list(unique_tags))
1062
+ return (
1063
+ SchemaMetadataClass(
1064
+ schemaName=table.id,
1065
+ platform=make_data_platform_urn(self.platform),
1066
+ fields=schema_fields,
1067
+ hash="",
1068
+ version=0,
1069
+ platformSchema=MySqlDDLClass(tableSchema=""),
1070
+ ),
1071
+ platform_resources,
937
1072
  )
938
1073
 
939
1074
  @staticmethod
940
- def _create_schema_field(column: Column) -> List[SchemaFieldClass]:
1075
+ def _create_schema_field(
1076
+ column: Column, tags: Optional[List[TagUrn]]
1077
+ ) -> List[SchemaFieldClass]:
941
1078
  _COMPLEX_TYPE = re.compile("^(struct|array)")
942
-
1079
+ global_tags: Optional[GlobalTags] = None
943
1080
  if _COMPLEX_TYPE.match(column.type_text.lower()):
944
1081
  return get_schema_fields_for_hive_column(
945
1082
  column.name, column.type_text.lower(), description=column.comment
946
1083
  )
947
1084
  else:
1085
+ if tags is not None:
1086
+ logger.debug(f"Column tags are: {tags}")
1087
+ attribution = MetadataAttribution(
1088
+ source="urn:li:dataPlatform:unity-catalog",
1089
+ actor="urn:li:corpuser:datahub",
1090
+ time=int(time.time() * 1000),
1091
+ )
1092
+ global_tags = GlobalTags(
1093
+ tags=[
1094
+ TagAssociation(tag=tag.urn(), attribution=attribution)
1095
+ for tag in tags
1096
+ ]
1097
+ )
1098
+
948
1099
  return [
949
1100
  SchemaFieldClass(
950
1101
  fieldPath=column.name,
@@ -954,6 +1105,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
954
1105
  nativeDataType=column.type_text,
955
1106
  nullable=column.nullable,
956
1107
  description=column.comment,
1108
+ globalTags=global_tags if tags else None,
957
1109
  )
958
1110
  ]
959
1111
 
@@ -0,0 +1,295 @@
1
+ import logging
2
+ from typing import List, Optional
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from datahub.api.entities.external.external_entities import (
7
+ ExternalEntity,
8
+ ExternalEntityId,
9
+ LinkedResourceSet,
10
+ PlatformResourceRepository,
11
+ )
12
+ from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
13
+ from datahub.api.entities.platformresource.platform_resource import (
14
+ PlatformResource,
15
+ PlatformResourceKey,
16
+ PlatformResourceSearchFields,
17
+ )
18
+ from datahub.ingestion.graph.client import DataHubGraph
19
+ from datahub.metadata.urns import TagUrn
20
+ from datahub.utilities.search_utils import ElasticDocumentQuery
21
+ from datahub.utilities.urns.urn import Urn
22
+
23
+
24
class UnityCatalogTagSyncContext(BaseModel):
    # Context object handed to the tag lookup helpers in this module.
    # Its only field is the optional platform instance used to scope lookups.
    platform_instance: Optional[str] = None
27
+
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
    """
    Unique identifier of a Unity Catalog tag tracked as a DataHub platform resource.

    The identity is the tag key/value pair, scoped by an optional platform
    instance. ``exists_in_unity_catalog`` and ``persisted`` are bookkeeping
    flags that the lookup helpers on this class set when an id is resolved
    from the platform resource repository.
    """

    tag_key: str
    tag_value: Optional[str] = None
    platform_instance: Optional[str]
    exists_in_unity_catalog: bool = False
    persisted: bool = False

    def __hash__(self) -> int:
        # Hash on the platform resource key id so ids can be used in sets/dicts.
        return hash(self.to_platform_resource_key().id)

    # this is a hack to make sure the property is a string and not private pydantic field
    @staticmethod
    def _RESOURCE_TYPE() -> str:
        return "UnityCatalogTagPlatformResource"

    def to_platform_resource_key(self) -> PlatformResourceKey:
        """Build the platform resource key (``<tag_key>:<tag_value>``) for this tag id."""
        return PlatformResourceKey(
            platform="databricks",
            resource_type=str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
            primary_key=f"{self.tag_key}:{self.tag_value}",
            platform_instance=self.platform_instance,
        )

    @classmethod
    def from_tag(
        cls,
        tag: UnityCatalogTag,
        platform_instance: Optional[str],
        platform_resource_repository: PlatformResourceRepository,
        exists_in_unity_catalog: bool = False,
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Creates a UnityCatalogTagPlatformResourceId from a UnityCatalogTag.

        If a platform resource is already linked to the tag's DataHub URN for
        the same platform instance, its id is returned. Otherwise a fresh,
        not-yet-persisted id is built from the tag's original key and value.
        """
        existing_platform_resource = cls.search_by_urn(
            tag.to_datahub_tag_urn().urn(),
            platform_resource_repository=platform_resource_repository,
            tag_sync_context=UnityCatalogTagSyncContext(
                platform_instance=platform_instance
            ),
        )
        if existing_platform_resource:
            logger.info(
                f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.original}: {existing_platform_resource}"
            )
            return existing_platform_resource

        return UnityCatalogTagPlatformResourceId(
            tag_key=tag.key.original,
            tag_value=tag.value.original if tag.value is not None else None,
            platform_instance=platform_instance,
            exists_in_unity_catalog=exists_in_unity_catalog,
            persisted=False,
        )

    @classmethod
    def search_by_urn(
        cls,
        urn: str,
        platform_resource_repository: PlatformResourceRepository,
        tag_sync_context: UnityCatalogTagSyncContext,
    ) -> Optional["UnityCatalogTagPlatformResourceId"]:
        """
        Look up the tag id already linked to a DataHub URN.

        Searches the repository for resources of this resource type whose
        secondary keys contain ``urn`` and returns the id of the first one
        whose platform instance matches ``tag_sync_context``. The returned id
        is flagged as existing in Unity Catalog and as persisted. Returns
        ``None`` when no such resource is found.
        """
        mapped_tags = list(
            platform_resource_repository.search_by_filter(
                ElasticDocumentQuery.create_from(
                    (
                        PlatformResourceSearchFields.RESOURCE_TYPE,
                        str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
                    ),
                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
                )
            )
        )
        logger.info(
            f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
        )
        for platform_resource in mapped_tags:
            resource_info = platform_resource.resource_info
            if not (resource_info and resource_info.value):
                logger.warning(
                    f"Platform resource {platform_resource} does not have a resource_info value"
                )
                continue

            unity_catalog_tag = UnityCatalogTagPlatformResource(
                **resource_info.value.as_pydantic_object(
                    UnityCatalogTagPlatformResource
                ).dict()
            )
            if (
                unity_catalog_tag.id.platform_instance
                == tag_sync_context.platform_instance
            ):
                unity_catalog_tag_id = unity_catalog_tag.id
                unity_catalog_tag_id.exists_in_unity_catalog = True
                unity_catalog_tag_id.persisted = True
                return unity_catalog_tag_id

        # If we reach here, it means we did not find a mapped tag for the URN
        logger.info(
            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new UnityCatalogTagPlatformResourceId."
        )
        return None

    @classmethod
    def from_datahub_urn(
        cls,
        urn: str,
        platform_resource_repository: PlatformResourceRepository,
        tag_sync_context: UnityCatalogTagSyncContext,
        graph: DataHubGraph,
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.

        Raises:
            ValueError: if the URN is not a tag URN (raised by
                ``generate_tag_id``) or no id could be produced for it.
        """
        # First we check if we already have a mapped platform resource for this
        # urn that is of the type UnityCatalogTagPlatformResource
        # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
        # Else, we need to generate a new UnityCatalogTagPlatformResourceId
        existing_platform_resource_id = cls.search_by_urn(
            urn, platform_resource_repository, tag_sync_context
        )
        if existing_platform_resource_id:
            logger.info(
                f"Found existing UnityCatalogTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
            )
            return existing_platform_resource_id

        # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
        new_unity_catalog_tag_id = cls.generate_tag_id(graph, tag_sync_context, urn)
        if new_unity_catalog_tag_id:
            # we then check if this tag has already been ingested as a platform
            # resource in the platform resource repository
            resource_key = platform_resource_repository.get(
                new_unity_catalog_tag_id.to_platform_resource_key()
            )
            if resource_key:
                logger.info(
                    f"Tag {new_unity_catalog_tag_id} already exists in platform resource repository with {resource_key}"
                )
                new_unity_catalog_tag_id.exists_in_unity_catalog = (
                    True  # TODO: Check if this is a safe assumption
                )
            return new_unity_catalog_tag_id
        raise ValueError(
            f"Unable to create UnityCatalogTagPlatformResourceId from DataHub URN: {urn}"
        )

    @classmethod
    def generate_tag_id(
        cls, graph: DataHubGraph, tag_sync_context: UnityCatalogTagSyncContext, urn: str
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Build a new tag id from a DataHub URN string.

        Only URNs whose entity type is ``tag`` are supported.

        Raises:
            ValueError: if the URN's entity type is anything other than ``tag``.
        """
        entity_type = Urn.from_string(urn).entity_type
        if entity_type != "tag":
            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
        return UnityCatalogTagPlatformResourceId.from_datahub_tag(
            TagUrn.from_string(urn), tag_sync_context
        )

    @classmethod
    def from_datahub_tag(
        cls, tag_urn: TagUrn, tag_sync_context: UnityCatalogTagSyncContext
    ) -> "UnityCatalogTagPlatformResourceId":
        """Convert a DataHub tag URN into a tag id scoped to the context's platform instance."""
        uc_tag = UnityCatalogTag.from_urn(tag_urn)

        return UnityCatalogTagPlatformResourceId(
            tag_key=str(uc_tag.key),
            tag_value=str(uc_tag.value) if uc_tag.value is not None else None,
            platform_instance=tag_sync_context.platform_instance,
            exists_in_unity_catalog=False,
        )
215
+
216
+
217
class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
    # Platform-resource representation of a Unity Catalog tag: the tag id plus
    # the set of DataHub URNs linked to it.

    datahub_urns: LinkedResourceSet
    managed_by_datahub: bool
    id: UnityCatalogTagPlatformResourceId
    allowed_values: Optional[List[str]]

    def get_id(self) -> ExternalEntityId:
        return self.id

    def is_managed_by_datahub(self) -> bool:
        return self.managed_by_datahub

    def datahub_linked_resources(self) -> LinkedResourceSet:
        return self.datahub_urns

    def as_platform_resource(self) -> PlatformResource:
        """Wrap this tag as a PlatformResource, using the linked URNs as secondary keys."""
        return PlatformResource.create(
            key=self.id.to_platform_resource_key(),
            secondary_keys=list(self.datahub_urns.urns),
            value=self,
        )

    @classmethod
    def get_from_datahub(
        cls,
        unity_catalog_tag_id: UnityCatalogTagPlatformResourceId,
        platform_resource_repository: PlatformResourceRepository,
        managed_by_datahub: bool = False,
    ) -> "UnityCatalogTagPlatformResource":
        """
        Load the platform resource stored for a tag id, or build an empty one.

        Searches the repository by resource type and primary key. When exactly
        one resource is found it is returned as-is; when several are found, the
        first one whose platform instance matches ``unity_catalog_tag_id`` is
        returned. If nothing usable is found, a new resource with no linked
        URNs is returned.
        """
        # Use the same primary key that to_platform_resource_key() persists
        # ("<tag_key>:<tag_value>"); a differently formatted value would never
        # match the stored resource.
        primary_key = unity_catalog_tag_id.to_platform_resource_key().primary_key

        # Search for linked DataHub URNs
        platform_resources = list(
            platform_resource_repository.search_by_filter(
                ElasticDocumentQuery.create_from(
                    (
                        PlatformResourceSearchFields.RESOURCE_TYPE,
                        str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
                    ),
                    (
                        PlatformResourceSearchFields.PRIMARY_KEY,
                        primary_key,
                    ),
                )
            )
        )

        # A single hit is accepted regardless of platform instance; with
        # multiple hits the platform instance is used to disambiguate.
        single_match = len(platform_resources) == 1
        for platform_resource in platform_resources:
            resource_info = platform_resource.resource_info
            if not (resource_info and resource_info.value):
                continue
            unity_catalog_tag = UnityCatalogTagPlatformResource(
                **resource_info.value.as_pydantic_object(
                    UnityCatalogTagPlatformResource
                ).dict()
            )
            if (
                single_match
                or unity_catalog_tag.id.platform_instance
                == unity_catalog_tag_id.platform_instance
            ):
                return unity_catalog_tag

        return cls(
            id=unity_catalog_tag_id,
            datahub_urns=LinkedResourceSet(urns=[]),
            managed_by_datahub=managed_by_datahub,
            allowed_values=None,
        )