acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,12 @@
1
1
  import logging
2
2
  import re
3
+ import time
3
4
  from concurrent.futures import ThreadPoolExecutor
4
5
  from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
5
6
  from urllib.parse import urljoin
6
7
 
8
+ from datahub.api.entities.external.external_entities import PlatformResourceRepository
9
+ from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
7
10
  from datahub.emitter.mce_builder import (
8
11
  make_data_platform_urn,
9
12
  make_dataplatform_instance_urn,
@@ -53,6 +56,7 @@ from datahub.ingestion.source.aws.s3_util import (
53
56
  from datahub.ingestion.source.common.subtypes import (
54
57
  DatasetContainerSubTypes,
55
58
  DatasetSubTypes,
59
+ SourceCapabilityModifier,
56
60
  )
57
61
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
58
62
  StaleEntityRemovalHandler,
@@ -78,6 +82,7 @@ from datahub.ingestion.source.unity.proxy_types import (
78
82
  Catalog,
79
83
  Column,
80
84
  CustomCatalogType,
85
+ HiveTableType,
81
86
  Metastore,
82
87
  Notebook,
83
88
  NotebookId,
@@ -87,8 +92,17 @@ from datahub.ingestion.source.unity.proxy_types import (
87
92
  TableReference,
88
93
  )
89
94
  from datahub.ingestion.source.unity.report import UnityCatalogReport
95
+ from datahub.ingestion.source.unity.tag_entities import (
96
+ UnityCatalogTagPlatformResource,
97
+ UnityCatalogTagPlatformResourceId,
98
+ )
90
99
  from datahub.ingestion.source.unity.usage import UnityCatalogUsageExtractor
91
- from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
100
+ from datahub.metadata.com.linkedin.pegasus2avro.common import (
101
+ GlobalTags,
102
+ MetadataAttribution,
103
+ Siblings,
104
+ TagAssociation,
105
+ )
92
106
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
93
107
  DatasetLineageType,
94
108
  FineGrainedLineage,
@@ -116,6 +130,7 @@ from datahub.metadata.schema_classes import (
116
130
  UpstreamClass,
117
131
  UpstreamLineageClass,
118
132
  )
133
+ from datahub.metadata.urns import TagUrn
119
134
  from datahub.sql_parsing.schema_resolver import SchemaResolver
120
135
  from datahub.sql_parsing.sqlglot_lineage import (
121
136
  SqlParsingResult,
@@ -138,16 +153,24 @@ logger: logging.Logger = logging.getLogger(__name__)
138
153
  @capability(SourceCapability.USAGE_STATS, "Enabled by default")
139
154
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
140
155
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
141
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
156
+ @capability(
157
+ SourceCapability.CONTAINERS,
158
+ "Enabled by default",
159
+ subtype_modifier=[
160
+ SourceCapabilityModifier.CATALOG,
161
+ SourceCapabilityModifier.SCHEMA,
162
+ ],
163
+ )
142
164
  @capability(SourceCapability.OWNERSHIP, "Supported via the `include_ownership` config")
143
165
  @capability(
144
166
  SourceCapability.DATA_PROFILING, "Supported via the `profiling.enabled` config"
145
167
  )
146
168
  @capability(
147
169
  SourceCapability.DELETION_DETECTION,
148
- "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
170
+ "Enabled by default via stateful ingestion",
149
171
  supported=True,
150
172
  )
173
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
151
174
  @support_status(SupportStatus.INCUBATING)
152
175
  class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
153
176
  """
@@ -162,6 +185,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
162
185
  platform: str = "databricks"
163
186
  platform_instance_name: Optional[str]
164
187
  sql_parser_schema_resolver: Optional[SchemaResolver] = None
188
+ platform_resource_repository: Optional[PlatformResourceRepository] = None
165
189
 
166
190
  def get_report(self) -> UnityCatalogReport:
167
191
  return self.report
@@ -211,6 +235,10 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
211
235
 
212
236
  # Global map of tables, for profiling
213
237
  self.tables: FileBackedDict[Table] = FileBackedDict()
238
+ if self.ctx.graph:
239
+ self.platform_resource_repository = PlatformResourceRepository(
240
+ self.ctx.graph
241
+ )
214
242
 
215
243
  def init_hive_metastore_proxy(self):
216
244
  self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
@@ -506,13 +534,42 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
506
534
  yield from self.add_table_to_dataset_container(dataset_urn, schema)
507
535
 
508
536
  table_props = self._create_table_property_aspect(table)
537
+ tags = None
538
+ if not isinstance(table.table_type, HiveTableType) and self.config.include_tags:
539
+ try:
540
+ table_tags = self._get_table_tags(
541
+ table.ref.catalog, table.ref.schema, table.ref.table
542
+ )
543
+ if table_tags:
544
+ logger.debug(f"Table tags for {table.ref}: {table_tags}")
545
+ attribution = MetadataAttribution(
546
+ # source="unity-catalog",
547
+ actor="urn:li:corpuser:datahub",
548
+ time=int(time.time() * 1000),
549
+ )
550
+ tags = GlobalTags(
551
+ tags=[
552
+ TagAssociation(
553
+ tag=tag.to_datahub_tag_urn().urn(),
554
+ attribution=attribution,
555
+ )
556
+ for tag in table_tags
557
+ ]
558
+ )
559
+
560
+ yield from self.gen_platform_resources(table_tags)
561
+
562
+ except Exception as e:
563
+ logger.exception(f"Error fetching table {table.ref} tags", exc_info=e)
509
564
 
510
565
  view_props = None
511
566
  if table.view_definition:
512
567
  view_props = self._create_view_property_aspect(table)
513
568
 
514
569
  sub_type = self._create_table_sub_type_aspect(table)
515
- schema_metadata = self._create_schema_metadata_aspect(table)
570
+ schema_metadata, platform_resources = self._create_schema_metadata_aspect(table)
571
+ yield from platform_resources
572
+
516
573
  domain = self._get_domain_aspect(dataset_name=table.ref.qualified_table_name)
517
574
  ownership = self._create_table_ownership_aspect(table)
518
575
  data_platform_instance = self._create_data_platform_instance_aspect()
@@ -585,6 +642,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
585
642
  domain,
586
643
  data_platform_instance,
587
644
  lineage,
645
+ tags,
588
646
  ],
589
647
  )
590
648
  ]
@@ -718,6 +776,14 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
718
776
 
719
777
  def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
720
778
  domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
779
+ schema_tags = []
780
+ if self.config.include_tags:
781
+ schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
782
+ schema.catalog.name
783
+ ).get(f"{schema.catalog.name}.{schema.name}", [])
784
+ logger.debug(f"Schema tags for {schema.name}: {schema_tags}")
785
+ # Generate platform resources for schema tags
786
+ yield from self.gen_platform_resources(schema_tags)
721
787
 
722
788
  schema_container_key = self.gen_schema_key(schema)
723
789
  yield from gen_containers(
@@ -729,6 +795,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
729
795
  description=schema.comment,
730
796
  owner_urn=self.get_owner_urn(schema.owner),
731
797
  external_url=f"{self.external_url_base}/{schema.catalog.name}/{schema.name}",
798
+ tags=[tag.to_datahub_tag_urn().name for tag in schema_tags]
799
+ if schema_tags
800
+ else None,
732
801
  )
733
802
 
734
803
  def gen_metastore_containers(
@@ -749,6 +818,14 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
749
818
 
750
819
  def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
751
820
  domain_urn = self._gen_domain_urn(catalog.name)
821
+ catalog_tags = []
822
+ if self.config.include_tags:
823
+ catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(
824
+ catalog.name
825
+ ).get(catalog.name, [])
826
+ logger.debug(f"Schema tags for {catalog.name}: {catalog_tags}")
827
+ # Generate platform resources for schema tags
828
+ yield from self.gen_platform_resources(catalog_tags)
752
829
 
753
830
  catalog_container_key = self.gen_catalog_key(catalog)
754
831
  yield from gen_containers(
@@ -764,6 +841,9 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
764
841
  description=catalog.comment,
765
842
  owner_urn=self.get_owner_urn(catalog.owner),
766
843
  external_url=f"{self.external_url_base}/{catalog.name}",
844
+ tags=[tag.to_datahub_tag_urn().name for tag in catalog_tags]
845
+ if catalog_tags
846
+ else None,
767
847
  )
768
848
 
769
849
  def gen_schema_key(self, schema: Schema) -> ContainerKey:
@@ -832,6 +912,30 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
832
912
  dataset_urn=dataset_urn,
833
913
  )
834
914
 
915
+ def _get_catalog_tags(
916
+ self, catalog: str, schema: str, table: str
917
+ ) -> List[UnityCatalogTag]:
918
+ all_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog)
919
+ return all_tags.get(f"{catalog}", [])
920
+
921
+ def _get_schema_tags(
922
+ self, catalog: str, schema: str, table: str
923
+ ) -> List[UnityCatalogTag]:
924
+ all_tags = self.unity_catalog_api_proxy.get_schema_tags(catalog)
925
+ return all_tags.get(f"{catalog}.{schema}", [])
926
+
927
+ def _get_table_tags(
928
+ self, catalog: str, schema: str, table: str
929
+ ) -> List[UnityCatalogTag]:
930
+ all_tags = self.unity_catalog_api_proxy.get_table_tags(catalog)
931
+ return all_tags.get(f"{catalog}.{schema}.{table}", [])
932
+
933
+ def _get_column_tags(
934
+ self, catalog: str, schema: str, table: str, column: str
935
+ ) -> List[UnityCatalogTag]:
936
+ all_tags = self.unity_catalog_api_proxy.get_column_tags(catalog)
937
+ return all_tags.get(f"{catalog}.{schema}.{table}.{column}", [])
938
+
835
939
  def _create_table_property_aspect(self, table: Table) -> DatasetPropertiesClass:
836
940
  custom_properties: dict = {}
837
941
  if table.storage_location is not None:
@@ -921,30 +1025,103 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
921
1025
  materialized=False, viewLanguage="SQL", viewLogic=table.view_definition
922
1026
  )
923
1027
 
924
- def _create_schema_metadata_aspect(self, table: Table) -> SchemaMetadataClass:
925
- schema_fields: List[SchemaFieldClass] = []
1028
+ def gen_platform_resources(
1029
+ self, tags: List[UnityCatalogTag]
1030
+ ) -> Iterable[MetadataWorkUnit]:
1031
+ if self.ctx.graph and self.platform_resource_repository:
1032
+ for tag in tags:
1033
+ try:
1034
+ platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
1035
+ platform_instance=self.platform_instance_name,
1036
+ platform_resource_repository=self.platform_resource_repository,
1037
+ tag=tag,
1038
+ )
1039
+ logger.debug(f"Created platform resource {platform_resource_id}")
926
1040
 
1041
+ unity_catalog_tag = (
1042
+ UnityCatalogTagPlatformResource.get_from_datahub(
1043
+ platform_resource_id,
1044
+ self.platform_resource_repository,
1045
+ False,
1046
+ )
1047
+ )
1048
+ if (
1049
+ tag.to_datahub_tag_urn().urn()
1050
+ not in unity_catalog_tag.datahub_linked_resources().urns
1051
+ ):
1052
+ unity_catalog_tag.datahub_linked_resources().add(
1053
+ tag.to_datahub_tag_urn().urn()
1054
+ )
1055
+ platform_resource = unity_catalog_tag.as_platform_resource()
1056
+ for mcp in platform_resource.to_mcps():
1057
+ yield MetadataWorkUnit(
1058
+ id=f"platform_resource-{platform_resource.id}",
1059
+ mcp=mcp,
1060
+ )
1061
+ except Exception as e:
1062
+ logger.exception(
1063
+ f"Error processing platform resource for tag {tag}"
1064
+ )
1065
+ self.report.report_warning(
1066
+ message="Error processing platform resource for tag",
1067
+ context=str(tag),
1068
+ title="Error processing platform resource for tag",
1069
+ exc=e,
1070
+ )
1071
+ continue
1072
+
1073
+ def _create_schema_metadata_aspect(
1074
+ self, table: Table
1075
+ ) -> Tuple[SchemaMetadataClass, Iterable[MetadataWorkUnit]]:
1076
+ schema_fields: List[SchemaFieldClass] = []
1077
+ unique_tags: Set[UnityCatalogTag] = set()
927
1078
  for column in table.columns:
928
- schema_fields.extend(self._create_schema_field(column))
929
-
930
- return SchemaMetadataClass(
931
- schemaName=table.id,
932
- platform=make_data_platform_urn(self.platform),
933
- fields=schema_fields,
934
- hash="",
935
- version=0,
936
- platformSchema=MySqlDDLClass(tableSchema=""),
1079
+ tag_urns: Optional[List[TagUrn]] = None
1080
+ if self.config.include_tags:
1081
+ column_tags = self._get_column_tags(
1082
+ table.ref.catalog, table.ref.schema, table.ref.table, column.name
1083
+ )
1084
+ unique_tags.update(column_tags)
1085
+ tag_urns = [tag.to_datahub_tag_urn() for tag in column_tags]
1086
+ schema_fields.extend(self._create_schema_field(column, tag_urns))
1087
+
1088
+ platform_resources = self.gen_platform_resources(list(unique_tags))
1089
+ return (
1090
+ SchemaMetadataClass(
1091
+ schemaName=table.id,
1092
+ platform=make_data_platform_urn(self.platform),
1093
+ fields=schema_fields,
1094
+ hash="",
1095
+ version=0,
1096
+ platformSchema=MySqlDDLClass(tableSchema=""),
1097
+ ),
1098
+ platform_resources,
937
1099
  )
938
1100
 
939
1101
  @staticmethod
940
- def _create_schema_field(column: Column) -> List[SchemaFieldClass]:
1102
+ def _create_schema_field(
1103
+ column: Column, tags: Optional[List[TagUrn]]
1104
+ ) -> List[SchemaFieldClass]:
941
1105
  _COMPLEX_TYPE = re.compile("^(struct|array)")
942
-
1106
+ global_tags: Optional[GlobalTags] = None
943
1107
  if _COMPLEX_TYPE.match(column.type_text.lower()):
944
1108
  return get_schema_fields_for_hive_column(
945
1109
  column.name, column.type_text.lower(), description=column.comment
946
1110
  )
947
1111
  else:
1112
+ if tags is not None:
1113
+ attribution = MetadataAttribution(
1114
+ source="urn:li:dataPlatform:unity-catalog",
1115
+ actor="urn:li:corpuser:datahub",
1116
+ time=int(time.time() * 1000),
1117
+ )
1118
+ global_tags = GlobalTags(
1119
+ tags=[
1120
+ TagAssociation(tag=tag.urn(), attribution=attribution)
1121
+ for tag in tags
1122
+ ]
1123
+ )
1124
+
948
1125
  return [
949
1126
  SchemaFieldClass(
950
1127
  fieldPath=column.name,
@@ -954,6 +1131,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
954
1131
  nativeDataType=column.type_text,
955
1132
  nullable=column.nullable,
956
1133
  description=column.comment,
1134
+ globalTags=global_tags if tags else None,
957
1135
  )
958
1136
  ]
959
1137
 
@@ -0,0 +1,295 @@
1
+ import logging
2
+ from typing import List, Optional
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from datahub.api.entities.external.external_entities import (
7
+ ExternalEntity,
8
+ ExternalEntityId,
9
+ LinkedResourceSet,
10
+ PlatformResourceRepository,
11
+ )
12
+ from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
13
+ from datahub.api.entities.platformresource.platform_resource import (
14
+ PlatformResource,
15
+ PlatformResourceKey,
16
+ PlatformResourceSearchFields,
17
+ )
18
+ from datahub.ingestion.graph.client import DataHubGraph
19
+ from datahub.metadata.urns import TagUrn
20
+ from datahub.utilities.search_utils import ElasticDocumentQuery
21
+ from datahub.utilities.urns.urn import Urn
22
+
23
+
24
+ class UnityCatalogTagSyncContext(BaseModel):
25
+ # it is intentionally empty
26
+ platform_instance: Optional[str] = None
27
+
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ class UnityCatalogTagPlatformResourceId(BaseModel, ExternalEntityId):
33
+ """
34
+ A SnowflakeTagId is a unique identifier for a Snowflake tag.
35
+ """
36
+
37
+ tag_key: str
38
+ tag_value: Optional[str] = None
39
+ platform_instance: Optional[str]
40
+ exists_in_unity_catalog: bool = False
41
+ persisted: bool = False
42
+
43
+ def __hash__(self) -> int:
44
+ return hash(self.to_platform_resource_key().id)
45
+
46
+ # this is a hack to make sure the property is a string and not private pydantic field
47
+ @staticmethod
48
+ def _RESOURCE_TYPE() -> str:
49
+ return "UnityCatalogTagPlatformResource"
50
+
51
+ def to_platform_resource_key(self) -> PlatformResourceKey:
52
+ return PlatformResourceKey(
53
+ platform="databricks",
54
+ resource_type=str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
55
+ primary_key=f"{self.tag_key}:{self.tag_value}",
56
+ platform_instance=self.platform_instance,
57
+ )
58
+
59
+ @classmethod
60
+ def from_tag(
61
+ cls,
62
+ tag: UnityCatalogTag,
63
+ platform_instance: Optional[str],
64
+ platform_resource_repository: PlatformResourceRepository,
65
+ exists_in_unity_catalog: bool = False,
66
+ ) -> "UnityCatalogTagPlatformResourceId":
67
+ """
68
+ Creates a UnityCatalogTagPlatformResourceId from a UnityCatalogTag.
69
+ """
70
+
71
+ existing_platform_resource = cls.search_by_urn(
72
+ tag.to_datahub_tag_urn().urn(),
73
+ platform_resource_repository=platform_resource_repository,
74
+ tag_sync_context=UnityCatalogTagSyncContext(
75
+ platform_instance=platform_instance
76
+ ),
77
+ )
78
+ if existing_platform_resource:
79
+ logger.info(
80
+ f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.original}: {existing_platform_resource}"
81
+ )
82
+ return existing_platform_resource
83
+
84
+ return UnityCatalogTagPlatformResourceId(
85
+ tag_key=tag.key.original,
86
+ tag_value=tag.value.original if tag.value is not None else None,
87
+ platform_instance=platform_instance,
88
+ exists_in_unity_catalog=exists_in_unity_catalog,
89
+ persisted=False,
90
+ )
91
+
92
+ @classmethod
93
+ def search_by_urn(
94
+ cls,
95
+ urn: str,
96
+ platform_resource_repository: PlatformResourceRepository,
97
+ tag_sync_context: UnityCatalogTagSyncContext,
98
+ ) -> Optional["UnityCatalogTagPlatformResourceId"]:
99
+ mapped_tags = [
100
+ t
101
+ for t in platform_resource_repository.search_by_filter(
102
+ ElasticDocumentQuery.create_from(
103
+ (
104
+ PlatformResourceSearchFields.RESOURCE_TYPE,
105
+ str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
106
+ ),
107
+ (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
108
+ )
109
+ )
110
+ ]
111
+ logger.info(
112
+ f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
113
+ )
114
+ if len(mapped_tags) > 0:
115
+ for platform_resource in mapped_tags:
116
+ if (
117
+ platform_resource.resource_info
118
+ and platform_resource.resource_info.value
119
+ ):
120
+ unity_catalog_tag = UnityCatalogTagPlatformResource(
121
+ **platform_resource.resource_info.value.as_pydantic_object(
122
+ UnityCatalogTagPlatformResource
123
+ ).dict()
124
+ )
125
+ if (
126
+ unity_catalog_tag.id.platform_instance
127
+ == tag_sync_context.platform_instance
128
+ ):
129
+ unity_catalog_tag_id = unity_catalog_tag.id
130
+ unity_catalog_tag_id.exists_in_unity_catalog = True
131
+ unity_catalog_tag_id.persisted = True
132
+ return unity_catalog_tag_id
133
+ else:
134
+ logger.warning(
135
+ f"Platform resource {platform_resource} does not have a resource_info value"
136
+ )
137
+ continue
138
+
139
+ # If we reach here, it means we did not find a mapped tag for the URN
140
+ logger.info(
141
+ f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new UnityCatalogTagPlatformResourceId."
142
+ )
143
+ return None
144
+
145
+ @classmethod
146
+ def from_datahub_urn(
147
+ cls,
148
+ urn: str,
149
+ platform_resource_repository: PlatformResourceRepository,
150
+ tag_sync_context: UnityCatalogTagSyncContext,
151
+ graph: DataHubGraph,
152
+ ) -> "UnityCatalogTagPlatformResourceId":
153
+ """
154
+ Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
155
+ """
156
+ # First we check if we already have a mapped platform resource for this
157
+ # urn that is of the type UnityCatalogTagPlatformResource
158
+ # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
159
+ # Else, we need to generate a new UnityCatalogTagPlatformResourceId
160
+ existing_platform_resource_id = cls.search_by_urn(
161
+ urn, platform_resource_repository, tag_sync_context
162
+ )
163
+ if existing_platform_resource_id:
164
+ logger.info(
165
+ f"Found existing UnityCatalogTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
166
+ )
167
+ return existing_platform_resource_id
168
+
169
+ # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
170
+ new_unity_catalog_tag_id = cls.generate_tag_id(graph, tag_sync_context, urn)
171
+ if new_unity_catalog_tag_id:
172
+ # we then check if this tag has already been ingested as a platform
173
+ # resource in the platform resource repository
174
+ resource_key = platform_resource_repository.get(
175
+ new_unity_catalog_tag_id.to_platform_resource_key()
176
+ )
177
+ if resource_key:
178
+ logger.info(
179
+ f"Tag {new_unity_catalog_tag_id} already exists in platform resource repository with {resource_key}"
180
+ )
181
+ new_unity_catalog_tag_id.exists_in_unity_catalog = (
182
+ True # TODO: Check if this is a safe assumption
183
+ )
184
+ return new_unity_catalog_tag_id
185
+ raise ValueError(f"Unable to create SnowflakeTagId from DataHub URN: {urn}")
186
+
187
+ @classmethod
188
+ def generate_tag_id(
189
+ cls, graph: DataHubGraph, tag_sync_context: UnityCatalogTagSyncContext, urn: str
190
+ ) -> "UnityCatalogTagPlatformResourceId":
191
+ parsed_urn = Urn.from_string(urn)
192
+ entity_type = parsed_urn.entity_type
193
+ if entity_type == "tag":
194
+ new_unity_catalog_tag_id = (
195
+ UnityCatalogTagPlatformResourceId.from_datahub_tag(
196
+ TagUrn.from_string(urn), tag_sync_context
197
+ )
198
+ )
199
+ else:
200
+ raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
201
+ return new_unity_catalog_tag_id
202
+
203
+ @classmethod
204
+ def from_datahub_tag(
205
+ cls, tag_urn: TagUrn, tag_sync_context: UnityCatalogTagSyncContext
206
+ ) -> "UnityCatalogTagPlatformResourceId":
207
+ uc_tag = UnityCatalogTag.from_urn(tag_urn)
208
+
209
+ return UnityCatalogTagPlatformResourceId(
210
+ tag_key=str(uc_tag.key),
211
+ tag_value=str(uc_tag.value) if uc_tag.value is not None else None,
212
+ platform_instance=tag_sync_context.platform_instance,
213
+ exists_in_unity_catalog=False,
214
+ )
215
+
216
+
217
+ class UnityCatalogTagPlatformResource(BaseModel, ExternalEntity):
218
+ datahub_urns: LinkedResourceSet
219
+ managed_by_datahub: bool
220
+ id: UnityCatalogTagPlatformResourceId
221
+ allowed_values: Optional[List[str]]
222
+
223
+ def get_id(self) -> ExternalEntityId:
224
+ return self.id
225
+
226
+ def is_managed_by_datahub(self) -> bool:
227
+ return self.managed_by_datahub
228
+
229
+ def datahub_linked_resources(self) -> LinkedResourceSet:
230
+ return self.datahub_urns
231
+
232
+ def as_platform_resource(self) -> PlatformResource:
233
+ return PlatformResource.create(
234
+ key=self.id.to_platform_resource_key(),
235
+ secondary_keys=[u for u in self.datahub_urns.urns],
236
+ value=self,
237
+ )
238
+
239
+ @classmethod
240
+ def get_from_datahub(
241
+ cls,
242
+ unity_catalog_tag_id: UnityCatalogTagPlatformResourceId,
243
+ platform_resource_repository: PlatformResourceRepository,
244
+ managed_by_datahub: bool = False,
245
+ ) -> "UnityCatalogTagPlatformResource":
246
+ # Search for linked DataHub URNs
247
+ platform_resources = [
248
+ r
249
+ for r in platform_resource_repository.search_by_filter(
250
+ ElasticDocumentQuery.create_from(
251
+ (
252
+ PlatformResourceSearchFields.RESOURCE_TYPE,
253
+ str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
254
+ ),
255
+ (
256
+ PlatformResourceSearchFields.PRIMARY_KEY,
257
+ f"{unity_catalog_tag_id.tag_key}/{unity_catalog_tag_id.tag_value}",
258
+ ),
259
+ )
260
+ )
261
+ ]
262
+ if len(platform_resources) == 1:
263
+ platform_resource: PlatformResource = platform_resources[0]
264
+ if (
265
+ platform_resource.resource_info
266
+ and platform_resource.resource_info.value
267
+ ):
268
+ unity_catalog_tag = UnityCatalogTagPlatformResource(
269
+ **platform_resource.resource_info.value.as_pydantic_object(
270
+ UnityCatalogTagPlatformResource
271
+ ).dict()
272
+ )
273
+ return unity_catalog_tag
274
+ else:
275
+ for platform_resource in platform_resources:
276
+ if (
277
+ platform_resource.resource_info
278
+ and platform_resource.resource_info.value
279
+ ):
280
+ unity_catalog_tag = UnityCatalogTagPlatformResource(
281
+ **platform_resource.resource_info.value.as_pydantic_object(
282
+ UnityCatalogTagPlatformResource
283
+ ).dict()
284
+ )
285
+ if (
286
+ unity_catalog_tag.id.platform_instance
287
+ == unity_catalog_tag_id.platform_instance
288
+ ):
289
+ return unity_catalog_tag
290
+ return cls(
291
+ id=unity_catalog_tag_id,
292
+ datahub_urns=LinkedResourceSet(urns=[]),
293
+ managed_by_datahub=managed_by_datahub,
294
+ allowed_values=None,
295
+ )
@@ -85,8 +85,11 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
85
85
  @platform_name("ClickHouse")
86
86
  @config_class(ClickHouseUsageConfig)
87
87
  @support_status(SupportStatus.CERTIFIED)
88
- @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
88
+ @capability(
89
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
90
+ )
89
91
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
92
+ @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
90
93
  @dataclasses.dataclass
91
94
  class ClickHouseUsageSource(Source):
92
95
  """
@@ -15,7 +15,9 @@ from sqlalchemy.engine import Engine
15
15
  import datahub.emitter.mce_builder as builder
16
16
  from datahub.configuration.time_window_config import get_time_bucket
17
17
  from datahub.ingestion.api.decorators import (
18
+ SourceCapability,
18
19
  SupportStatus,
20
+ capability,
19
21
  config_class,
20
22
  platform_name,
21
23
  support_status,
@@ -112,6 +114,7 @@ class TrinoUsageReport(SourceReport):
112
114
  @platform_name("Trino")
113
115
  @config_class(TrinoUsageConfig)
114
116
  @support_status(SupportStatus.CERTIFIED)
117
+ @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
115
118
  @dataclasses.dataclass
116
119
  class TrinoUsageSource(Source):
117
120
  """