acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged by the registry.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/glue.py (+489 -244)

@@ -25,6 +25,12 @@ from pydantic import validator
 from pydantic.fields import Field
 
 from datahub.api.entities.dataset.dataset import Dataset
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.emitter import mce_builder
@@ -62,6 +68,10 @@ from datahub.ingestion.source.aws.s3_util import (
     make_s3_urn,
     make_s3_urn_for_lineage,
 )
+from datahub.ingestion.source.aws.tag_entities import (
+    LakeFormationTagPlatformResource,
+    LakeFormationTagPlatformResourceId,
+)
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -114,6 +124,7 @@ from datahub.metadata.schema_classes import (
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.urns.error import InvalidUrnError
 
 logger = logging.getLogger(__name__)
 
@@ -168,6 +179,12 @@ class GlueSourceConfig(
         default=False,
         description="If an S3 Objects Tags should be created for the Tables ingested by Glue.",
     )
+
+    extract_lakeformation_tags: Optional[bool] = Field(
+        default=False,
+        description="When True, extracts Lake Formation tags directly assigned to Glue tables/databases. Note: Tags inherited from databases or other parent resources are excluded.",
+    )
+
     profiling: GlueProfilingConfig = Field(
         default_factory=GlueProfilingConfig,
         description="Configs to ingest data profiles from glue table",
@@ -176,6 +193,7 @@ class GlueSourceConfig(
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
         default=None, description=""
     )
+
     extract_delta_schema_from_parameters: Optional[bool] = Field(
         default=False,
         description="If enabled, delta schemas can be alternatively fetched from table parameters.",
@@ -199,6 +217,10 @@ class GlueSourceConfig(
     def s3_client(self):
         return self.get_s3_client()
 
+    @property
+    def lakeformation_client(self):
+        return self.get_lakeformation_client()
+
     @validator("glue_s3_lineage_direction")
     def check_direction(cls, v: str) -> str:
         if v.lower() not in ["upstream", "downstream"]:
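The config surface for the new tag extraction is the `extract_lakeformation_tags` flag plus the `lakeformation_client` property shown above. A minimal sketch of enabling it programmatically (not taken from the diff; it assumes `aws_region` is the only required connection field and uses a placeholder region):

    from datahub.ingestion.source.aws.glue import GlueSourceConfig

    config = GlueSourceConfig(
        aws_region="us-east-1",           # placeholder region
        extract_lakeformation_tags=True,  # new in this release
    )
    print(config.extract_lakeformation_tags)  # True

In a YAML recipe the same keys would sit under source.config for the glue source type.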
@@ -247,7 +269,7 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default when stateful ingestion is turned on.",
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
@@ -311,6 +333,8 @@ class GlueSource(StatefulIngestionSourceBase):
     source_config: GlueSourceConfig
     report: GlueSourceReport
 
+    lf_tag_cache: Dict[str, Dict[str, List[str]]] = {}
+
     def __init__(self, config: GlueSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.ctx = ctx
@@ -320,9 +344,114 @@ class GlueSource(StatefulIngestionSourceBase):
         self.report.catalog_id = self.source_config.catalog_id
         self.glue_client = config.glue_client
         self.s3_client = config.s3_client
+        # Initialize Lake Formation client
+        self.lf_client = config.lakeformation_client
         self.extract_transforms = config.extract_transforms
         self.env = config.env
 
+        self.platform_resource_repository: Optional[PlatformResourceRepository] = None
+        if self.ctx.graph:
+            self.platform_resource_repository = PlatformResourceRepository(
+                self.ctx.graph
+            )
+
+    def get_database_lf_tags(
+        self,
+        catalog_id: str,
+        database_name: str,
+    ) -> List[LakeFormationTag]:
+        """Get all LF tags for a specific table."""
+        try:
+            # Get LF tags for the specified table
+            response = self.lf_client.get_resource_lf_tags(
+                CatalogId=catalog_id,
+                Resource={
+                    "Database": {
+                        "CatalogId": catalog_id,
+                        "Name": database_name,
+                    }
+                },
+                ShowAssignedLFTags=True,
+            )
+
+            if response:
+                logger.info(f"LF tags for database {database_name}: {response}")
+            # Extract and return the LF tags
+            lf_tags = response.get("LFTagOnDatabase", [])
+
+            tags = []
+            for lf_tag in lf_tags:
+                catalog_id = lf_tag.get("CatalogId")
+                tag_key = lf_tag.get("TagKey")
+                for tag_value in lf_tag.get("TagValues", []):
+                    t = LakeFormationTag(
+                        key=tag_key,
+                        value=tag_value,
+                        catalog_id=catalog_id,
+                    )
+                    tags.append(t)
+            return tags
+
+        except Exception as e:
+            print(
+                f"Error getting LF tags for table {catalog_id}.{database_name}: {str(e)}"
+            )
+            return []
+
+    def get_table_lf_tags(
+        self,
+        catalog_id: str,
+        database_name: str,
+        table_name: str,
+    ) -> List[LakeFormationTag]:
+        """Get all LF tags for a specific table."""
+        try:
+            # Get LF tags for the specified table
+            response = self.lf_client.get_resource_lf_tags(
+                CatalogId=catalog_id,
+                Resource={
+                    "Table": {
+                        "CatalogId": catalog_id,
+                        "DatabaseName": database_name,
+                        "Name": table_name,
+                    },
+                },
+                ShowAssignedLFTags=True,
+            )
+
+            # Extract and return the LF tags
+            lf_tags = response.get("LFTagsOnTable", [])
+
+            tags = []
+            for lf_tag in lf_tags:
+                catalog_id = lf_tag.get("CatalogId")
+                tag_key = lf_tag.get("TagKey")
+                for tag_value in lf_tag.get("TagValues", []):
+                    t = LakeFormationTag(
+                        key=tag_key,
+                        value=tag_value,
+                        catalog_id=catalog_id,
+                    )
+                    tags.append(t)
+            return tags
+
+        except Exception:
+            return []
+
+    def get_all_lf_tags(self) -> List:
+        # 1. Get all LF-Tags in your account (metadata only)
+        response = self.lf_client.list_lf_tags(
+            MaxResults=50  # Adjust as needed
+        )
+        all_lf_tags = response["LFTags"]
+        # Continue pagination if necessary
+        while "NextToken" in response:
+            response = self.lf_client.list_lf_tags(
+                NextToken=response["NextToken"], MaxResults=50
+            )
+            all_lf_tags.extend(response["LFTags"])
+        return all_lf_tags
+
     def get_glue_arn(
         self, account_id: str, database: str, table: Optional[str] = None
     ) -> str:
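`get_database_lf_tags` and `get_table_lf_tags` are thin wrappers around the Lake Formation API, and `get_all_lf_tags` pages through `list_lf_tags` manually. A standalone sketch of the underlying boto3 call the table variant wraps (account id, region, database, and table names are placeholders):

    import boto3

    lf_client = boto3.client("lakeformation", region_name="us-east-1")  # placeholder region

    response = lf_client.get_resource_lf_tags(
        CatalogId="123456789012",  # placeholder account id
        Resource={
            "Table": {
                "CatalogId": "123456789012",
                "DatabaseName": "my_database",  # placeholder
                "Name": "my_table",             # placeholder
            }
        },
        ShowAssignedLFTags=True,
    )

    # Each entry carries a TagKey plus one or more TagValues; the source flattens
    # them into one LakeFormationTag per (key, value) pair.
    for lf_tag in response.get("LFTagsOnTable", []):
        for value in lf_tag.get("TagValues", []):
            print(lf_tag.get("TagKey"), value)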
@@ -869,7 +998,7 @@ class GlueSource(StatefulIngestionSourceBase):
         table_stats: dict,
         column_stats: dict,
         partition_spec: Optional[str] = None,
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         assert self.source_config.profiling
 
         # instantiate profile class
@@ -936,6 +1065,14 @@ class GlueSource(StatefulIngestionSourceBase):
 
             dataset_profile.fieldProfiles.append(column_profile)
 
+        # if no stats are available, skip ingestion
+        if (
+            not dataset_profile.fieldProfiles
+            and dataset_profile.rowCount is None
+            and dataset_profile.columnCount is None
+        ):
+            return None
+
         if partition_spec:
             # inject partition level stats
             dataset_profile.partitionSpec = PartitionSpecClass(
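The new guard in `_create_profile_mcp` skips profiles that carry no information at all. A small illustration of the condition it checks, using `DatasetProfileClass` directly (the zero timestamp is a placeholder):

    from datahub.metadata.schema_classes import DatasetProfileClass

    profile = DatasetProfileClass(timestampMillis=0)  # no rows, columns, or field profiles
    is_empty = (
        not profile.fieldProfiles
        and profile.rowCount is None
        and profile.columnCount is None
    )
    print(is_empty)  # True -> the method now returns None instead of emitting this profile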
@@ -990,18 +1127,20 @@ class GlueSource(StatefulIngestionSourceBase):
                 if self.source_config.profiling.partition_patterns.allowed(
                     partition_spec
                 ):
-                    yield self._create_profile_mcp(
+                    profile_mcp = self._create_profile_mcp(
                         mce, table_stats, column_stats, partition_spec
-                    ).as_workunit()
+                    )
+                    if profile_mcp:
+                        yield profile_mcp.as_workunit()
                 else:
                     continue
         else:
             # ingest data profile without partition
             table_stats = response["Table"]["Parameters"]
             column_stats = response["Table"]["StorageDescriptor"]["Columns"]
-            yield self._create_profile_mcp(
-                mce, table_stats, column_stats
-            ).as_workunit()
+            profile_mcp = self._create_profile_mcp(mce, table_stats, column_stats)
+            if profile_mcp:
+                yield profile_mcp.as_workunit()
 
     def gen_database_key(self, database: str) -> DatabaseKey:
         return DatabaseKey(
@@ -1012,9 +1151,66 @@ class GlueSource(StatefulIngestionSourceBase):
             backcompat_env_as_instance=True,
         )
 
+    def gen_platform_resource(
+        self, tag: LakeFormationTag
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.ctx.graph and self.platform_resource_repository:
+            platform_resource_id = LakeFormationTagPlatformResourceId.from_tag(
+                platform_instance=self.source_config.platform_instance,
+                platform_resource_repository=self.platform_resource_repository,
+                catalog=tag.catalog,
+                tag=tag,
+            )
+            logger.info(f"Created platform resource {platform_resource_id}")
+
+            lf_tag = LakeFormationTagPlatformResource.get_from_datahub(
+                platform_resource_id, self.platform_resource_repository, False
+            )
+            if (
+                tag.to_datahub_tag_urn().urn()
+                not in lf_tag.datahub_linked_resources().urns
+            ):
+                try:
+                    lf_tag.datahub_linked_resources().add(
+                        tag.to_datahub_tag_urn().urn()
+                    )
+                    platform_resource = lf_tag.as_platform_resource()
+                    for mcp in platform_resource.to_mcps():
+                        yield MetadataWorkUnit(
+                            id=f"platform_resource-{platform_resource.id}",
+                            mcp=mcp,
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to create platform resource for tag {tag}: {e}",
+                        exc_info=True,
+                    )
+                    self.report.report_warning(
+                        context="Failed to create platform resource",
+                        message=f"Failed to create platform resource for Tag: {tag}",
+                    )
+
     def gen_database_containers(
         self, database: Mapping[str, Any]
     ) -> Iterable[MetadataWorkUnit]:
+        container_tags: Optional[List] = None
+        if self.source_config.extract_lakeformation_tags:
+            try:
+                tags = self.get_database_lf_tags(
+                    catalog_id=database["CatalogId"], database_name=database["Name"]
+                )
+                container_tags = []
+                for tag in tags:
+                    try:
+                        container_tags.append(tag.to_datahub_tag_urn().name)
+                        yield from self.gen_platform_resource(tag)
+                    except InvalidUrnError:
+                        continue
+            except Exception:
+                self.report_warning(
+                    reason="Failed to extract Lake Formation tags for database",
+                    key=database["Name"],
+                )
         domain_urn = self._gen_domain_urn(database["Name"])
         database_container_key = self.gen_database_key(database["Name"])
         parameters = database.get("Parameters", {})
@@ -1032,6 +1228,7 @@ class GlueSource(StatefulIngestionSourceBase):
             qualified_name=self.get_glue_arn(
                 account_id=database["CatalogId"], database=database["Name"]
             ),
+            tags=container_tags,
             extra_properties=parameters,
         )
 
@@ -1106,9 +1303,8 @@ class GlueSource(StatefulIngestionSourceBase):
             platform_instance=self.source_config.platform_instance,
         )
 
-        mce = self._extract_record(dataset_urn, table, full_table_name)
-        yield MetadataWorkUnit(full_table_name, mce=mce)
-
+        yield from self._extract_record(dataset_urn, table, full_table_name)
+        # generate a Dataset snapshot
         # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
         # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
         yield MetadataChangeProposalWrapper(
@@ -1124,19 +1320,6 @@ class GlueSource(StatefulIngestionSourceBase):
             dataset_urn=dataset_urn, db_name=database_name
         )
 
-        wu = self.get_lineage_if_enabled(mce)
-        if wu:
-            yield wu
-
-        try:
-            yield from self.get_profile_if_enabled(mce, database_name, table_name)
-        except KeyError as e:
-            self.report.report_failure(
-                message="Failed to extract profile for table",
-                context=f"Table: {dataset_urn}",
-                exc=e,
-            )
-
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
         flow_names: Dict[str, str] = {}
@@ -1191,159 +1374,201 @@ class GlueSource(StatefulIngestionSourceBase):
         for dataset_id, dataset_mce in zip(new_dataset_ids, new_dataset_mces):
             yield MetadataWorkUnit(id=dataset_id, mce=dataset_mce)
 
-    # flake8: noqa: C901
     def _extract_record(
         self, dataset_urn: str, table: Dict, table_name: str
-    ) -> MetadataChangeEvent:
+    ) -> Iterable[MetadataWorkUnit]:
+        """Extract and yield metadata work units for a Glue table."""
         logger.debug(
             f"extract record from table={table_name} for dataset={dataset_urn}"
         )
 
-        def get_dataset_properties() -> DatasetPropertiesClass:
-            return DatasetPropertiesClass(
-                description=table.get("Description"),
-                customProperties={
-                    **table.get("Parameters", {}),
-                    **{
-                        k: str(v)
-                        for k, v in table.get("StorageDescriptor", {}).items()
-                        if k not in ["Columns", "Parameters"]
-                    },
-                },
-                uri=table.get("Location"),
-                tags=[],
-                name=table["Name"],
-                qualifiedName=self.get_glue_arn(
-                    account_id=table["CatalogId"],
-                    database=table["DatabaseName"],
-                    table=table["Name"],
-                ),
-            )
+        # Create the main dataset snapshot
+        dataset_snapshot = DatasetSnapshot(
+            urn=dataset_urn,
+            aspects=[
+                Status(removed=False),
+                self._get_dataset_properties(table),
+            ],
+        )
+
+        # Add schema metadata if available
+        schema_metadata = self._get_schema_metadata(table, table_name, dataset_urn)
+        if schema_metadata:
+            dataset_snapshot.aspects.append(schema_metadata)
+
+        # Add platform instance
+        dataset_snapshot.aspects.append(self._get_data_platform_instance())
 
-        def get_s3_tags() -> Optional[GlobalTagsClass]:
-            # when TableType=VIRTUAL_VIEW the Location can be empty and we should
-            # return no tags rather than fail the entire ingestion
-            if table.get("StorageDescriptor", {}).get("Location") is None:
-                return None
-            bucket_name = s3_util.get_bucket_name(
-                table["StorageDescriptor"]["Location"]
+        # Add ownership if enabled
+        if self.extract_owners:
+            ownership = GlueSource._get_ownership(table.get("Owner"))
+            if ownership:
+                dataset_snapshot.aspects.append(ownership)
+
+        # Add S3 tags if enabled
+        s3_tags = self._get_s3_tags(table, dataset_urn)
+        if s3_tags:
+            dataset_snapshot.aspects.append(s3_tags)
+
+        # Add Lake Formation tags if enabled
+        if self.source_config.extract_lakeformation_tags:
+            tags = self.get_table_lf_tags(
+                catalog_id=table["CatalogId"],
+                database_name=table["DatabaseName"],
+                table_name=table["Name"],
             )
-            tags_to_add = []
-            if self.source_config.use_s3_bucket_tags:
-                try:
-                    bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
-                    tags_to_add.extend(
-                        [
-                            make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
-                            for tag in bucket_tags["TagSet"]
-                        ]
-                    )
-                except self.s3_client.exceptions.ClientError:
-                    logger.warning(f"No tags found for bucket={bucket_name}")
-            if self.source_config.use_s3_object_tags:
-                key_prefix = s3_util.get_key_prefix(
-                    table["StorageDescriptor"]["Location"]
-                )
-                object_tagging = self.s3_client.get_object_tagging(
-                    Bucket=bucket_name, Key=key_prefix
-                )
-                tag_set = object_tagging["TagSet"]
-                if tag_set:
-                    tags_to_add.extend(
-                        [
-                            make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
-                            for tag in tag_set
-                        ]
-                    )
-                else:
-                    # Unlike bucket tags, if an object does not have tags, it will just return an empty array
-                    # as opposed to an exception.
-                    logger.warning(
-                        f"No tags found for bucket={bucket_name} key={key_prefix}"
-                    )
-            if len(tags_to_add) == 0:
-                return None
-            if self.ctx.graph is not None:
-                logger.debug(
-                    "Connected to DatahubApi, grabbing current tags to maintain."
-                )
-                current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect(
-                    entity_urn=dataset_urn,
-                    aspect_type=GlobalTagsClass,
-                )
-                if current_tags:
-                    tags_to_add.extend(
-                        [current_tag.tag for current_tag in current_tags.tags]
-                    )
-            else:
-                logger.warning(
-                    "Could not connect to DatahubApi. No current tags to maintain"
-                )
-            # Remove duplicate tags
-            tags_to_add = sorted(list(set(tags_to_add)))
-            new_tags = GlobalTagsClass(
-                tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
+
+            global_tags = self._get_lake_formation_tags(tags)
+            if global_tags:
+                dataset_snapshot.aspects.append(global_tags)
+            # Generate platform resources for LF tags
+            for tag in tags:
+                yield from self.gen_platform_resource(tag)
+
+        # Create and yield the main metadata work unit
+        metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        yield MetadataWorkUnit(table_name, mce=metadata_record)
+
+        # Add lineage if enabled
+        lineage_wu = self.get_lineage_if_enabled(metadata_record)
+        if lineage_wu:
+            yield lineage_wu
+
+        # Add profile if enabled
+        try:
+            yield from self.get_profile_if_enabled(
+                metadata_record, table["DatabaseName"], table["Name"]
             )
-            return new_tags
-
-        def _is_delta_schema(
-            provider: str, num_parts: int, columns: Optional[List[Mapping[str, Any]]]
-        ) -> bool:
-            return (
-                (self.source_config.extract_delta_schema_from_parameters is True)
-                and (provider == "delta")
-                and (num_parts > 0)
-                and (columns is not None)
-                and (len(columns) == 1)
-                and (columns[0].get("Name", "") == "col")
-                and (columns[0].get("Type", "") == "array<string>")
+        except KeyError as e:
+            self.report.report_failure(
+                message="Failed to extract profile for table",
+                context=f"Table: {dataset_urn}",
+                exc=e,
             )
 
-        def get_schema_metadata() -> Optional[SchemaMetadata]:
-            # As soon as the hive integration with Spark is correctly providing the schema as expected in the
-            # StorageProperties, the alternative path to fetch schema from table parameters for delta schemas can be removed.
-            # https://github.com/delta-io/delta/pull/2310
-            provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "")
-            num_parts = int(
-                table.get("Parameters", {}).get(
-                    "spark.sql.sources.schema.numParts", "0"
-                )
-            )
-            columns = table.get("StorageDescriptor", {}).get("Columns", [{}])
+    def _get_dataset_properties(self, table: Dict) -> DatasetPropertiesClass:
+        """Extract dataset properties from Glue table."""
+        storage_descriptor = table.get("StorageDescriptor", {})
+        custom_properties = {
+            **table.get("Parameters", {}),
+            **{
+                k: str(v)
+                for k, v in storage_descriptor.items()
+                if k not in ["Columns", "Parameters"]
+            },
+        }
 
-            if _is_delta_schema(provider, num_parts, columns):
-                return _get_delta_schema_metadata()
+        return DatasetPropertiesClass(
+            description=table.get("Description"),
+            customProperties=custom_properties,
+            uri=table.get("Location"),
+            tags=[],
+            name=table["Name"],
+            qualifiedName=self.get_glue_arn(
+                account_id=table["CatalogId"],
+                database=table["DatabaseName"],
+                table=table["Name"],
+            ),
+        )
 
-            elif table.get("StorageDescriptor"):
-                return _get_glue_schema_metadata()
+    def _get_schema_metadata(
+        self, table: Dict, table_name: str, dataset_urn: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Glue table."""
+        if not table.get("StorageDescriptor"):
+            return None
 
-            else:
-                return None
+        # Check if this is a delta table with schema in parameters
+        if self._is_delta_schema(table):
+            return self._get_delta_schema_metadata(table, table_name, dataset_urn)
+        else:
+            return self._get_glue_schema_metadata(table, table_name)
 
-        def _get_glue_schema_metadata() -> Optional[SchemaMetadata]:
-            schema = table["StorageDescriptor"]["Columns"]
-            fields: List[SchemaField] = []
-            for field in schema:
-                schema_fields = get_schema_fields_for_hive_column(
-                    hive_column_name=field["Name"],
-                    hive_column_type=field["Type"],
-                    description=field.get("Comment"),
-                    default_nullable=True,
-                )
-                assert schema_fields
+    def _is_delta_schema(self, table: Dict) -> bool:
+        """Check if table uses delta format with schema in parameters."""
+        if not self.source_config.extract_delta_schema_from_parameters:
+            return False
+
+        provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "")
+        num_parts = int(
+            table.get("Parameters", {}).get("spark.sql.sources.schema.numParts", "0")
+        )
+        columns = table.get("StorageDescriptor", {}).get("Columns", [])
+
+        return (
+            provider == "delta"
+            and num_parts > 0
+            and columns
+            and len(columns) == 1
+            and columns[0].get("Name", "") == "col"
+            and columns[0].get("Type", "") == "array<string>"
+        )
+
+    def _get_glue_schema_metadata(
+        self, table: Dict, table_name: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Glue table columns."""
+        schema = table["StorageDescriptor"]["Columns"]
+        fields: List[SchemaField] = []
+
+        # Process regular columns
+        for field in schema:
+            schema_fields = get_schema_fields_for_hive_column(
+                hive_column_name=field["Name"],
+                hive_column_type=field["Type"],
+                description=field.get("Comment"),
+                default_nullable=True,
+            )
+            if schema_fields:
                 fields.extend(schema_fields)
 
-            partition_keys = table.get("PartitionKeys", [])
-            for partition_key in partition_keys:
+        # Process partition keys
+        partition_keys = table.get("PartitionKeys", [])
+        for partition_key in partition_keys:
+            schema_fields = get_schema_fields_for_hive_column(
+                hive_column_name=partition_key["Name"],
+                hive_column_type=partition_key.get("Type", "unknown"),
+                description=partition_key.get("Comment"),
+                default_nullable=False,
+            )
+            if schema_fields:
+                fields.extend(schema_fields)
+
+        return SchemaMetadata(
+            schemaName=table_name,
+            version=0,
+            fields=fields,
+            platform=f"urn:li:dataPlatform:{self.platform}",
+            hash="",
+            platformSchema=MySqlDDL(tableSchema=""),
+        )
+
+    def _get_delta_schema_metadata(
+        self, table: Dict, table_name: str, dataset_urn: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Delta table parameters."""
+        try:
+            # Reconstruct schema from parameters
+            num_parts = int(table["Parameters"]["spark.sql.sources.schema.numParts"])
+            schema_str = "".join(
+                table["Parameters"][f"spark.sql.sources.schema.part.{i}"]
+                for i in range(num_parts)
+            )
+            schema_json = json.loads(schema_str)
+
+            fields: List[SchemaField] = []
+            for field in schema_json["fields"]:
+                field_type = delta_type_to_hive_type(field.get("type", "unknown"))
                 schema_fields = get_schema_fields_for_hive_column(
-                    hive_column_name=partition_key["Name"],
-                    hive_column_type=partition_key.get("Type", "unknown"),
-                    description=partition_key.get("Comment"),
-                    default_nullable=False,
+                    hive_column_name=field["name"],
+                    hive_column_type=field_type,
+                    description=field.get("description"),
+                    default_nullable=bool(field.get("nullable", True)),
                 )
-                assert schema_fields
-                fields.extend(schema_fields)
+                if schema_fields:
+                    fields.extend(schema_fields)
 
+            self.report.num_dataset_valid_delta_schema += 1
             return SchemaMetadata(
                 schemaName=table_name,
                 version=0,
@@ -1353,108 +1578,128 @@ class GlueSource(StatefulIngestionSourceBase):
                 platformSchema=MySqlDDL(tableSchema=""),
             )
 
-        def _get_delta_schema_metadata() -> Optional[SchemaMetadata]:
-            assert (
-                table["Parameters"]["spark.sql.sources.provider"] == "delta"
-                and int(table["Parameters"]["spark.sql.sources.schema.numParts"]) > 0
+        except Exception as e:
+            self.report_warning(
+                dataset_urn,
+                f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}",
             )
+            self.report.num_dataset_invalid_delta_schema += 1
+            return None
 
-            try:
-                numParts = int(table["Parameters"]["spark.sql.sources.schema.numParts"])
-                schema_str = "".join(
-                    [
-                        table["Parameters"][f"spark.sql.sources.schema.part.{i}"]
-                        for i in range(numParts)
-                    ]
+    def _get_data_platform_instance(self) -> DataPlatformInstanceClass:
+        """Get data platform instance aspect."""
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=(
+                make_dataplatform_instance_urn(
+                    self.platform, self.source_config.platform_instance
                 )
-                schema_json = json.loads(schema_str)
-                fields: List[SchemaField] = []
-                for field in schema_json["fields"]:
-                    field_type = delta_type_to_hive_type(field.get("type", "unknown"))
-                    schema_fields = get_schema_fields_for_hive_column(
-                        hive_column_name=field["name"],
-                        hive_column_type=field_type,
-                        description=field.get("description"),
-                        default_nullable=bool(field.get("nullable", True)),
-                    )
-                    assert schema_fields
-                    fields.extend(schema_fields)
+                if self.source_config.platform_instance
+                else None
+            ),
+        )
 
-                self.report.num_dataset_valid_delta_schema += 1
-                return SchemaMetadata(
-                    schemaName=table_name,
-                    version=0,
-                    fields=fields,
-                    platform=f"urn:li:dataPlatform:{self.platform}",
-                    hash="",
-                    platformSchema=MySqlDDL(tableSchema=""),
-                )
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def _get_ownership(owner: str) -> Optional[OwnershipClass]:
+        """Get ownership aspect for a given owner."""
+        if not owner:
+            return None
 
-            except Exception as e:
-                self.report_warning(
-                    dataset_urn,
-                    f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}",
-                )
-                self.report.num_dataset_invalid_delta_schema += 1
-                return None
-
-        def get_data_platform_instance() -> DataPlatformInstanceClass:
-            return DataPlatformInstanceClass(
-                platform=make_data_platform_urn(self.platform),
-                instance=(
-                    make_dataplatform_instance_urn(
-                        self.platform, self.source_config.platform_instance
-                    )
-                    if self.source_config.platform_instance
-                    else None
-                ),
+        owners = [
+            OwnerClass(
+                owner=mce_builder.make_user_urn(owner),
+                type=OwnershipTypeClass.DATAOWNER,
             )
+        ]
+        return OwnershipClass(owners=owners)
 
-        @lru_cache(maxsize=None)
-        def _get_ownership(owner: str) -> Optional[OwnershipClass]:
-            if owner:
-                owners = [
-                    OwnerClass(
-                        owner=mce_builder.make_user_urn(owner),
-                        type=OwnershipTypeClass.DATAOWNER,
-                    )
-                ]
-                return OwnershipClass(
-                    owners=owners,
-                )
+    def _get_s3_tags(self, table: Dict, dataset_urn: str) -> Optional[GlobalTagsClass]:
+        """Extract S3 tags if enabled."""
+        if not (
+            self.source_config.use_s3_bucket_tags
+            or self.source_config.use_s3_object_tags
+        ):
             return None
 
-        dataset_snapshot = DatasetSnapshot(
-            urn=dataset_urn,
-            aspects=[
-                Status(removed=False),
-                get_dataset_properties(),
-            ],
-        )
+        # Check if table has a location (VIRTUAL_VIEW tables may not)
+        location = table.get("StorageDescriptor", {}).get("Location")
+        if not location:
+            return None
 
-        schema_metadata = get_schema_metadata()
-        if schema_metadata:
-            dataset_snapshot.aspects.append(schema_metadata)
+        bucket_name = s3_util.get_bucket_name(location)
+        tags_to_add: List[str] = []
 
-        dataset_snapshot.aspects.append(get_data_platform_instance())
+        # Get bucket tags
+        if self.source_config.use_s3_bucket_tags:
+            try:
+                bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
+                tags_to_add.extend(
+                    make_tag_urn(f"{tag['Key']}:{tag['Value']}")
+                    for tag in bucket_tags["TagSet"]
+                )
+            except self.s3_client.exceptions.ClientError:
+                logger.warning(f"No tags found for bucket={bucket_name}")
 
-        # Ownership
-        if self.extract_owners:
-            owner = table.get("Owner")
-            optional_owner_aspect = _get_ownership(owner)
-            if optional_owner_aspect is not None:
-                dataset_snapshot.aspects.append(optional_owner_aspect)
+        # Get object tags
+        if self.source_config.use_s3_object_tags:
+            key_prefix = s3_util.get_key_prefix(location)
+            try:
+                object_tagging = self.s3_client.get_object_tagging(
+                    Bucket=bucket_name, Key=key_prefix
+                )
+                if object_tagging["TagSet"]:
+                    tags_to_add.extend(
+                        make_tag_urn(f"{tag['Key']}:{tag['Value']}")
+                        for tag in object_tagging["TagSet"]
+                    )
+                else:
+                    logger.warning(
+                        f"No tags found for bucket={bucket_name} key={key_prefix}"
+                    )
+            except Exception as e:
+                logger.warning(f"Failed to get object tags: {e}")
 
-        if (
-            self.source_config.use_s3_bucket_tags
-            or self.source_config.use_s3_object_tags
-        ):
-            s3_tags = get_s3_tags()
-            if s3_tags is not None:
-                dataset_snapshot.aspects.append(s3_tags)
+        if not tags_to_add:
+            return None
 
-        metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        return metadata_record
+        # Merge with existing tags if connected to DataHub API
+        if self.ctx.graph:
+            logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
+            current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect(
+                entity_urn=dataset_urn, aspect_type=GlobalTagsClass
+            )
+            if current_tags:
+                tags_to_add.extend(current_tag.tag for current_tag in current_tags.tags)
+        else:
+            logger.warning(
+                "Could not connect to DatahubApi. No current tags to maintain"
+            )
+
+        # Remove duplicates and create tags
+        unique_tags = sorted(set(tags_to_add))
+        return GlobalTagsClass(tags=[TagAssociationClass(tag) for tag in unique_tags])
+
+    def _get_lake_formation_tags(
+        self, tags: List[LakeFormationTag]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract Lake Formation tags if enabled."""
+        tag_urns: List[str] = []
+        for tag in tags:
+            try:
+                tag_urns.append(tag.to_datahub_tag_urn().urn())
+            except InvalidUrnError as e:
+                logger.warning(
+                    f"Invalid Lake Formation tag URN for {tag}: {e}", exc_info=True
+                )
+                continue  # Skip invalid tags
+
+        tag_urns.sort()  # Sort to maintain consistent order
+        return (
+            GlobalTagsClass(tags=[TagAssociationClass(tag_urn) for tag_urn in tag_urns])
+            if tag_urns
+            else None
+        )
 
     def get_report(self):
         return self.report
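For reference, the S3 tag handling in `_get_s3_tags` builds DataHub tag URNs by joining each S3 tag's key and value with a colon; `make_tag_urn` only prepends the `urn:li:tag:` prefix. A one-line sketch with a hypothetical tag:

    from datahub.emitter.mce_builder import make_tag_urn

    print(make_tag_urn("team:data-platform"))  # urn:li:tag:team:data-platform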