acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/glue.py

@@ -25,6 +25,9 @@ from pydantic import validator
 from pydantic.fields import Field
 
 from datahub.api.entities.dataset.dataset import Dataset
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.emitter import mce_builder
@@ -57,14 +60,21 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws import s3_util
 from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
+from datahub.ingestion.source.aws.platform_resource_repository import (
+    GluePlatformResourceRepository,
+)
 from datahub.ingestion.source.aws.s3_util import (
     is_s3_uri,
     make_s3_urn,
     make_s3_urn_for_lineage,
 )
+from datahub.ingestion.source.aws.tag_entities import (
+    LakeFormationTagPlatformResourceId,
+)
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.glue_profiling_config import GlueProfilingConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -114,6 +124,7 @@ from datahub.metadata.schema_classes import (
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.urns.error import InvalidUrnError
 
 logger = logging.getLogger(__name__)
 
@@ -168,6 +179,12 @@ class GlueSourceConfig(
         default=False,
         description="If an S3 Objects Tags should be created for the Tables ingested by Glue.",
     )
+
+    extract_lakeformation_tags: Optional[bool] = Field(
+        default=False,
+        description="When True, extracts Lake Formation tags directly assigned to Glue tables/databases. Note: Tags inherited from databases or other parent resources are excluded.",
+    )
+
     profiling: GlueProfilingConfig = Field(
         default_factory=GlueProfilingConfig,
         description="Configs to ingest data profiles from glue table",
@@ -176,6 +193,7 @@ class GlueSourceConfig(
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
         default=None, description=""
     )
+
     extract_delta_schema_from_parameters: Optional[bool] = Field(
         default=False,
         description="If enabled, delta schemas can be alternatively fetched from table parameters.",
@@ -199,6 +217,10 @@ class GlueSourceConfig(
     def s3_client(self):
         return self.get_s3_client()
 
+    @property
+    def lakeformation_client(self):
+        return self.get_lakeformation_client()
+
     @validator("glue_s3_lineage_direction")
     def check_direction(cls, v: str) -> str:
         if v.lower() not in ["upstream", "downstream"]:
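The `get_lakeformation_client` factory itself is not shown in this hunk; it presumably sits on `AwsSourceConfig` in `aws_common.py` (changed by +185/-13 in this release), alongside the existing Glue and S3 client factories. Under that assumption, it would boil down to a standard boto3 client, roughly:

    # Sketch only: the real factory is expected to live in aws_common.py.
    # boto3 exposes Lake Formation as the "lakeformation" service.
    import boto3

    lf_client = boto3.client("lakeformation", region_name="us-east-1")
    tags = lf_client.list_lf_tags(MaxResults=50)  # same API call used below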
@@ -247,12 +269,19 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default when stateful ingestion is turned on.",
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
 )
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+    ],
+)
 class GlueSource(StatefulIngestionSourceBase):
     """
     Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.
@@ -311,6 +340,8 @@ class GlueSource(StatefulIngestionSourceBase):
     source_config: GlueSourceConfig
     report: GlueSourceReport
 
+    lf_tag_cache: Dict[str, Dict[str, List[str]]] = {}
+
     def __init__(self, config: GlueSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.ctx = ctx
@@ -320,9 +351,118 @@ class GlueSource(StatefulIngestionSourceBase):
         self.report.catalog_id = self.source_config.catalog_id
         self.glue_client = config.glue_client
         self.s3_client = config.s3_client
+        # Initialize Lake Formation client
+        self.lf_client = config.lakeformation_client
         self.extract_transforms = config.extract_transforms
         self.env = config.env
 
+        self.platform_resource_repository: Optional[
+            "GluePlatformResourceRepository"
+        ] = None
+        if self.ctx.graph:
+            self.platform_resource_repository = GluePlatformResourceRepository(
+                self.ctx.graph,
+                platform_instance=self.source_config.platform_instance,
+                catalog=self.source_config.catalog_id,
+            )
+
+    def get_database_lf_tags(
+        self,
+        catalog_id: str,
+        database_name: str,
+    ) -> List[LakeFormationTag]:
+        """Get all LF tags for a specific database."""
+        try:
+            # Get LF tags for the specified database
+            response = self.lf_client.get_resource_lf_tags(
+                CatalogId=catalog_id,
+                Resource={
+                    "Database": {
+                        "CatalogId": catalog_id,
+                        "Name": database_name,
+                    }
+                },
+                ShowAssignedLFTags=True,
+            )
+
+            if response:
+                logger.info(f"LF tags for database {database_name}: {response}")
+                # Extract and return the LF tags
+                lf_tags = response.get("LFTagOnDatabase", [])
+
+                tags = []
+                for lf_tag in lf_tags:
+                    catalog_id = lf_tag.get("CatalogId")
+                    tag_key = lf_tag.get("TagKey")
+                    for tag_value in lf_tag.get("TagValues", []):
+                        t = LakeFormationTag(
+                            key=tag_key,
+                            value=tag_value,
+                            catalog=catalog_id,
+                        )
+                        tags.append(t)
+                return tags
+
+        except Exception as e:
+            print(
+                f"Error getting LF tags for database {catalog_id}.{database_name}: {str(e)}"
+            )
+        return []
+
+    def get_table_lf_tags(
+        self,
+        catalog_id: str,
+        database_name: str,
+        table_name: str,
+    ) -> List[LakeFormationTag]:
+        """Get all LF tags for a specific table."""
+        try:
+            # Get LF tags for the specified table
+            response = self.lf_client.get_resource_lf_tags(
+                CatalogId=catalog_id,
+                Resource={
+                    "Table": {
+                        "CatalogId": catalog_id,
+                        "DatabaseName": database_name,
+                        "Name": table_name,
+                    },
+                },
+                ShowAssignedLFTags=True,
+            )
+
+            # Extract and return the LF tags
+            lf_tags = response.get("LFTagsOnTable", [])
+
+            tags = []
+            for lf_tag in lf_tags:
+                catalog_id = lf_tag.get("CatalogId")
+                tag_key = lf_tag.get("TagKey")
+                for tag_value in lf_tag.get("TagValues", []):
+                    t = LakeFormationTag(
+                        key=tag_key,
+                        value=tag_value,
+                        catalog=catalog_id,
+                    )
+                    tags.append(t)
+            return tags
+
+        except Exception:
+            return []
+
+    def get_all_lf_tags(self) -> List:
+        # 1. Get all LF-Tags in your account (metadata only)
+        response = self.lf_client.list_lf_tags(
+            MaxResults=50  # Adjust as needed
+        )
+        all_lf_tags = response["LFTags"]
+        # Continue pagination if necessary
+        while "NextToken" in response:
+            response = self.lf_client.list_lf_tags(
+                NextToken=response["NextToken"], MaxResults=50
+            )
+            all_lf_tags.extend(response["LFTags"])
+        return all_lf_tags
+
     def get_glue_arn(
         self, account_id: str, database: str, table: Optional[str] = None
     ) -> str:
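For intuition, each entry returned by the `GetResourceLFTags` API carries one tag key with a list of values, and the loops above fan that out into one `LakeFormationTag` per (key, value) pair. A standalone illustration with fabricated data:

    # Illustration only; the response shape follows the GetResourceLFTags
    # API used above, and the values are made up.
    response = {
        "LFTagOnDatabase": [
            {"CatalogId": "123456789012", "TagKey": "env", "TagValues": ["dev", "prod"]}
        ]
    }

    pairs = [
        (entry["TagKey"], value, entry["CatalogId"])
        for entry in response.get("LFTagOnDatabase", [])
        for value in entry.get("TagValues", [])
    ]
    print(pairs)  # [('env', 'dev', '123456789012'), ('env', 'prod', '123456789012')]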
@@ -385,6 +525,14 @@ class GlueSource(StatefulIngestionSourceBase):
         bucket = url.netloc
         key = url.path[1:]
 
+        # validate that we have a non-empty key
+        if not key:
+            self.report.num_job_script_location_invalid += 1
+            logger.warning(
+                f"Error parsing DAG for Glue job. The script {script_path} is not a valid S3 path for flow urn: {flow_urn}."
+            )
+            return None
+
         # download the script contents
         # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object
         try:
@@ -396,6 +544,14 @@ class GlueSource(StatefulIngestionSourceBase):
             )
             self.report.num_job_script_failed_download += 1
             return None
+        except botocore.exceptions.ParamValidationError as e:
+            self.report_warning(
+                flow_urn,
+                f"Invalid S3 path for Glue job script {script_path}: {e}",
+            )
+            self.report.num_job_script_location_invalid += 1
+            return None
+
         script = obj["Body"].read().decode("utf-8")
 
         try:
@@ -869,7 +1025,7 @@ class GlueSource(StatefulIngestionSourceBase):
         table_stats: dict,
         column_stats: dict,
         partition_spec: Optional[str] = None,
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         assert self.source_config.profiling
 
         # instantiate profile class
@@ -936,6 +1092,14 @@ class GlueSource(StatefulIngestionSourceBase):
 
             dataset_profile.fieldProfiles.append(column_profile)
 
+        # if no stats are available, skip ingestion
+        if (
+            not dataset_profile.fieldProfiles
+            and dataset_profile.rowCount is None
+            and dataset_profile.columnCount is None
+        ):
+            return None
+
         if partition_spec:
             # inject partition level stats
             dataset_profile.partitionSpec = PartitionSpecClass(
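A quick standalone check of the guard above: a profile with no row count, no column count, and no field profiles now short-circuits to None instead of being emitted. Sketch (the class is the real generated `DatasetProfileClass`; the instance is illustrative):

    # Illustrative only: an "empty" profile is skipped rather than ingested.
    import time

    from datahub.metadata.schema_classes import DatasetProfileClass

    profile = DatasetProfileClass(timestampMillis=int(time.time() * 1000))
    skip = (
        not profile.fieldProfiles
        and profile.rowCount is None
        and profile.columnCount is None
    )
    print(skip)  # True -> no MetadataChangeProposalWrapper is produced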
@@ -990,18 +1154,20 @@ class GlueSource(StatefulIngestionSourceBase):
                 if self.source_config.profiling.partition_patterns.allowed(
                     partition_spec
                 ):
-                    yield self._create_profile_mcp(
+                    profile_mcp = self._create_profile_mcp(
                         mce, table_stats, column_stats, partition_spec
-                    ).as_workunit()
+                    )
+                    if profile_mcp:
+                        yield profile_mcp.as_workunit()
                 else:
                     continue
         else:
             # ingest data profile without partition
             table_stats = response["Table"]["Parameters"]
             column_stats = response["Table"]["StorageDescriptor"]["Columns"]
-            yield self._create_profile_mcp(
-                mce, table_stats, column_stats
-            ).as_workunit()
+            profile_mcp = self._create_profile_mcp(mce, table_stats, column_stats)
+            if profile_mcp:
+                yield profile_mcp.as_workunit()
 
     def gen_database_key(self, database: str) -> DatabaseKey:
         return DatabaseKey(
@@ -1012,9 +1178,67 @@ class GlueSource(StatefulIngestionSourceBase):
             backcompat_env_as_instance=True,
         )
 
+    def gen_platform_resource(
+        self, tag: LakeFormationTag
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.ctx.graph and self.platform_resource_repository:
+            platform_resource_id = (
+                LakeFormationTagPlatformResourceId.get_or_create_from_tag(
+                    tag=tag,
+                    platform_resource_repository=self.platform_resource_repository,
+                    catalog_id=tag.catalog,
+                )
+            )
+            logger.info(f"Created platform resource {platform_resource_id}")
+
+            lf_tag = self.platform_resource_repository.get_entity_from_datahub(
+                platform_resource_id, False
+            )
+            if (
+                tag.to_datahub_tag_urn().urn()
+                not in lf_tag.datahub_linked_resources().urns
+            ):
+                try:
+                    lf_tag.datahub_linked_resources().add(
+                        tag.to_datahub_tag_urn().urn()
+                    )
+                    platform_resource = lf_tag.as_platform_resource()
+                    for mcp in platform_resource.to_mcps():
+                        yield MetadataWorkUnit(
+                            id=f"platform_resource-{platform_resource.id}",
+                            mcp=mcp,
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to create platform resource for tag {tag}: {e}",
+                        exc_info=True,
+                    )
+                    self.report.report_warning(
+                        context="Failed to create platform resource",
+                        message=f"Failed to create platform resource for Tag: {tag}",
+                    )
+
     def gen_database_containers(
         self, database: Mapping[str, Any]
     ) -> Iterable[MetadataWorkUnit]:
+        container_tags: Optional[List] = None
+        if self.source_config.extract_lakeformation_tags:
+            try:
+                tags = self.get_database_lf_tags(
+                    catalog_id=database["CatalogId"], database_name=database["Name"]
+                )
+                container_tags = []
+                for tag in tags:
+                    try:
+                        container_tags.append(tag.to_datahub_tag_urn().name)
+                        yield from self.gen_platform_resource(tag)
+                    except InvalidUrnError:
+                        continue
+            except Exception:
+                self.report_warning(
+                    reason="Failed to extract Lake Formation tags for database",
+                    key=database["Name"],
+                )
         domain_urn = self._gen_domain_urn(database["Name"])
         database_container_key = self.gen_database_key(database["Name"])
         parameters = database.get("Parameters", {})
@@ -1032,6 +1256,7 @@ class GlueSource(StatefulIngestionSourceBase):
             qualified_name=self.get_glue_arn(
                 account_id=database["CatalogId"], database=database["Name"]
             ),
+            tags=container_tags,
             extra_properties=parameters,
         )
@@ -1106,9 +1331,8 @@ class GlueSource(StatefulIngestionSourceBase):
             platform_instance=self.source_config.platform_instance,
         )
 
-        mce = self._extract_record(dataset_urn, table, full_table_name)
-        yield MetadataWorkUnit(full_table_name, mce=mce)
-
+        yield from self._extract_record(dataset_urn, table, full_table_name)
+        # generate a Dataset snapshot
         # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
         # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
         yield MetadataChangeProposalWrapper(
@@ -1124,19 +1348,6 @@ class GlueSource(StatefulIngestionSourceBase):
             dataset_urn=dataset_urn, db_name=database_name
         )
 
-        wu = self.get_lineage_if_enabled(mce)
-        if wu:
-            yield wu
-
-        try:
-            yield from self.get_profile_if_enabled(mce, database_name, table_name)
-        except KeyError as e:
-            self.report.report_failure(
-                message="Failed to extract profile for table",
-                context=f"Table: {dataset_urn}",
-                exc=e,
-            )
-
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
         flow_names: Dict[str, str] = {}
@@ -1191,159 +1402,201 @@ class GlueSource(StatefulIngestionSourceBase):
1191
1402
  for dataset_id, dataset_mce in zip(new_dataset_ids, new_dataset_mces):
1192
1403
  yield MetadataWorkUnit(id=dataset_id, mce=dataset_mce)
1193
1404
 
1194
- # flake8: noqa: C901
1195
1405
  def _extract_record(
1196
1406
  self, dataset_urn: str, table: Dict, table_name: str
1197
- ) -> MetadataChangeEvent:
1407
+ ) -> Iterable[MetadataWorkUnit]:
1408
+ """Extract and yield metadata work units for a Glue table."""
1198
1409
  logger.debug(
1199
1410
  f"extract record from table={table_name} for dataset={dataset_urn}"
1200
1411
  )
1201
1412
 
1202
- def get_dataset_properties() -> DatasetPropertiesClass:
1203
- return DatasetPropertiesClass(
1204
- description=table.get("Description"),
1205
- customProperties={
1206
- **table.get("Parameters", {}),
1207
- **{
1208
- k: str(v)
1209
- for k, v in table.get("StorageDescriptor", {}).items()
1210
- if k not in ["Columns", "Parameters"]
1211
- },
1212
- },
1213
- uri=table.get("Location"),
1214
- tags=[],
1215
- name=table["Name"],
1216
- qualifiedName=self.get_glue_arn(
1217
- account_id=table["CatalogId"],
1218
- database=table["DatabaseName"],
1219
- table=table["Name"],
1220
- ),
1221
- )
1413
+ # Create the main dataset snapshot
1414
+ dataset_snapshot = DatasetSnapshot(
1415
+ urn=dataset_urn,
1416
+ aspects=[
1417
+ Status(removed=False),
1418
+ self._get_dataset_properties(table),
1419
+ ],
1420
+ )
1421
+
1422
+ # Add schema metadata if available
1423
+ schema_metadata = self._get_schema_metadata(table, table_name, dataset_urn)
1424
+ if schema_metadata:
1425
+ dataset_snapshot.aspects.append(schema_metadata)
1222
1426
 
1223
- def get_s3_tags() -> Optional[GlobalTagsClass]:
1224
- # when TableType=VIRTUAL_VIEW the Location can be empty and we should
1225
- # return no tags rather than fail the entire ingestion
1226
- if table.get("StorageDescriptor", {}).get("Location") is None:
1227
- return None
1228
- bucket_name = s3_util.get_bucket_name(
1229
- table["StorageDescriptor"]["Location"]
1427
+ # Add platform instance
1428
+ dataset_snapshot.aspects.append(self._get_data_platform_instance())
1429
+
1430
+ # Add ownership if enabled
1431
+ if self.extract_owners:
1432
+ ownership = GlueSource._get_ownership(table.get("Owner"))
1433
+ if ownership:
1434
+ dataset_snapshot.aspects.append(ownership)
1435
+
1436
+ # Add S3 tags if enabled
1437
+ s3_tags = self._get_s3_tags(table, dataset_urn)
1438
+ if s3_tags:
1439
+ dataset_snapshot.aspects.append(s3_tags)
1440
+
1441
+ # Add Lake Formation tags if enabled
1442
+ if self.source_config.extract_lakeformation_tags:
1443
+ tags = self.get_table_lf_tags(
1444
+ catalog_id=table["CatalogId"],
1445
+ database_name=table["DatabaseName"],
1446
+ table_name=table["Name"],
1230
1447
  )
1231
- tags_to_add = []
1232
- if self.source_config.use_s3_bucket_tags:
1233
- try:
1234
- bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
1235
- tags_to_add.extend(
1236
- [
1237
- make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
1238
- for tag in bucket_tags["TagSet"]
1239
- ]
1240
- )
1241
- except self.s3_client.exceptions.ClientError:
1242
- logger.warning(f"No tags found for bucket={bucket_name}")
1243
- if self.source_config.use_s3_object_tags:
1244
- key_prefix = s3_util.get_key_prefix(
1245
- table["StorageDescriptor"]["Location"]
1246
- )
1247
- object_tagging = self.s3_client.get_object_tagging(
1248
- Bucket=bucket_name, Key=key_prefix
1249
- )
1250
- tag_set = object_tagging["TagSet"]
1251
- if tag_set:
1252
- tags_to_add.extend(
1253
- [
1254
- make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
1255
- for tag in tag_set
1256
- ]
1257
- )
1258
- else:
1259
- # Unlike bucket tags, if an object does not have tags, it will just return an empty array
1260
- # as opposed to an exception.
1261
- logger.warning(
1262
- f"No tags found for bucket={bucket_name} key={key_prefix}"
1263
- )
1264
- if len(tags_to_add) == 0:
1265
- return None
1266
- if self.ctx.graph is not None:
1267
- logger.debug(
1268
- "Connected to DatahubApi, grabbing current tags to maintain."
1269
- )
1270
- current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect(
1271
- entity_urn=dataset_urn,
1272
- aspect_type=GlobalTagsClass,
1273
- )
1274
- if current_tags:
1275
- tags_to_add.extend(
1276
- [current_tag.tag for current_tag in current_tags.tags]
1277
- )
1278
- else:
1279
- logger.warning(
1280
- "Could not connect to DatahubApi. No current tags to maintain"
1281
- )
1282
- # Remove duplicate tags
1283
- tags_to_add = sorted(list(set(tags_to_add)))
1284
- new_tags = GlobalTagsClass(
1285
- tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
1448
+
1449
+ global_tags = self._get_lake_formation_tags(tags)
1450
+ if global_tags:
1451
+ dataset_snapshot.aspects.append(global_tags)
1452
+ # Generate platform resources for LF tags
1453
+ for tag in tags:
1454
+ yield from self.gen_platform_resource(tag)
1455
+
1456
+ # Create and yield the main metadata work unit
1457
+ metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
1458
+ yield MetadataWorkUnit(table_name, mce=metadata_record)
1459
+
1460
+ # Add lineage if enabled
1461
+ lineage_wu = self.get_lineage_if_enabled(metadata_record)
1462
+ if lineage_wu:
1463
+ yield lineage_wu
1464
+
1465
+ # Add profile if enabled
1466
+ try:
1467
+ yield from self.get_profile_if_enabled(
1468
+ metadata_record, table["DatabaseName"], table["Name"]
1286
1469
  )
1287
- return new_tags
1288
-
1289
- def _is_delta_schema(
1290
- provider: str, num_parts: int, columns: Optional[List[Mapping[str, Any]]]
1291
- ) -> bool:
1292
- return (
1293
- (self.source_config.extract_delta_schema_from_parameters is True)
1294
- and (provider == "delta")
1295
- and (num_parts > 0)
1296
- and (columns is not None)
1297
- and (len(columns) == 1)
1298
- and (columns[0].get("Name", "") == "col")
1299
- and (columns[0].get("Type", "") == "array<string>")
1470
+ except KeyError as e:
1471
+ self.report.report_failure(
1472
+ message="Failed to extract profile for table",
1473
+ context=f"Table: {dataset_urn}",
1474
+ exc=e,
1300
1475
  )
1301
1476
 
1302
- def get_schema_metadata() -> Optional[SchemaMetadata]:
1303
- # As soon as the hive integration with Spark is correctly providing the schema as expected in the
1304
- # StorageProperties, the alternative path to fetch schema from table parameters for delta schemas can be removed.
1305
- # https://github.com/delta-io/delta/pull/2310
1306
- provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "")
1307
- num_parts = int(
1308
- table.get("Parameters", {}).get(
1309
- "spark.sql.sources.schema.numParts", "0"
1310
- )
1311
- )
1312
- columns = table.get("StorageDescriptor", {}).get("Columns", [{}])
1477
+ def _get_dataset_properties(self, table: Dict) -> DatasetPropertiesClass:
1478
+ """Extract dataset properties from Glue table."""
1479
+ storage_descriptor = table.get("StorageDescriptor", {})
1480
+ custom_properties = {
1481
+ **table.get("Parameters", {}),
1482
+ **{
1483
+ k: str(v)
1484
+ for k, v in storage_descriptor.items()
1485
+ if k not in ["Columns", "Parameters"]
1486
+ },
1487
+ }
1313
1488
 
1314
- if _is_delta_schema(provider, num_parts, columns):
1315
- return _get_delta_schema_metadata()
1489
+ return DatasetPropertiesClass(
1490
+ description=table.get("Description"),
1491
+ customProperties=custom_properties,
1492
+ uri=table.get("Location"),
1493
+ tags=[],
1494
+ name=table["Name"],
1495
+ qualifiedName=self.get_glue_arn(
1496
+ account_id=table["CatalogId"],
1497
+ database=table["DatabaseName"],
1498
+ table=table["Name"],
1499
+ ),
1500
+ )
1316
1501
 
1317
- elif table.get("StorageDescriptor"):
1318
- return _get_glue_schema_metadata()
1502
+ def _get_schema_metadata(
1503
+ self, table: Dict, table_name: str, dataset_urn: str
1504
+ ) -> Optional[SchemaMetadata]:
1505
+ """Extract schema metadata from Glue table."""
1506
+ if not table.get("StorageDescriptor"):
1507
+ return None
1319
1508
 
1320
- else:
1321
- return None
1509
+ # Check if this is a delta table with schema in parameters
1510
+ if self._is_delta_schema(table):
1511
+ return self._get_delta_schema_metadata(table, table_name, dataset_urn)
1512
+ else:
1513
+ return self._get_glue_schema_metadata(table, table_name)
1322
1514
 
1323
- def _get_glue_schema_metadata() -> Optional[SchemaMetadata]:
1324
- schema = table["StorageDescriptor"]["Columns"]
1325
- fields: List[SchemaField] = []
1326
- for field in schema:
1327
- schema_fields = get_schema_fields_for_hive_column(
1328
- hive_column_name=field["Name"],
1329
- hive_column_type=field["Type"],
1330
- description=field.get("Comment"),
1331
- default_nullable=True,
1332
- )
1333
- assert schema_fields
1515
+ def _is_delta_schema(self, table: Dict) -> bool:
1516
+ """Check if table uses delta format with schema in parameters."""
1517
+ if not self.source_config.extract_delta_schema_from_parameters:
1518
+ return False
1519
+
1520
+ provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "")
1521
+ num_parts = int(
1522
+ table.get("Parameters", {}).get("spark.sql.sources.schema.numParts", "0")
1523
+ )
1524
+ columns = table.get("StorageDescriptor", {}).get("Columns", [])
1525
+
1526
+ return (
1527
+ provider == "delta"
1528
+ and num_parts > 0
1529
+ and columns
1530
+ and len(columns) == 1
1531
+ and columns[0].get("Name", "") == "col"
1532
+ and columns[0].get("Type", "") == "array<string>"
1533
+ )
1534
+
+    def _get_glue_schema_metadata(
+        self, table: Dict, table_name: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Glue table columns."""
+        schema = table["StorageDescriptor"]["Columns"]
+        fields: List[SchemaField] = []
+
+        # Process regular columns
+        for field in schema:
+            schema_fields = get_schema_fields_for_hive_column(
+                hive_column_name=field["Name"],
+                hive_column_type=field["Type"],
+                description=field.get("Comment"),
+                default_nullable=True,
+            )
+            if schema_fields:
+                fields.extend(schema_fields)
+
+        # Process partition keys
+        partition_keys = table.get("PartitionKeys", [])
+        for partition_key in partition_keys:
+            schema_fields = get_schema_fields_for_hive_column(
+                hive_column_name=partition_key["Name"],
+                hive_column_type=partition_key.get("Type", "unknown"),
+                description=partition_key.get("Comment"),
+                default_nullable=False,
+            )
+            if schema_fields:
                 fields.extend(schema_fields)
 
-            partition_keys = table.get("PartitionKeys", [])
-            for partition_key in partition_keys:
+        return SchemaMetadata(
+            schemaName=table_name,
+            version=0,
+            fields=fields,
+            platform=f"urn:li:dataPlatform:{self.platform}",
+            hash="",
+            platformSchema=MySqlDDL(tableSchema=""),
+        )
+
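`get_schema_fields_for_hive_column` can return several `SchemaField`s for one Hive column when the type is nested, which is why both loops `extend` the field list instead of appending a single entry. A rough sketch (the import path assumes the helper's usual location in this package; the column itself is hypothetical):

```python
# Sketch only: import path assumed from this package's utilities.
from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column

# A struct-typed column fans out into multiple SchemaFields (the struct
# itself plus one per leaf), so callers must extend, not append.
fields = get_schema_fields_for_hive_column(
    hive_column_name="address",  # hypothetical column
    hive_column_type="struct<city:string,zip:int>",
    description="mailing address",
    default_nullable=True,
)
for field in fields:
    print(field.fieldPath)
```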
+    def _get_delta_schema_metadata(
+        self, table: Dict, table_name: str, dataset_urn: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Delta table parameters."""
+        try:
+            # Reconstruct schema from parameters
+            num_parts = int(table["Parameters"]["spark.sql.sources.schema.numParts"])
+            schema_str = "".join(
+                table["Parameters"][f"spark.sql.sources.schema.part.{i}"]
+                for i in range(num_parts)
+            )
+            schema_json = json.loads(schema_str)
+
+            fields: List[SchemaField] = []
+            for field in schema_json["fields"]:
+                field_type = delta_type_to_hive_type(field.get("type", "unknown"))
                 schema_fields = get_schema_fields_for_hive_column(
-                    hive_column_name=partition_key["Name"],
-                    hive_column_type=partition_key.get("Type", "unknown"),
-                    description=partition_key.get("Comment"),
-                    default_nullable=False,
+                    hive_column_name=field["name"],
+                    hive_column_type=field_type,
+                    description=field.get("description"),
+                    default_nullable=bool(field.get("nullable", True)),
                 )
-                assert schema_fields
-                fields.extend(schema_fields)
+                if schema_fields:
+                    fields.extend(schema_fields)
 
+            self.report.num_dataset_valid_delta_schema += 1
             return SchemaMetadata(
                 schemaName=table_name,
                 version=0,
@@ -1353,108 +1606,128 @@ class GlueSource(StatefulIngestionSourceBase):
                 platformSchema=MySqlDDL(tableSchema=""),
             )
 
-        def _get_delta_schema_metadata() -> Optional[SchemaMetadata]:
-            assert (
-                table["Parameters"]["spark.sql.sources.provider"] == "delta"
-                and int(table["Parameters"]["spark.sql.sources.schema.numParts"]) > 0
+        except Exception as e:
+            self.report_warning(
+                dataset_urn,
+                f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}",
             )
+            self.report.num_dataset_invalid_delta_schema += 1
+            return None
 
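Spark splits a long serialized schema across numbered `spark.sql.sources.schema.part.N` properties, typically because metastore property values are size-limited; the method above concatenates the parts in order and parses the result as JSON. A self-contained sketch with hypothetical parameter values:

```python
import json

# Hypothetical split: one JSON document stored across two property values.
params = {
    "spark.sql.sources.schema.numParts": "2",
    "spark.sql.sources.schema.part.0": '{"type":"struct","fields":[{"name":"id",',
    "spark.sql.sources.schema.part.1": '"type":"long","nullable":true,"metadata":{}}]}',
}

num_parts = int(params["spark.sql.sources.schema.numParts"])
schema_json = json.loads(
    "".join(params[f"spark.sql.sources.schema.part.{i}"] for i in range(num_parts))
)
print([f["name"] for f in schema_json["fields"]])  # ['id']
```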
-            try:
-                numParts = int(table["Parameters"]["spark.sql.sources.schema.numParts"])
-                schema_str = "".join(
-                    [
-                        table["Parameters"][f"spark.sql.sources.schema.part.{i}"]
-                        for i in range(numParts)
-                    ]
+    def _get_data_platform_instance(self) -> DataPlatformInstanceClass:
+        """Get data platform instance aspect."""
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=(
+                make_dataplatform_instance_urn(
+                    self.platform, self.source_config.platform_instance
                 )
-                schema_json = json.loads(schema_str)
-                fields: List[SchemaField] = []
-                for field in schema_json["fields"]:
-                    field_type = delta_type_to_hive_type(field.get("type", "unknown"))
-                    schema_fields = get_schema_fields_for_hive_column(
-                        hive_column_name=field["name"],
-                        hive_column_type=field_type,
-                        description=field.get("description"),
-                        default_nullable=bool(field.get("nullable", True)),
-                    )
-                    assert schema_fields
-                    fields.extend(schema_fields)
+                if self.source_config.platform_instance
+                else None
+            ),
+        )
 
-                self.report.num_dataset_valid_delta_schema += 1
-                return SchemaMetadata(
-                    schemaName=table_name,
-                    version=0,
-                    fields=fields,
-                    platform=f"urn:li:dataPlatform:{self.platform}",
-                    hash="",
-                    platformSchema=MySqlDDL(tableSchema=""),
-                )
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def _get_ownership(owner: str) -> Optional[OwnershipClass]:
+        """Get ownership aspect for a given owner."""
+        if not owner:
+            return None
 
-            except Exception as e:
-                self.report_warning(
-                    dataset_urn,
-                    f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}",
-                )
-                self.report.num_dataset_invalid_delta_schema += 1
-                return None
-
-        def get_data_platform_instance() -> DataPlatformInstanceClass:
-            return DataPlatformInstanceClass(
-                platform=make_data_platform_urn(self.platform),
-                instance=(
-                    make_dataplatform_instance_urn(
-                        self.platform, self.source_config.platform_instance
-                    )
-                    if self.source_config.platform_instance
-                    else None
-                ),
+        owners = [
+            OwnerClass(
+                owner=mce_builder.make_user_urn(owner),
+                type=OwnershipTypeClass.DATAOWNER,
             )
+        ]
+        return OwnershipClass(owners=owners)
 
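Because `@lru_cache` sits beneath `@staticmethod`, the cache wraps the plain function and is keyed only on the `owner` string, never on `self`, so one `OwnershipClass` is reused across every table with the same owner and no source instance is pinned alive by the cache. A minimal sketch of the same pattern (class and values hypothetical):

```python
from functools import lru_cache

class Example:  # hypothetical stand-in for the source class
    @staticmethod
    @lru_cache(maxsize=None)
    def normalize(owner: str) -> str:
        print(f"computing for {owner}")  # runs once per distinct owner
        return owner.strip().lower()

Example.normalize("Alice")  # prints, then caches
Example.normalize("Alice")  # served from the cache, no print
```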
-        @lru_cache(maxsize=None)
-        def _get_ownership(owner: str) -> Optional[OwnershipClass]:
-            if owner:
-                owners = [
-                    OwnerClass(
-                        owner=mce_builder.make_user_urn(owner),
-                        type=OwnershipTypeClass.DATAOWNER,
-                    )
-                ]
-                return OwnershipClass(
-                    owners=owners,
-                )
+    def _get_s3_tags(self, table: Dict, dataset_urn: str) -> Optional[GlobalTagsClass]:
+        """Extract S3 tags if enabled."""
+        if not (
+            self.source_config.use_s3_bucket_tags
+            or self.source_config.use_s3_object_tags
+        ):
             return None
 
-        dataset_snapshot = DatasetSnapshot(
-            urn=dataset_urn,
-            aspects=[
-                Status(removed=False),
-                get_dataset_properties(),
-            ],
-        )
+        # Check if table has a location (VIRTUAL_VIEW tables may not)
+        location = table.get("StorageDescriptor", {}).get("Location")
+        if not location:
+            return None
 
-        schema_metadata = get_schema_metadata()
-        if schema_metadata:
-            dataset_snapshot.aspects.append(schema_metadata)
+        bucket_name = s3_util.get_bucket_name(location)
+        tags_to_add: List[str] = []
 
-        dataset_snapshot.aspects.append(get_data_platform_instance())
+        # Get bucket tags
+        if self.source_config.use_s3_bucket_tags:
+            try:
+                bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
+                tags_to_add.extend(
+                    make_tag_urn(f"{tag['Key']}:{tag['Value']}")
+                    for tag in bucket_tags["TagSet"]
+                )
+            except self.s3_client.exceptions.ClientError:
+                logger.warning(f"No tags found for bucket={bucket_name}")
 
-        # Ownership
-        if self.extract_owners:
-            owner = table.get("Owner")
-            optional_owner_aspect = _get_ownership(owner)
-            if optional_owner_aspect is not None:
-                dataset_snapshot.aspects.append(optional_owner_aspect)
+        # Get object tags
+        if self.source_config.use_s3_object_tags:
+            key_prefix = s3_util.get_key_prefix(location)
+            try:
+                object_tagging = self.s3_client.get_object_tagging(
+                    Bucket=bucket_name, Key=key_prefix
+                )
+                if object_tagging["TagSet"]:
+                    tags_to_add.extend(
+                        make_tag_urn(f"{tag['Key']}:{tag['Value']}")
+                        for tag in object_tagging["TagSet"]
+                    )
+                else:
+                    logger.warning(
+                        f"No tags found for bucket={bucket_name} key={key_prefix}"
+                    )
+            except Exception as e:
+                logger.warning(f"Failed to get object tags: {e}")
 
-        if (
-            self.source_config.use_s3_bucket_tags
-            or self.source_config.use_s3_object_tags
-        ):
-            s3_tags = get_s3_tags()
-            if s3_tags is not None:
-                dataset_snapshot.aspects.append(s3_tags)
+        if not tags_to_add:
+            return None
 
-        metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        return metadata_record
+        # Merge with existing tags if connected to DataHub API
+        if self.ctx.graph:
+            logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
+            current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect(
+                entity_urn=dataset_urn, aspect_type=GlobalTagsClass
+            )
+            if current_tags:
+                tags_to_add.extend(current_tag.tag for current_tag in current_tags.tags)
+        else:
+            logger.warning(
+                "Could not connect to DatahubApi. No current tags to maintain"
+            )
+
+        # Remove duplicates and create tags
+        unique_tags = sorted(set(tags_to_add))
+        return GlobalTagsClass(tags=[TagAssociationClass(tag) for tag in unique_tags])
+
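The two lookups behave differently when nothing is tagged: `get_bucket_tagging` raises a `ClientError` (error code `NoSuchTagSet`) for an untagged bucket, which the first branch catches, while `get_object_tagging` simply returns an empty `TagSet`. A minimal boto3 sketch (bucket and key names hypothetical):

```python
import boto3

s3 = boto3.client("s3")
bucket, key = "my-data-bucket", "tables/orders/"  # hypothetical names

# Bucket tags: an untagged bucket raises ClientError (NoSuchTagSet).
try:
    tag_set = s3.get_bucket_tagging(Bucket=bucket)["TagSet"]
except s3.exceptions.ClientError:
    tag_set = []

# Object tags: an untagged object just returns an empty TagSet.
tag_set += s3.get_object_tagging(Bucket=bucket, Key=key)["TagSet"]

# Same key:value convention the source uses for tag names.
tag_names = sorted({f"{t['Key']}:{t['Value']}" for t in tag_set})
```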
+    def _get_lake_formation_tags(
+        self, tags: List[LakeFormationTag]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract Lake Formation tags if enabled."""
+        tag_urns: List[str] = []
+        for tag in tags:
+            try:
+                tag_urns.append(tag.to_datahub_tag_urn().urn())
+            except InvalidUrnError as e:
+                logger.warning(
+                    f"Invalid Lake Formation tag URN for {tag}: {e}", exc_info=True
+                )
+                continue  # Skip invalid tags
+
+        tag_urns.sort()  # Sort to maintain consistent order
+        return (
+            GlobalTagsClass(tags=[TagAssociationClass(tag_urn) for tag_urn in tag_urns])
+            if tag_urns
+            else None
+        )
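Whichever path produces them, tag URNs end up in the same `GlobalTagsClass` aspect. A small sketch with the standard builder helpers (tag values hypothetical; `make_tag_urn` is assumed to come from `datahub.emitter.mce_builder`):

```python
from datahub.emitter.mce_builder import make_tag_urn
from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

tag_urns = sorted({make_tag_urn("env:prod"), make_tag_urn("team:analytics")})
global_tags = GlobalTagsClass(
    tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
)
print(global_tags.tags[0].tag)  # urn:li:tag:env:prod
```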
 
     def get_report(self):
         return self.report