acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,795 @@
1
+ """
2
+ Athena Properties Extractor - A robust tool for parsing CREATE TABLE statements.
3
+
4
+ This module provides functionality to extract properties, partitioning information,
5
+ and row format details from Athena CREATE TABLE SQL statements.
6
+ """
7
+
8
+ import json
9
+ import re
10
+ from dataclasses import dataclass
11
+ from typing import Dict, List, Optional, Set, Tuple, Union
12
+
13
+ from sqlglot import ParseError, parse_one
14
+ from sqlglot.dialects.athena import Athena
15
+ from sqlglot.expressions import (
16
+ Anonymous,
17
+ ColumnDef,
18
+ Create,
19
+ Day,
20
+ Expression,
21
+ FileFormatProperty,
22
+ Identifier,
23
+ LocationProperty,
24
+ Month,
25
+ PartitionByTruncate,
26
+ PartitionedByBucket,
27
+ PartitionedByProperty,
28
+ Property,
29
+ RowFormatDelimitedProperty,
30
+ Schema,
31
+ SchemaCommentProperty,
32
+ SerdeProperties,
33
+ Year,
34
+ )
35
+
36
+
37
class AthenaPropertiesExtractionError(Exception):
    """Raised when an Athena CREATE TABLE statement cannot be parsed or its properties extracted."""
41
+
42
+
43
@dataclass
class ColumnInfo:
    """Information about a table column."""

    # Column name as written in the CREATE TABLE statement.
    name: str
    # Declared data type, kept as its source text (e.g. "int", "decimal(10,2)").
    type: str
49
+
50
+
51
@dataclass
class TransformInfo:
    """Information about a partition transform."""

    # Transform kind — the parser recognizes bucket/truncate and the
    # day/month/year date-part transforms (see the sqlglot imports above).
    type: str
    # Column the transform is applied to.
    column: ColumnInfo
    # Number of buckets; only meaningful for bucket transforms.
    bucket_count: Optional[int] = None
    # Truncation length; only meaningful for truncate transforms.
    length: Optional[int] = None
59
+
60
+
61
@dataclass
class PartitionInfo:
    """Information about table partitioning."""

    # Partition columns used directly, without any transform applied.
    simple_columns: List[ColumnInfo]
    # Partition columns wrapped in a transform (bucket, truncate, date part).
    transforms: List[TransformInfo]
67
+
68
+
69
@dataclass
class TableProperties:
    """General table properties."""

    # Storage location from the LOCATION clause, if declared.
    location: Optional[str] = None
    # File format from the STORED AS / format property, if declared.
    format: Optional[str] = None
    # Table-level comment, if declared.
    comment: Optional[str] = None
    # Key/value pairs from SERDEPROPERTIES, if declared.
    serde_properties: Optional[Dict[str, str]] = None
    # Key/value pairs from a ROW FORMAT DELIMITED clause, if declared.
    row_format: Optional[Dict[str, str]] = None
    # Any remaining table properties not covered by the fields above.
    additional_properties: Optional[Dict[str, str]] = None
79
+
80
+
81
@dataclass
class RowFormatInfo:
    """Row format information."""

    # Row format settings as a flat key/value mapping.
    properties: Dict[str, str]
    # The same properties serialized as a JSON string.
    json_formatted: str
87
+
88
+
89
@dataclass
class AthenaTableInfo:
    """Complete information about an Athena table."""

    # Partitioning details: simple partition columns and transforms.
    partition_info: PartitionInfo
    # General properties (location, format, comment, serde, ...).
    table_properties: TableProperties
    # Row format details plus their JSON rendering.
    row_format: RowFormatInfo
96
+
97
+
98
+ class AthenaPropertiesExtractor:
99
+ """A class to extract properties from Athena CREATE TABLE statements."""
100
+
101
    # Matches "CREATE TABLE <name> (": group 2 captures the table name so it
    # can be rewritten (the SQL-fixing step quotes unquoted table names).
    CREATE_TABLE_REGEXP = re.compile(
        r"(CREATE TABLE[\s\n]*)(.*?)(\s*\()", re.MULTILINE | re.IGNORECASE
    )
    # Matches a "PARTITIONED BY ( ... )" clause; group 2 is the column /
    # transform list, tolerating one level of nested parentheses (e.g.
    # transform calls) inside it.
    PARTITIONED_BY_REGEXP = re.compile(
        r"(PARTITIONED BY[\s\n]*\()((?:[^()]|\([^)]*\))*?)(\))",
        re.MULTILINE | re.IGNORECASE,
    )
108
+
109
+ def __init__(self) -> None:
110
+ """Initialize the extractor."""
111
+ pass
112
+
113
+ @staticmethod
114
+ def get_table_properties(sql: str) -> AthenaTableInfo:
115
+ """Get all table properties from a SQL statement.
116
+
117
+ Args:
118
+ sql: The SQL statement to parse
119
+
120
+ Returns:
121
+ An AthenaTableInfo object containing all table properties
122
+
123
+ Raises:
124
+ AthenaPropertiesExtractionError: If extraction fails
125
+ """
126
+ extractor = AthenaPropertiesExtractor()
127
+ return extractor._extract_all_properties(sql)
128
+
129
+ def _extract_all_properties(self, sql: str) -> AthenaTableInfo:
130
+ """Extract all properties from a SQL statement.
131
+
132
+ Args:
133
+ sql: The SQL statement to parse
134
+
135
+ Returns:
136
+ An AthenaTableInfo object containing all properties
137
+
138
+ Raises:
139
+ AthenaPropertiesExtractionError: If extraction fails
140
+ """
141
+ if not sql or not sql.strip():
142
+ raise AthenaPropertiesExtractionError("SQL statement cannot be empty")
143
+
144
+ try:
145
+ # We need to do certain transformations on the sql create statement:
146
+ # - table names are not quoted
147
+ # - column expression is not quoted
148
+ # - sql parser fails if partition colums quoted
149
+ fixed_sql = self._fix_sql_partitioning(sql)
150
+ parsed = parse_one(fixed_sql, dialect=Athena)
151
+ except ParseError as e:
152
+ raise AthenaPropertiesExtractionError(f"Failed to parse SQL: {e}") from e
153
+ except Exception as e:
154
+ raise AthenaPropertiesExtractionError(
155
+ f"Unexpected error during SQL parsing: {e}"
156
+ ) from e
157
+
158
+ try:
159
+ partition_info = self._extract_partition_info(parsed)
160
+ table_properties = self._extract_table_properties(parsed)
161
+ row_format = self._extract_row_format(parsed)
162
+
163
+ return AthenaTableInfo(
164
+ partition_info=partition_info,
165
+ table_properties=table_properties,
166
+ row_format=row_format,
167
+ )
168
+ except Exception as e:
169
+ raise AthenaPropertiesExtractionError(
170
+ f"Failed to extract table properties: {e}"
171
+ ) from e
172
+
173
+ @staticmethod
174
+ def format_column_definition(line):
175
+ # Use regex to parse the line more accurately
176
+ # Pattern: column_name data_type [COMMENT comment_text] [,]
177
+ # Improved pattern to better separate column name, data type, and comment
178
+ pattern = r"^\s*([`\w']+)\s+([\w<>\[\](),\s]+?)(\s+COMMENT\s+(.+?))?(,?)\s*$"
179
+ match = re.match(pattern, line.strip(), re.IGNORECASE)
180
+
181
+ if not match:
182
+ return line
183
+ column_name = match.group(1).strip()
184
+ data_type = match.group(2).strip()
185
+ comment_part = match.group(4) # COMMENT part
186
+ trailing_comma = match.group(5) if match.group(5) else ""
187
+
188
+ # Add backticks to column name if not already present
189
+ if not (column_name.startswith("`") and column_name.endswith("`")):
190
+ column_name = f"`{column_name}`"
191
+
192
+ # Build the result
193
+ result_parts = [column_name, data_type]
194
+
195
+ if comment_part:
196
+ comment_part = comment_part.strip()
197
+
198
+ # Handle comment quoting and escaping
199
+ if comment_part.startswith("'") and comment_part.endswith("'"):
200
+ # Already single quoted - but check for proper escaping
201
+ inner_content = comment_part[1:-1]
202
+ # Re-escape any single quotes that aren't properly escaped
203
+ escaped_content = inner_content.replace("'", "''")
204
+ formatted_comment = f"'{escaped_content}'"
205
+ elif comment_part.startswith('"') and comment_part.endswith('"'):
206
+ # Double quoted - convert to single quotes and escape internal single quotes
207
+ inner_content = comment_part[1:-1]
208
+ escaped_content = inner_content.replace("'", "''")
209
+ formatted_comment = f"'{escaped_content}'"
210
+ else:
211
+ # Not quoted - use double quotes to avoid escaping issues with single quotes
212
+ formatted_comment = f'"{comment_part}"'
213
+
214
+ result_parts.extend(["COMMENT", formatted_comment])
215
+
216
+ result = " " + " ".join(result_parts) + trailing_comma
217
+
218
+ return result
219
+
220
+ @staticmethod
221
+ def format_athena_column_definitions(sql_statement: str) -> str:
222
+ """
223
+ Format Athena CREATE TABLE statement by:
224
+ 1. Adding backticks around column names in column definitions (only in the main table definition)
225
+ 2. Quoting comments (if any exist)
226
+ """
227
+ lines = sql_statement.split("\n")
228
+ formatted_lines = []
229
+
230
+ in_column_definition = False
231
+
232
+ for line in lines:
233
+ stripped_line = line.strip()
234
+
235
+ # Check if we're entering column definitions
236
+ if "CREATE TABLE" in line.upper() and "(" in line:
237
+ in_column_definition = True
238
+ formatted_lines.append(line)
239
+ continue
240
+
241
+ # Skip processing PARTITIONED BY clauses as column definitions
242
+ if in_column_definition and "PARTITIONED BY" in line.upper():
243
+ formatted_lines.append(line)
244
+ continue
245
+
246
+ # Process column definitions first, then check for exit condition
247
+ if in_column_definition and stripped_line:
248
+ # Check if this line contains a column definition (before the closing paren)
249
+ if ")" in line:
250
+ # Split the line at the closing parenthesis
251
+ paren_index = line.find(")")
252
+ column_part = line[:paren_index].strip()
253
+ closing_part = line[paren_index:]
254
+
255
+ if column_part:
256
+ # Format the column part
257
+ formatted_column = (
258
+ AthenaPropertiesExtractor.format_column_definition(
259
+ column_part
260
+ )
261
+ )
262
+ # Reconstruct the line
263
+ formatted_line = formatted_column.rstrip() + closing_part
264
+ formatted_lines.append(formatted_line)
265
+ else:
266
+ formatted_lines.append(line)
267
+ in_column_definition = False
268
+ else:
269
+ # Regular column definition line
270
+ formatted_line = AthenaPropertiesExtractor.format_column_definition(
271
+ line
272
+ )
273
+ formatted_lines.append(formatted_line)
274
+ else:
275
+ # For all other lines, keep as-is
276
+ formatted_lines.append(line)
277
+
278
+ return "\n".join(formatted_lines)
279
+
280
+ @staticmethod
281
+ def _fix_sql_partitioning(sql: str) -> str:
282
+ """Fix SQL partitioning by removing backticks from partition expressions and quoting table names.
283
+
284
+ Args:
285
+ sql: The SQL statement to fix
286
+
287
+ Returns:
288
+ The fixed SQL statement
289
+ """
290
+ if not sql:
291
+ return sql
292
+
293
+ # Quote table name
294
+ table_name_match = AthenaPropertiesExtractor.CREATE_TABLE_REGEXP.search(sql)
295
+
296
+ if table_name_match:
297
+ table_name = table_name_match.group(2).strip()
298
+ if table_name and not (table_name.startswith("`") or "`" in table_name):
299
+ # Split on dots and quote each part
300
+ quoted_parts = [
301
+ f"`{part.strip()}`"
302
+ for part in table_name.split(".")
303
+ if part.strip()
304
+ ]
305
+ if quoted_parts:
306
+ quoted_table = ".".join(quoted_parts)
307
+ create_part = table_name_match.group(0).replace(
308
+ table_name, quoted_table
309
+ )
310
+ sql = sql.replace(table_name_match.group(0), create_part)
311
+
312
+ # Fix partition expressions
313
+ partition_match = AthenaPropertiesExtractor.PARTITIONED_BY_REGEXP.search(sql)
314
+
315
+ if partition_match:
316
+ partition_section = partition_match.group(2)
317
+ if partition_section:
318
+ partition_section_modified = partition_section.replace("`", "")
319
+ sql = sql.replace(partition_section, partition_section_modified)
320
+
321
+ return AthenaPropertiesExtractor.format_athena_column_definitions(sql)
322
+
323
+ @staticmethod
324
+ def _extract_column_types(create_expr: Create) -> Dict[str, str]:
325
+ """Extract column types from a CREATE TABLE expression.
326
+
327
+ Args:
328
+ create_expr: The CREATE TABLE expression to extract types from
329
+
330
+ Returns:
331
+ A dictionary mapping column names to their types
332
+ """
333
+ column_types: Dict[str, str] = {}
334
+
335
+ if not create_expr.this or not hasattr(create_expr.this, "expressions"):
336
+ return column_types
337
+
338
+ try:
339
+ for expr in create_expr.this.expressions:
340
+ if isinstance(expr, ColumnDef) and expr.this:
341
+ column_types[expr.name] = str(expr.kind)
342
+ except Exception:
343
+ # If we can't extract column types, return empty dict
344
+ pass
345
+
346
+ return column_types
347
+
348
+ @staticmethod
349
+ def _create_column_info(column_name: str, column_type: str) -> ColumnInfo:
350
+ """Create a column info object.
351
+
352
+ Args:
353
+ column_name: Name of the column
354
+ column_type: Type of the column
355
+
356
+ Returns:
357
+ A ColumnInfo object
358
+ """
359
+ return ColumnInfo(
360
+ name=str(column_name) if column_name else "unknown",
361
+ type=column_type if column_type else "unknown",
362
+ )
363
+
364
+ @staticmethod
365
+ def _handle_function_expression(
366
+ expr: Identifier, column_types: Dict[str, str]
367
+ ) -> Tuple[ColumnInfo, TransformInfo]:
368
+ """Handle function expressions like day(event_timestamp).
369
+
370
+ Args:
371
+ expr: The function expression to handle
372
+ column_types: Dictionary of column types
373
+
374
+ Returns:
375
+ A tuple of (column_info, transform_info)
376
+ """
377
+ func_str = str(expr)
378
+
379
+ if "(" not in func_str or ")" not in func_str:
380
+ # Fallback for malformed function expressions
381
+ column_info = AthenaPropertiesExtractor._create_column_info(
382
+ func_str, "unknown"
383
+ )
384
+ transform_info = TransformInfo(type="unknown", column=column_info)
385
+ return column_info, transform_info
386
+
387
+ try:
388
+ func_name = func_str.split("(")[0].lower()
389
+ column_part = func_str.split("(")[1].split(")")[0].strip("`")
390
+
391
+ column_info = AthenaPropertiesExtractor._create_column_info(
392
+ column_part, column_types.get(column_part, "unknown")
393
+ )
394
+ transform_info = TransformInfo(type=func_name, column=column_info)
395
+
396
+ return column_info, transform_info
397
+ except (IndexError, AttributeError):
398
+ # Fallback for parsing errors
399
+ column_info = AthenaPropertiesExtractor._create_column_info(
400
+ func_str, "unknown"
401
+ )
402
+ transform_info = TransformInfo(type="unknown", column=column_info)
403
+ return column_info, transform_info
404
+
405
    @staticmethod
    def _handle_time_function(
        expr: Union[Year, Month, Day], column_types: Dict[str, str]
    ) -> Tuple[ColumnInfo, TransformInfo]:
        """Handle time-based functions like year, month, day.

        Args:
            expr: The time function expression to handle
            column_types: Dictionary of column types

        Returns:
            A tuple of (column_info, transform_info); falls back to
            ("unknown", "unknown") when the expression tree cannot be navigated
        """
        try:
            # Navigate the expression tree safely.
            # The column may be wrapped at varying depths (presumably
            # node(Column(Identifier)) vs node(Identifier) — TODO confirm
            # against sqlglot), so probe up to three levels of `.this` and
            # stringify the deepest non-empty one.
            column_name = "unknown"
            if hasattr(expr, "this") and expr.this:
                if hasattr(expr.this, "this") and expr.this.this:
                    if hasattr(expr.this.this, "this") and expr.this.this.this:
                        column_name = str(expr.this.this.this)
                    else:
                        column_name = str(expr.this.this)
                else:
                    column_name = str(expr.this)

            column_info = AthenaPropertiesExtractor._create_column_info(
                column_name, column_types.get(column_name, "unknown")
            )
            # Transform name comes from the node class: Year -> "year", etc.
            transform_info = TransformInfo(
                type=expr.__class__.__name__.lower(), column=column_info
            )

            return column_info, transform_info
        except (AttributeError, TypeError):
            # Fallback for navigation errors
            column_info = AthenaPropertiesExtractor._create_column_info(
                "unknown", "unknown"
            )
            transform_info = TransformInfo(type="unknown", column=column_info)
            return column_info, transform_info
445
+
446
    @staticmethod
    def _handle_transform_function(
        expr: Anonymous, column_types: Dict[str, str]
    ) -> Tuple[ColumnInfo, TransformInfo]:
        """Handle transform functions like bucket, hour, truncate.

        Args:
            expr: The transform function expression to handle
            column_types: Dictionary of column types

        Returns:
            A tuple of (column_info, transform_info); on any parsing error the
            fallback ("unknown", "unknown") pair is returned instead
        """
        try:
            # Safely extract column name from the last expression.
            # For calls like bucket(16, col) the column is the final argument;
            # probe up to two levels of `.this` to reach the identifier text.
            column_name = "unknown"
            if (
                hasattr(expr, "expressions")
                and expr.expressions
                and len(expr.expressions) > 0
            ):
                last_expr = expr.expressions[-1]
                if hasattr(last_expr, "this") and last_expr.this:
                    if hasattr(last_expr.this, "this") and last_expr.this.this:
                        column_name = str(last_expr.this.this)
                    else:
                        column_name = str(last_expr.this)

            column_info = AthenaPropertiesExtractor._create_column_info(
                column_name, column_types.get(column_name, "unknown")
            )

            # The transform name is the function name node itself (lowercased).
            transform_type = str(expr.this).lower() if expr.this else "unknown"
            transform_info = TransformInfo(type=transform_type, column=column_info)

            # Add transform-specific parameters safely:
            # the first argument carries the bucket count / truncate length.
            if (
                transform_type == "bucket"
                and hasattr(expr, "expressions")
                and expr.expressions
                and len(expr.expressions) > 0
            ):
                first_expr = expr.expressions[0]
                if hasattr(first_expr, "this"):
                    transform_info.bucket_count = first_expr.this
            elif (
                transform_type == "truncate"
                and hasattr(expr, "expressions")
                and expr.expressions
                and len(expr.expressions) > 0
            ):
                first_expr = expr.expressions[0]
                if hasattr(first_expr, "this"):
                    transform_info.length = first_expr.this

            return column_info, transform_info
        except (AttributeError, TypeError, IndexError):
            # Fallback for any parsing errors
            column_info = AthenaPropertiesExtractor._create_column_info(
                "unknown", "unknown"
            )
            transform_info = TransformInfo(type="unknown", column=column_info)
            return column_info, transform_info
509
+
510
    def _extract_partition_info(self, parsed: Expression) -> PartitionInfo:
        """Extract partitioning information from the parsed SQL statement.

        Args:
            parsed: The parsed SQL expression

        Returns:
            A PartitionInfo object containing simple columns and transforms;
            empty lists when there is no PARTITIONED BY clause or it cannot
            be read
        """
        # Get the PARTITIONED BY expression
        partition_by_expr: Optional[Schema] = None

        try:
            for prop in parsed.find_all(Property):
                if isinstance(prop, PartitionedByProperty):
                    partition_by_expr = prop.this
                    break
        except Exception:
            # If we can't find properties, return empty result
            return PartitionInfo(simple_columns=[], transforms=[])

        if not partition_by_expr:
            return PartitionInfo(simple_columns=[], transforms=[])

        # Extract partitioning columns and transforms
        simple_columns: List[ColumnInfo] = []
        transforms: List[TransformInfo] = []

        # Get column types from the table definition, so partition columns can
        # be reported with their declared types instead of "unknown".
        column_types: Dict[str, str] = {}
        if isinstance(parsed, Create):
            column_types = self._extract_column_types(parsed)

        # Process each expression in the PARTITIONED BY clause, dispatching on
        # the node kind produced by the parser.
        if hasattr(partition_by_expr, "expressions") and partition_by_expr.expressions:
            for expr in partition_by_expr.expressions:
                try:
                    # Identifier whose text looks like a call, e.g. `day(ts)`:
                    # parse it textually as a function-style transform.
                    if isinstance(expr, Identifier) and "(" in str(expr):
                        column_info, transform_info = self._handle_function_expression(
                            expr, column_types
                        )
                        simple_columns.append(column_info)
                        transforms.append(transform_info)
                    # Dedicated truncate(col, length) node.
                    elif isinstance(expr, PartitionByTruncate):
                        column_info = AthenaPropertiesExtractor._create_column_info(
                            str(expr.this), column_types.get(str(expr.this), "unknown")
                        )

                        expression = expr.args.get("expression")
                        transform_info = TransformInfo(
                            type="truncate",
                            column=column_info,
                            length=int(expression.name)
                            if expression and expression.name
                            else None,
                        )
                        transforms.append(transform_info)
                        simple_columns.append(column_info)
                    # Dedicated bucket(col, count) node.
                    elif isinstance(expr, PartitionedByBucket):
                        column_info = AthenaPropertiesExtractor._create_column_info(
                            str(expr.this), column_types.get(str(expr.this), "unknown")
                        )
                        expression = expr.args.get("expression")
                        transform_info = TransformInfo(
                            type="bucket",
                            column=column_info,
                            bucket_count=int(expression.name)
                            if expression and expression.name
                            else None,
                        )
                        simple_columns.append(column_info)
                        transforms.append(transform_info)
                    # year()/month()/day() time-projection nodes.
                    elif isinstance(expr, (Year, Month, Day)):
                        column_info, transform_info = self._handle_time_function(
                            expr, column_types
                        )
                        transforms.append(transform_info)
                        simple_columns.append(column_info)
                    # Unrecognized function calls kept as Anonymous nodes, but
                    # only the transforms we know how to interpret.
                    elif (
                        isinstance(expr, Anonymous)
                        and expr.this
                        and str(expr.this).lower() in ["bucket", "hour", "truncate"]
                    ):
                        column_info, transform_info = self._handle_transform_function(
                            expr, column_types
                        )
                        transforms.append(transform_info)
                        simple_columns.append(column_info)
                    # Anything else with a `.this` is treated as a plain
                    # partition column (no transform).
                    elif hasattr(expr, "this") and expr.this:
                        column_name = str(expr.this)
                        column_info = self._create_column_info(
                            column_name, column_types.get(column_name, "unknown")
                        )
                        simple_columns.append(column_info)
                except Exception:
                    # Skip problematic expressions rather than failing completely
                    continue

        # Remove duplicates from simple_columns while preserving order
        seen_names: Set[str] = set()
        unique_simple_columns: List[ColumnInfo] = []

        for col in simple_columns:
            if col.name and col.name not in seen_names:
                seen_names.add(col.name)
                unique_simple_columns.append(col)

        return PartitionInfo(
            simple_columns=unique_simple_columns, transforms=transforms
        )
620
+
621
    def _extract_table_properties(self, parsed: Expression) -> TableProperties:
        """Extract table properties from the parsed SQL statement.

        Dispatches on the property node type: location, file format, comment,
        SERDE properties and row format are captured in dedicated fields;
        everything else lands in additional_properties.

        Args:
            parsed: The parsed SQL expression

        Returns:
            A TableProperties object (empty when properties cannot be read)
        """
        location: Optional[str] = None
        format_prop: Optional[str] = None
        comment: Optional[str] = None
        serde_properties: Optional[Dict[str, str]] = None
        row_format: Optional[Dict[str, str]] = None
        additional_properties: Dict[str, str] = {}

        try:
            props = list(parsed.find_all(Property))
        except Exception:
            return TableProperties()

        for prop in props:
            try:
                if isinstance(prop, LocationProperty):
                    location = self._safe_get_property_value(prop)

                elif isinstance(prop, FileFormatProperty):
                    format_prop = self._safe_get_property_value(prop)

                elif isinstance(prop, SchemaCommentProperty):
                    comment = self._safe_get_property_value(prop)

                elif isinstance(prop, PartitionedByProperty):
                    continue  # Skip partition properties here

                elif isinstance(prop, SerdeProperties):
                    serde_props = self._extract_serde_properties(prop)
                    if serde_props:
                        serde_properties = serde_props

                elif isinstance(prop, RowFormatDelimitedProperty):
                    row_format_props = self._extract_row_format_properties(prop)
                    if row_format_props:
                        row_format = row_format_props

                else:
                    # Handle generic properties; keys already captured as
                    # SERDE properties are not duplicated here.
                    key, value = self._extract_generic_property(prop)
                    if (
                        key
                        and value
                        and (not serde_properties or key not in serde_properties)
                    ):
                        additional_properties[key] = value

            except Exception:
                # Skip problematic properties rather than failing completely
                continue

        # Fall back to an "external_location" generic property when no
        # LOCATION clause was present, promoting it to the location field.
        if (
            not location
            and additional_properties
            and additional_properties.get("external_location")
        ):
            location = additional_properties.pop("external_location")

        return TableProperties(
            location=location,
            format=format_prop,
            comment=comment,
            serde_properties=serde_properties,
            row_format=row_format,
            additional_properties=additional_properties
            if additional_properties
            else None,
        )
697
+
698
+ def _safe_get_property_value(self, prop: Property) -> Optional[str]:
699
+ """Safely extract value from a property."""
700
+ try:
701
+ if (
702
+ hasattr(prop, "args")
703
+ and "this" in prop.args
704
+ and prop.args["this"]
705
+ and hasattr(prop.args["this"], "name")
706
+ ):
707
+ return prop.args["this"].name
708
+ except (AttributeError, KeyError, TypeError):
709
+ pass
710
+ return None
711
+
712
+ def _extract_serde_properties(self, prop: SerdeProperties) -> Dict[str, str]:
713
+ """Extract SERDE properties safely."""
714
+ serde_props: Dict[str, str] = {}
715
+ try:
716
+ if hasattr(prop, "expressions") and prop.expressions:
717
+ for exp in prop.expressions:
718
+ if (
719
+ hasattr(exp, "name")
720
+ and hasattr(exp, "args")
721
+ and "value" in exp.args
722
+ and exp.args["value"]
723
+ and hasattr(exp.args["value"], "name")
724
+ ):
725
+ serde_props[exp.name] = exp.args["value"].name
726
+ except Exception:
727
+ pass
728
+ return serde_props
729
+
730
+ def _extract_row_format_properties(
731
+ self, prop: RowFormatDelimitedProperty
732
+ ) -> Dict[str, str]:
733
+ """Extract row format properties safely."""
734
+ row_format: Dict[str, str] = {}
735
+ try:
736
+ if hasattr(prop, "args") and prop.args:
737
+ for key, value in prop.args.items():
738
+ if hasattr(value, "this"):
739
+ row_format[key] = str(value.this)
740
+ else:
741
+ row_format[key] = str(value)
742
+ except Exception:
743
+ pass
744
+ return row_format
745
+
746
+ def _extract_generic_property(
747
+ self, prop: Property
748
+ ) -> Tuple[Optional[str], Optional[str]]:
749
+ """Extract key-value pair from generic property."""
750
+ try:
751
+ if (
752
+ hasattr(prop, "args")
753
+ and "this" in prop.args
754
+ and prop.args["this"]
755
+ and hasattr(prop.args["this"], "name")
756
+ and "value" in prop.args
757
+ and prop.args["value"]
758
+ and hasattr(prop.args["value"], "name")
759
+ ):
760
+ key = prop.args["this"].name.lower()
761
+ value = prop.args["value"].name
762
+ return key, value
763
+ except (AttributeError, KeyError, TypeError):
764
+ pass
765
+ return None, None
766
+
767
+ def _extract_row_format(self, parsed: Expression) -> RowFormatInfo:
768
+ """Extract and format RowFormatDelimitedProperty.
769
+
770
+ Args:
771
+ parsed: The parsed SQL expression
772
+
773
+ Returns:
774
+ A RowFormatInfo object
775
+ """
776
+ row_format_props: Dict[str, str] = {}
777
+
778
+ try:
779
+ props = parsed.find_all(Property)
780
+ for prop in props:
781
+ if isinstance(prop, RowFormatDelimitedProperty):
782
+ row_format_props = self._extract_row_format_properties(prop)
783
+ break
784
+ except Exception:
785
+ pass
786
+
787
+ if row_format_props:
788
+ try:
789
+ json_formatted = json.dumps(row_format_props, indent=2)
790
+ except (TypeError, ValueError):
791
+ json_formatted = "Error formatting row format properties"
792
+ else:
793
+ json_formatted = "No RowFormatDelimitedProperty found"
794
+
795
+ return RowFormatInfo(properties=row_format_props, json_formatted=json_formatted)