acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/athena_properties_extractor.py (new file)
@@ -0,0 +1,777 @@
+ """
+ Athena Properties Extractor - A robust tool for parsing CREATE TABLE statements.
+
+ This module provides functionality to extract properties, partitioning information,
+ and row format details from Athena CREATE TABLE SQL statements.
+ """
+
+ import json
+ import re
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Set, Tuple, Union
+
+ from sqlglot import ParseError, parse_one
+ from sqlglot.dialects.athena import Athena
+ from sqlglot.expressions import (
+     Anonymous,
+     ColumnDef,
+     Create,
+     Day,
+     Expression,
+     FileFormatProperty,
+     Identifier,
+     LocationProperty,
+     Month,
+     PartitionByTruncate,
+     PartitionedByBucket,
+     PartitionedByProperty,
+     Property,
+     RowFormatDelimitedProperty,
+     Schema,
+     SchemaCommentProperty,
+     SerdeProperties,
+     Year,
+ )
+
+
+ class AthenaPropertiesExtractionError(Exception):
+     """Custom exception for Athena properties extraction errors."""
+
+     pass
+
+
+ @dataclass
+ class ColumnInfo:
+     """Information about a table column."""
+
+     name: str
+     type: str
+
+
+ @dataclass
+ class TransformInfo:
+     """Information about a partition transform."""
+
+     type: str
+     column: ColumnInfo
+     bucket_count: Optional[int] = None
+     length: Optional[int] = None
+
+
+ @dataclass
+ class PartitionInfo:
+     """Information about table partitioning."""
+
+     simple_columns: List[ColumnInfo]
+     transforms: List[TransformInfo]
+
+
+ @dataclass
+ class TableProperties:
+     """General table properties."""
+
+     location: Optional[str] = None
+     format: Optional[str] = None
+     comment: Optional[str] = None
+     serde_properties: Optional[Dict[str, str]] = None
+     row_format: Optional[Dict[str, str]] = None
+     additional_properties: Optional[Dict[str, str]] = None
+
+
+ @dataclass
+ class RowFormatInfo:
+     """Row format information."""
+
+     properties: Dict[str, str]
+     json_formatted: str
+
+
+ @dataclass
+ class AthenaTableInfo:
+     """Complete information about an Athena table."""
+
+     partition_info: PartitionInfo
+     table_properties: TableProperties
+     row_format: RowFormatInfo
+
+
+ class AthenaPropertiesExtractor:
+     """A class to extract properties from Athena CREATE TABLE statements."""
+
+     CREATE_TABLE_REGEXP = re.compile(
+         r"(CREATE TABLE[\s\n]*)(.*?)(\s*\()", re.MULTILINE | re.IGNORECASE
+     )
+     PARTITIONED_BY_REGEXP = re.compile(
+         r"(PARTITIONED BY[\s\n]*\()((?:[^()]|\([^)]*\))*?)(\))",
+         re.MULTILINE | re.IGNORECASE,
+     )
+
+     def __init__(self) -> None:
+         """Initialize the extractor."""
+         pass
+
+     @staticmethod
+     def get_table_properties(sql: str) -> AthenaTableInfo:
+         """Get all table properties from a SQL statement.
+
+         Args:
+             sql: The SQL statement to parse
+
+         Returns:
+             An AthenaTableInfo object containing all table properties
+
+         Raises:
+             AthenaPropertiesExtractionError: If extraction fails
+         """
+         extractor = AthenaPropertiesExtractor()
+         return extractor._extract_all_properties(sql)
+
+     def _extract_all_properties(self, sql: str) -> AthenaTableInfo:
+         """Extract all properties from a SQL statement.
+
+         Args:
+             sql: The SQL statement to parse
+
+         Returns:
+             An AthenaTableInfo object containing all properties
+
+         Raises:
+             AthenaPropertiesExtractionError: If extraction fails
+         """
+         if not sql or not sql.strip():
+             raise AthenaPropertiesExtractionError("SQL statement cannot be empty")
+
+         try:
+             # We need to do certain transformations on the sql create statement:
+             # - table names are not quoted
+             # - column expression is not quoted
+             # - sql parser fails if partition columns are quoted
+             fixed_sql = self._fix_sql_partitioning(sql)
+             parsed = parse_one(fixed_sql, dialect=Athena)
+         except ParseError as e:
+             raise AthenaPropertiesExtractionError(f"Failed to parse SQL: {e}") from e
+         except Exception as e:
+             raise AthenaPropertiesExtractionError(
+                 f"Unexpected error during SQL parsing: {e}"
+             ) from e
+
+         try:
+             partition_info = self._extract_partition_info(parsed)
+             table_properties = self._extract_table_properties(parsed)
+             row_format = self._extract_row_format(parsed)
+
+             return AthenaTableInfo(
+                 partition_info=partition_info,
+                 table_properties=table_properties,
+                 row_format=row_format,
+             )
+         except Exception as e:
+             raise AthenaPropertiesExtractionError(
+                 f"Failed to extract table properties: {e}"
+             ) from e
+
+     @staticmethod
+     def format_column_definition(line: str) -> str:
+         # Use regex to parse the line more accurately
+         # Pattern: column_name data_type [COMMENT comment_text] [,]
+         # Use greedy match for comment to capture everything until trailing comma
+         pattern = r"^\s*(.+?)\s+([\s,\w<>\[\]]+)((\s+COMMENT\s+(.+?)(,?))|(,?)\s*)?$"
+         match = re.match(pattern, line, re.IGNORECASE)
+
+         if not match:
+             return line
+         column_name = match.group(1)
+         data_type = match.group(2)
+         comment_part = match.group(5)  # COMMENT part
+         # the number of populated match groups differs depending on whether a comment exists
+         if comment_part:
+             trailing_comma = match.group(6) if match.group(6) else ""
+         else:
+             trailing_comma = match.group(7) if match.group(7) else ""
+
+         # Add backticks to column name if not already present
+         if not (column_name.startswith("`") and column_name.endswith("`")):
+             column_name = f"`{column_name}`"
+
+         # Build the result
+         result_parts = [column_name, data_type]
+
+         if comment_part:
+             comment_part = comment_part.strip()
+
+             # Handle comment quoting and escaping
+             if comment_part.startswith("'") and comment_part.endswith("'"):
+                 # Already properly single quoted - keep as is
+                 formatted_comment = comment_part
+             elif comment_part.startswith('"') and comment_part.endswith('"'):
+                 # Double quoted - convert to single quotes and escape internal single quotes
+                 inner_content = comment_part[1:-1]
+                 escaped_content = inner_content.replace("'", "''")
+                 formatted_comment = f"'{escaped_content}'"
+             else:
+                 # Not quoted - add quotes and escape any single quotes
+                 escaped_content = comment_part.replace("'", "''")
+                 formatted_comment = f"'{escaped_content}'"
+
+             result_parts.extend(["COMMENT", formatted_comment])
+
+         result = " " + " ".join(result_parts) + trailing_comma
+
+         return result
+
+     @staticmethod
+     def format_athena_column_definitions(sql_statement: str) -> str:
+         """
+         Format Athena CREATE TABLE statement by:
+         1. Adding backticks around column names in column definitions (only in the main table definition)
+         2. Quoting comments (if any exist)
+         """
+         lines = sql_statement.split("\n")
+         formatted_lines = []
+
+         in_column_definition = False
+
+         for line in lines:
+             stripped_line = line.strip()
+
+             # Check if we're entering column definitions
+             if "CREATE TABLE" in line.upper() and "(" in line:
+                 in_column_definition = True
+                 formatted_lines.append(line)
+                 continue
+
+             # Check if we're exiting column definitions (closing parenthesis before PARTITIONED BY or end)
+             if in_column_definition and ")" in line:
+                 in_column_definition = False
+                 formatted_lines.append(line)
+                 continue
+
+             # Process only column definitions (not PARTITIONED BY or other sections)
+             if in_column_definition and stripped_line:
+                 # Match column definition pattern and format it
+                 formatted_line = AthenaPropertiesExtractor.format_column_definition(
+                     line
+                 )
+                 formatted_lines.append(formatted_line)
+             else:
+                 # For all other lines, keep as-is
+                 formatted_lines.append(line)
+
+         return "\n".join(formatted_lines)
+
+     @staticmethod
+     def _fix_sql_partitioning(sql: str) -> str:
+         """Fix SQL partitioning by removing backticks from partition expressions and quoting table names.
+
+         Args:
+             sql: The SQL statement to fix
+
+         Returns:
+             The fixed SQL statement
+         """
+         if not sql:
+             return sql
+
+         # Quote table name
+         table_name_match = AthenaPropertiesExtractor.CREATE_TABLE_REGEXP.search(sql)
+
+         if table_name_match:
+             table_name = table_name_match.group(2).strip()
+             if table_name and not (table_name.startswith("`") or "`" in table_name):
+                 # Split on dots and quote each part
+                 quoted_parts = [
+                     f"`{part.strip()}`"
+                     for part in table_name.split(".")
+                     if part.strip()
+                 ]
+                 if quoted_parts:
+                     quoted_table = ".".join(quoted_parts)
+                     create_part = table_name_match.group(0).replace(
+                         table_name, quoted_table
+                     )
+                     sql = sql.replace(table_name_match.group(0), create_part)
+
+         # Fix partition expressions
+         partition_match = AthenaPropertiesExtractor.PARTITIONED_BY_REGEXP.search(sql)
+
+         if partition_match:
+             partition_section = partition_match.group(2)
+             if partition_section:
+                 partition_section_modified = partition_section.replace("`", "")
+                 sql = sql.replace(partition_section, partition_section_modified)
+
+         return AthenaPropertiesExtractor.format_athena_column_definitions(sql)
+
+     @staticmethod
+     def _extract_column_types(create_expr: Create) -> Dict[str, str]:
+         """Extract column types from a CREATE TABLE expression.
+
+         Args:
+             create_expr: The CREATE TABLE expression to extract types from
+
+         Returns:
+             A dictionary mapping column names to their types
+         """
+         column_types: Dict[str, str] = {}
+
+         if not create_expr.this or not hasattr(create_expr.this, "expressions"):
+             return column_types
+
+         try:
+             for expr in create_expr.this.expressions:
+                 if isinstance(expr, ColumnDef) and expr.this:
+                     column_types[expr.name] = str(expr.kind)
+         except Exception:
+             # If we can't extract column types, return empty dict
+             pass
+
+         return column_types
+
+     @staticmethod
+     def _create_column_info(column_name: str, column_type: str) -> ColumnInfo:
+         """Create a column info object.
+
+         Args:
+             column_name: Name of the column
+             column_type: Type of the column
+
+         Returns:
+             A ColumnInfo object
+         """
+         return ColumnInfo(
+             name=str(column_name) if column_name else "unknown",
+             type=column_type if column_type else "unknown",
+         )
+
+     @staticmethod
+     def _handle_function_expression(
+         expr: Identifier, column_types: Dict[str, str]
+     ) -> Tuple[ColumnInfo, TransformInfo]:
+         """Handle function expressions like day(event_timestamp).
+
+         Args:
+             expr: The function expression to handle
+             column_types: Dictionary of column types
+
+         Returns:
+             A tuple of (column_info, transform_info)
+         """
+         func_str = str(expr)
+
+         if "(" not in func_str or ")" not in func_str:
+             # Fallback for malformed function expressions
+             column_info = AthenaPropertiesExtractor._create_column_info(
+                 func_str, "unknown"
+             )
+             transform_info = TransformInfo(type="unknown", column=column_info)
+             return column_info, transform_info
+
+         try:
+             func_name = func_str.split("(")[0].lower()
+             column_part = func_str.split("(")[1].split(")")[0].strip("`")
+
+             column_info = AthenaPropertiesExtractor._create_column_info(
+                 column_part, column_types.get(column_part, "unknown")
+             )
+             transform_info = TransformInfo(type=func_name, column=column_info)
+
+             return column_info, transform_info
+         except (IndexError, AttributeError):
+             # Fallback for parsing errors
+             column_info = AthenaPropertiesExtractor._create_column_info(
+                 func_str, "unknown"
+             )
+             transform_info = TransformInfo(type="unknown", column=column_info)
+             return column_info, transform_info
+
+     @staticmethod
+     def _handle_time_function(
+         expr: Union[Year, Month, Day], column_types: Dict[str, str]
+     ) -> Tuple[ColumnInfo, TransformInfo]:
+         """Handle time-based functions like year, month, day.
+
+         Args:
+             expr: The time function expression to handle
+             column_types: Dictionary of column types
+
+         Returns:
+             A tuple of (column_info, transform_info)
+         """
+         try:
+             # Navigate the expression tree safely
+             column_name = "unknown"
+             if hasattr(expr, "this") and expr.this:
+                 if hasattr(expr.this, "this") and expr.this.this:
+                     if hasattr(expr.this.this, "this") and expr.this.this.this:
+                         column_name = str(expr.this.this.this)
+                     else:
+                         column_name = str(expr.this.this)
+                 else:
+                     column_name = str(expr.this)
+
+             column_info = AthenaPropertiesExtractor._create_column_info(
+                 column_name, column_types.get(column_name, "unknown")
+             )
+             transform_info = TransformInfo(
+                 type=expr.__class__.__name__.lower(), column=column_info
+             )
+
+             return column_info, transform_info
+         except (AttributeError, TypeError):
+             # Fallback for navigation errors
+             column_info = AthenaPropertiesExtractor._create_column_info(
+                 "unknown", "unknown"
+             )
+             transform_info = TransformInfo(type="unknown", column=column_info)
+             return column_info, transform_info
+
+     @staticmethod
+     def _handle_transform_function(
+         expr: Anonymous, column_types: Dict[str, str]
+     ) -> Tuple[ColumnInfo, TransformInfo]:
+         """Handle transform functions like bucket, hour, truncate.
+
+         Args:
+             expr: The transform function expression to handle
+             column_types: Dictionary of column types
+
+         Returns:
+             A tuple of (column_info, transform_info)
+         """
+         try:
+             # Safely extract column name from the last expression
+             column_name = "unknown"
+             if (
+                 hasattr(expr, "expressions")
+                 and expr.expressions
+                 and len(expr.expressions) > 0
+             ):
+                 last_expr = expr.expressions[-1]
+                 if hasattr(last_expr, "this") and last_expr.this:
+                     if hasattr(last_expr.this, "this") and last_expr.this.this:
+                         column_name = str(last_expr.this.this)
+                     else:
+                         column_name = str(last_expr.this)
+
+             column_info = AthenaPropertiesExtractor._create_column_info(
+                 column_name, column_types.get(column_name, "unknown")
+             )
+
+             transform_type = str(expr.this).lower() if expr.this else "unknown"
+             transform_info = TransformInfo(type=transform_type, column=column_info)
+
+             # Add transform-specific parameters safely
+             if (
+                 transform_type == "bucket"
+                 and hasattr(expr, "expressions")
+                 and expr.expressions
+                 and len(expr.expressions) > 0
+             ):
+                 first_expr = expr.expressions[0]
+                 if hasattr(first_expr, "this"):
+                     transform_info.bucket_count = first_expr.this
+             elif (
+                 transform_type == "truncate"
+                 and hasattr(expr, "expressions")
+                 and expr.expressions
+                 and len(expr.expressions) > 0
+             ):
+                 first_expr = expr.expressions[0]
+                 if hasattr(first_expr, "this"):
+                     transform_info.length = first_expr.this
+
+             return column_info, transform_info
+         except (AttributeError, TypeError, IndexError):
+             # Fallback for any parsing errors
+             column_info = AthenaPropertiesExtractor._create_column_info(
+                 "unknown", "unknown"
+             )
+             transform_info = TransformInfo(type="unknown", column=column_info)
+             return column_info, transform_info
+
+     def _extract_partition_info(self, parsed: Expression) -> PartitionInfo:
+         """Extract partitioning information from the parsed SQL statement.
+
+         Args:
+             parsed: The parsed SQL expression
+
+         Returns:
+             A PartitionInfo object containing simple columns and transforms
+         """
+         # Get the PARTITIONED BY expression
+         partition_by_expr: Optional[Schema] = None
+
+         try:
+             for prop in parsed.find_all(Property):
+                 if isinstance(prop, PartitionedByProperty):
+                     partition_by_expr = prop.this
+                     break
+         except Exception:
+             # If we can't find properties, return empty result
+             return PartitionInfo(simple_columns=[], transforms=[])
+
+         if not partition_by_expr:
+             return PartitionInfo(simple_columns=[], transforms=[])
+
+         # Extract partitioning columns and transforms
+         simple_columns: List[ColumnInfo] = []
+         transforms: List[TransformInfo] = []
+
+         # Get column types from the table definition
+         column_types: Dict[str, str] = {}
+         if isinstance(parsed, Create):
+             column_types = self._extract_column_types(parsed)
+
+         # Process each expression in the PARTITIONED BY clause
+         if hasattr(partition_by_expr, "expressions") and partition_by_expr.expressions:
+             for expr in partition_by_expr.expressions:
+                 try:
+                     if isinstance(expr, Identifier) and "(" in str(expr):
+                         column_info, transform_info = self._handle_function_expression(
+                             expr, column_types
+                         )
+                         simple_columns.append(column_info)
+                         transforms.append(transform_info)
+                     elif isinstance(expr, PartitionByTruncate):
+                         column_info = AthenaPropertiesExtractor._create_column_info(
+                             str(expr.this), column_types.get(str(expr.this), "unknown")
+                         )
+
+                         expression = expr.args.get("expression")
+                         transform_info = TransformInfo(
+                             type="truncate",
+                             column=column_info,
+                             length=int(expression.name)
+                             if expression and expression.name
+                             else None,
+                         )
+                         transforms.append(transform_info)
+                         simple_columns.append(column_info)
+                     elif isinstance(expr, PartitionedByBucket):
+                         column_info = AthenaPropertiesExtractor._create_column_info(
+                             str(expr.this), column_types.get(str(expr.this), "unknown")
+                         )
+                         expression = expr.args.get("expression")
+                         transform_info = TransformInfo(
+                             type="bucket",
+                             column=column_info,
+                             bucket_count=int(expression.name)
+                             if expression and expression.name
+                             else None,
+                         )
+                         simple_columns.append(column_info)
+                         transforms.append(transform_info)
+                     elif isinstance(expr, (Year, Month, Day)):
+                         column_info, transform_info = self._handle_time_function(
+                             expr, column_types
+                         )
+                         transforms.append(transform_info)
+                         simple_columns.append(column_info)
+                     elif (
+                         isinstance(expr, Anonymous)
+                         and expr.this
+                         and str(expr.this).lower() in ["bucket", "hour", "truncate"]
+                     ):
+                         column_info, transform_info = self._handle_transform_function(
+                             expr, column_types
+                         )
+                         transforms.append(transform_info)
+                         simple_columns.append(column_info)
+                     elif hasattr(expr, "this") and expr.this:
+                         column_name = str(expr.this)
+                         column_info = self._create_column_info(
+                             column_name, column_types.get(column_name, "unknown")
+                         )
+                         simple_columns.append(column_info)
+                 except Exception:
+                     # Skip problematic expressions rather than failing completely
+                     continue
+
+         # Remove duplicates from simple_columns while preserving order
+         seen_names: Set[str] = set()
+         unique_simple_columns: List[ColumnInfo] = []
+
+         for col in simple_columns:
+             if col.name and col.name not in seen_names:
+                 seen_names.add(col.name)
+                 unique_simple_columns.append(col)
+
+         return PartitionInfo(
+             simple_columns=unique_simple_columns, transforms=transforms
+         )
+
+     def _extract_table_properties(self, parsed: Expression) -> TableProperties:
+         """Extract table properties from the parsed SQL statement.
+
+         Args:
+             parsed: The parsed SQL expression
+
+         Returns:
+             A TableProperties object
+         """
+         location: Optional[str] = None
+         format_prop: Optional[str] = None
+         comment: Optional[str] = None
+         serde_properties: Optional[Dict[str, str]] = None
+         row_format: Optional[Dict[str, str]] = None
+         additional_properties: Dict[str, str] = {}
+
+         try:
+             props = list(parsed.find_all(Property))
+         except Exception:
+             return TableProperties()
+
+         for prop in props:
+             try:
+                 if isinstance(prop, LocationProperty):
+                     location = self._safe_get_property_value(prop)
+
+                 elif isinstance(prop, FileFormatProperty):
+                     format_prop = self._safe_get_property_value(prop)
+
+                 elif isinstance(prop, SchemaCommentProperty):
+                     comment = self._safe_get_property_value(prop)
+
+                 elif isinstance(prop, PartitionedByProperty):
+                     continue  # Skip partition properties here
+
+                 elif isinstance(prop, SerdeProperties):
+                     serde_props = self._extract_serde_properties(prop)
+                     if serde_props:
+                         serde_properties = serde_props
+
+                 elif isinstance(prop, RowFormatDelimitedProperty):
+                     row_format_props = self._extract_row_format_properties(prop)
+                     if row_format_props:
+                         row_format = row_format_props
+
+                 else:
+                     # Handle generic properties
+                     key, value = self._extract_generic_property(prop)
+                     if (
+                         key
+                         and value
+                         and (not serde_properties or key not in serde_properties)
+                     ):
+                         additional_properties[key] = value
+
+             except Exception:
+                 # Skip problematic properties rather than failing completely
+                 continue
+
+         if (
+             not location
+             and additional_properties
+             and additional_properties.get("external_location")
+         ):
+             location = additional_properties.pop("external_location")
+
+         return TableProperties(
+             location=location,
+             format=format_prop,
+             comment=comment,
+             serde_properties=serde_properties,
+             row_format=row_format,
+             additional_properties=additional_properties
+             if additional_properties
+             else None,
+         )
+
+     def _safe_get_property_value(self, prop: Property) -> Optional[str]:
+         """Safely extract value from a property."""
+         try:
+             if (
+                 hasattr(prop, "args")
+                 and "this" in prop.args
+                 and prop.args["this"]
+                 and hasattr(prop.args["this"], "name")
+             ):
+                 return prop.args["this"].name
+         except (AttributeError, KeyError, TypeError):
+             pass
+         return None
+
+     def _extract_serde_properties(self, prop: SerdeProperties) -> Dict[str, str]:
+         """Extract SERDE properties safely."""
+         serde_props: Dict[str, str] = {}
+         try:
+             if hasattr(prop, "expressions") and prop.expressions:
+                 for exp in prop.expressions:
+                     if (
+                         hasattr(exp, "name")
+                         and hasattr(exp, "args")
+                         and "value" in exp.args
+                         and exp.args["value"]
+                         and hasattr(exp.args["value"], "name")
+                     ):
+                         serde_props[exp.name] = exp.args["value"].name
+         except Exception:
+             pass
+         return serde_props
+
+     def _extract_row_format_properties(
+         self, prop: RowFormatDelimitedProperty
+     ) -> Dict[str, str]:
+         """Extract row format properties safely."""
+         row_format: Dict[str, str] = {}
+         try:
+             if hasattr(prop, "args") and prop.args:
+                 for key, value in prop.args.items():
+                     if hasattr(value, "this"):
+                         row_format[key] = str(value.this)
+                     else:
+                         row_format[key] = str(value)
+         except Exception:
+             pass
+         return row_format
+
+     def _extract_generic_property(
+         self, prop: Property
+     ) -> Tuple[Optional[str], Optional[str]]:
+         """Extract key-value pair from generic property."""
+         try:
+             if (
+                 hasattr(prop, "args")
+                 and "this" in prop.args
+                 and prop.args["this"]
+                 and hasattr(prop.args["this"], "name")
+                 and "value" in prop.args
+                 and prop.args["value"]
+                 and hasattr(prop.args["value"], "name")
+             ):
+                 key = prop.args["this"].name.lower()
+                 value = prop.args["value"].name
+                 return key, value
+         except (AttributeError, KeyError, TypeError):
+             pass
+         return None, None
+
+     def _extract_row_format(self, parsed: Expression) -> RowFormatInfo:
+         """Extract and format RowFormatDelimitedProperty.
+
+         Args:
+             parsed: The parsed SQL expression
+
+         Returns:
+             A RowFormatInfo object
+         """
+         row_format_props: Dict[str, str] = {}
+
+         try:
+             props = parsed.find_all(Property)
+             for prop in props:
+                 if isinstance(prop, RowFormatDelimitedProperty):
+                     row_format_props = self._extract_row_format_properties(prop)
+                     break
+         except Exception:
+             pass
+
+         if row_format_props:
+             try:
+                 json_formatted = json.dumps(row_format_props, indent=2)
+             except (TypeError, ValueError):
+                 json_formatted = "Error formatting row format properties"
+         else:
+             json_formatted = "No RowFormatDelimitedProperty found"
+
+         return RowFormatInfo(properties=row_format_props, json_formatted=json_formatted)
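
For orientation, below is a minimal sketch of how the new extractor might be invoked. The entry point AthenaPropertiesExtractor.get_table_properties and the AthenaTableInfo fields are taken from the file above; the DDL statement, database/table names, and S3 location are invented for illustration, and whether a given statement parses cleanly depends on the installed sqlglot version.

    from datahub.ingestion.source.sql.athena_properties_extractor import (
        AthenaPropertiesExtractor,
    )

    # Hypothetical Athena DDL, invented for illustration.
    ddl = """
    CREATE TABLE analytics.events (
      event_id string COMMENT 'unique event id',
      event_ts timestamp,
      region string
    )
    PARTITIONED BY (day(event_ts), region)
    LOCATION 's3://example-bucket/events/'
    TBLPROPERTIES ('table_type'='ICEBERG')
    """

    info = AthenaPropertiesExtractor.get_table_properties(ddl)

    # General properties: location, format, comment, serde/row-format, extras.
    print(info.table_properties.location)

    # Partitioning: plain partition columns plus transform metadata
    # (bucket/truncate carry bucket_count/length; year/month/day/hour do not).
    for col in info.partition_info.simple_columns:
        print(col.name, col.type)
    for t in info.partition_info.transforms:
        print(t.type, t.column.name, t.bucket_count, t.length)

Parse and extraction failures are raised as AthenaPropertiesExtractionError, so callers can treat unparseable DDL as a recoverable error rather than a crash.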