acryl-datahub 1.1.0.5rc5__py3-none-any.whl → 1.1.0.5rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/METADATA +2603 -2603
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/RECORD +43 -39
- datahub/_version.py +1 -1
- datahub/ingestion/api/report.py +183 -35
- datahub/ingestion/autogenerated/capability_summary.json +3366 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +30 -128
- datahub/ingestion/run/pipeline.py +4 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +23 -22
- datahub/ingestion/source/bigquery_v2/queries.py +2 -2
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/data_lake_common/object_store.py +40 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/dremio/dremio_source.py +6 -3
- datahub/ingestion/source/gcs/gcs_source.py +4 -1
- datahub/ingestion/source/ge_data_profiler.py +28 -20
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/snowflake/snowflake_queries.py +47 -3
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +95 -10
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/unity/proxy.py +4 -3
- datahub/ingestion/source/unity/source.py +10 -8
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +85 -4
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
- datahub/metadata/schema.avsc +54 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
- datahub/sdk/lineage_client.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
- datahub/sql_parsing/sqlglot_lineage.py +2 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -44,6 +44,11 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
     SnowflakeIdentifierBuilder,
     SnowflakeStructuredReportMixin,
 )
+from datahub.ingestion.source.snowflake.stored_proc_lineage import (
+    StoredProcCall,
+    StoredProcLineageReport,
+    StoredProcLineageTracker,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -130,6 +135,7 @@ class SnowflakeQueriesExtractorReport(Report):
     aggregator_generate_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     sql_aggregator: Optional[SqlAggregatorReport] = None
+    stored_proc_lineage: Optional[StoredProcLineageReport] = None
 
     num_ddl_queries_dropped: int = 0
     num_stream_queries_observed: int = 0
@@ -261,6 +267,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 TableRename,
                 TableSwap,
                 ObservedQuery,
+                StoredProcCall,
             ]
         ] = self._exit_stack.enter_context(FileBackedList(shared_connection))
 
@@ -277,12 +284,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             for entry in self.fetch_query_log(users):
                 queries.append(entry)
 
+        stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
+            StoredProcLineageTracker(
+                platform=self.identifiers.platform,
+                shared_connection=shared_connection,
+            )
+        )
+        self.report.stored_proc_lineage = stored_proc_tracker.report
+
         with self.report.audit_log_load_timer:
             for i, query in enumerate(queries):
                 if i % 1000 == 0:
                     logger.info(f"Added {i} query log entries to SQL aggregator")
 
-                self.aggregator.add(query)
+                if isinstance(query, StoredProcCall):
+                    stored_proc_tracker.add_stored_proc_call(query)
+                    continue
+
+                if not (
+                    isinstance(query, PreparsedQuery)
+                    and stored_proc_tracker.add_related_query(query)
+                ):
+                    # Only add to aggregator if it's not part of a stored procedure.
+                    self.aggregator.add(query)
+
+            # Generate and add stored procedure lineage entries.
+            for lineage_entry in stored_proc_tracker.build_merged_lineage_entries():
+                # TODO: Make this the lowest priority lineage - so that it doesn't override other lineage entries.
+                self.aggregator.add(lineage_entry)
 
         with self.report.aggregator_generate_timer:
             yield from auto_workunit(self.aggregator.gen_metadata())
@@ -342,7 +371,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[
+    ) -> Iterable[
+        Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
+    ]:
         query_log_query = _build_enriched_query_log_query(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
@@ -382,7 +413,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
 
     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[
+    ) -> Optional[
+        Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery, StoredProcCall]
+    ]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -482,6 +515,17 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 extra_info=extra_info,
             )
 
+        if snowflake_query_type == "CALL" and res["root_query_id"] is None:
+            return StoredProcCall(
+                # This is the top-level query ID that other entries will reference.
+                snowflake_root_query_id=res["query_id"],
+                query_text=query_text,
+                timestamp=timestamp,
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
+            )
+
         upstreams = []
         column_usage = {}
 
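For context on the classification above: a top-level stored-procedure invocation appears in the audit log as a `CALL` query with no `root_query_id`, while every statement the procedure runs internally carries that call's query ID as its `root_query_id`. The sketch below uses hypothetical row dicts and plain Python just to illustrate the grouping rule (the real code returns `StoredProcCall` / `PreparsedQuery` objects, not strings):

```python
# Hypothetical audit-log rows; only the two fields the new check reads are shown.
rows = [
    {"query_id": "q1", "query_type": "CALL", "root_query_id": None},    # top-level CALL
    {"query_id": "q2", "query_type": "INSERT", "root_query_id": "q1"},  # runs inside q1
    {"query_id": "q3", "query_type": "SELECT", "root_query_id": None},  # standalone query
]


def classify(row: dict) -> str:
    if row["query_type"] == "CALL" and row["root_query_id"] is None:
        return "stored procedure call"
    if row["root_query_id"] is not None:
        return f"child of {row['root_query_id']}"
    return "standalone query"


print([classify(r) for r in rows])
# ['stored procedure call', 'child of q1', 'standalone query']
```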
datahub/ingestion/source/snowflake/snowflake_usage_v2.py

@@ -231,7 +231,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
 
         with self.report.usage_aggregation.result_fetch_timer as fetch_timer:
             for row in results:
-                with
+                with (
+                    fetch_timer.pause(),
+                    self.report.usage_aggregation.result_skip_timer as skip_timer,
+                ):
                     if results.rownumber is not None and results.rownumber % 1000 == 0:
                         logger.debug(f"Processing usage row number {results.rownumber}")
                         logger.debug(self.report.usage_aggregation.as_string())
@@ -255,7 +258,10 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
                             f"Skipping usage for {row['OBJECT_DOMAIN']} {dataset_identifier}, as table is not accessible."
                         )
                         continue
-                    with
+                    with (
+                        skip_timer.pause(),
+                        self.report.usage_aggregation.result_map_timer as map_timer,
+                    ):
                         wu = self.build_usage_statistics_for_dataset(
                             dataset_identifier, row
                         )
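The two changes above wrap the skip and map timers in parenthesized multi-item `with` blocks and pause the enclosing fetch timer while they run, so overlapping phases are not double-counted. A self-contained toy sketch of that pause-while-nested pattern (illustrative only; this is not DataHub's actual `PerfTimer` implementation):

```python
import time
from contextlib import contextmanager
from typing import Optional


class ToyTimer:
    """Minimal stand-in for a perf timer that supports pausing (illustrative only)."""

    def __init__(self) -> None:
        self.elapsed = 0.0
        self._start: Optional[float] = None

    def __enter__(self) -> "ToyTimer":
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc) -> None:
        assert self._start is not None
        self.elapsed += time.perf_counter() - self._start
        self._start = None

    @contextmanager
    def pause(self):
        # Stop accumulating while the nested block runs, then resume afterwards.
        assert self._start is not None
        self.elapsed += time.perf_counter() - self._start
        try:
            yield
        finally:
            self._start = time.perf_counter()


fetch_timer = ToyTimer()
skip_timer = ToyTimer()

with fetch_timer:
    for _ in range(3):
        with (
            fetch_timer.pause(),
            skip_timer,
        ):
            time.sleep(0.01)  # work attributed to skip_timer, not fetch_timer
```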
datahub/ingestion/source/snowflake/stored_proc_lineage.py (new file)

@@ -0,0 +1,143 @@
+import dataclasses
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Iterable, List, Optional
+
+from datahub.ingestion.api.closeable import Closeable
+from datahub.metadata.urns import CorpUserUrn
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    PreparsedQuery,
+    UrnStr,
+)
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
+from datahub.utilities.file_backed_collections import FileBackedDict
+
+
+@dataclasses.dataclass
+class StoredProcCall:
+    snowflake_root_query_id: str
+
+    # Query text will typically be something like:
+    # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
+    query_text: str
+
+    timestamp: datetime
+    user: CorpUserUrn
+    default_db: str
+    default_schema: str
+
+
+@dataclass
+class StoredProcExecutionLineage:
+    call: StoredProcCall
+
+    inputs: List[UrnStr]
+    outputs: List[UrnStr]
+
+
+@dataclass
+class StoredProcLineageReport:
+    num_stored_proc_calls: int = 0
+    num_related_queries: int = 0
+    num_related_queries_without_proc_call: int = 0
+
+    # Incremented at generation/build time.
+    num_stored_proc_lineage_entries: int = 0
+    num_stored_proc_calls_with_no_inputs: int = 0
+    num_stored_proc_calls_with_no_outputs: int = 0
+
+
+class StoredProcLineageTracker(Closeable):
+    """
+    Tracks table-level lineage for Snowflake stored procedures.
+
+    Stored procedures in Snowflake trigger multiple SQL queries during execution.
+    Snowflake assigns each stored procedure call a unique query_id and uses this as the
+    root_query_id for all subsequent queries executed within that procedure. This allows
+    us to trace which queries belong to a specific stored procedure execution and build
+    table-level lineage by aggregating inputs/outputs from all related queries.
+    """
+
+    def __init__(self, platform: str, shared_connection: Optional[Any] = None):
+        self.platform = platform
+        self.report = StoredProcLineageReport()
+
+        # { root_query_id -> StoredProcExecutionLineage }
+        self._stored_proc_execution_lineage: FileBackedDict[
+            StoredProcExecutionLineage
+        ] = FileBackedDict(shared_connection)
+
+    def add_stored_proc_call(self, call: StoredProcCall) -> None:
+        """Add a stored procedure call to track."""
+        self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
+            StoredProcExecutionLineage(
+                call=call,
+                # Will be populated by subsequent queries.
+                inputs=[],
+                outputs=[],
+            )
+        )
+        self.report.num_stored_proc_calls += 1
+
+    def add_related_query(self, query: PreparsedQuery) -> bool:
+        """Add a query that might be related to a stored procedure execution.
+
+        Returns True if the query was added to a stored procedure execution, False otherwise.
+        """
+        snowflake_root_query_id = (query.extra_info or {}).get(
+            "snowflake_root_query_id"
+        )
+
+        if snowflake_root_query_id:
+            if snowflake_root_query_id not in self._stored_proc_execution_lineage:
+                self.report.num_related_queries_without_proc_call += 1
+                return False
+
+            stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
+                snowflake_root_query_id
+            )
+            stored_proc_execution.inputs.extend(query.upstreams)
+            if query.downstream is not None:
+                stored_proc_execution.outputs.append(query.downstream)
+            self.report.num_related_queries += 1
+            return True
+
+        return False
+
+    def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
+        # For stored procedures, we can only get table-level lineage from the audit log.
+        # We represent these as PreparsedQuery objects for now. Eventually we'll want to
+        # create dataJobInputOutput lineage instead.
+
+        for stored_proc_execution in self._stored_proc_execution_lineage.values():
+            if not stored_proc_execution.inputs:
+                self.report.num_stored_proc_calls_with_no_inputs += 1
+                continue
+
+            if not stored_proc_execution.outputs:
+                self.report.num_stored_proc_calls_with_no_outputs += 1
+                # Still continue to generate lineage for cases where we have inputs but no outputs
+
+            for downstream in stored_proc_execution.outputs:
+                stored_proc_query_id = get_query_fingerprint(
+                    stored_proc_execution.call.query_text,
+                    self.platform,
+                    fast=True,
+                    secondary_id=downstream,
+                )
+
+                lineage_entry = PreparsedQuery(
+                    query_id=stored_proc_query_id,
+                    query_text=stored_proc_execution.call.query_text,
+                    upstreams=stored_proc_execution.inputs,
+                    downstream=downstream,
+                    query_count=0,
+                    user=stored_proc_execution.call.user,
+                    timestamp=stored_proc_execution.call.timestamp,
+                )
+
+                self.report.num_stored_proc_lineage_entries += 1
+                yield lineage_entry
+
+    def close(self) -> None:
+        self._stored_proc_execution_lineage.close()
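A minimal sketch of how these pieces fit together, with hypothetical query IDs and dataset URNs. It assumes `PreparsedQuery` accepts `upstreams`, `downstream`, and `extra_info` as constructor fields, which is how the tracker reads them above:

```python
from datetime import datetime, timezone

from datahub.ingestion.source.snowflake.stored_proc_lineage import (
    StoredProcCall,
    StoredProcLineageTracker,
)
from datahub.metadata.urns import CorpUserUrn
from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery

call = StoredProcCall(
    snowflake_root_query_id="01ab-root",  # hypothetical Snowflake query ID
    query_text="CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();",
    timestamp=datetime(2025, 1, 1, tzinfo=timezone.utc),
    user=CorpUserUrn("etl_user"),
    default_db="SALES",
    default_schema="PUBLIC",
)

with StoredProcLineageTracker(platform="snowflake") as tracker:
    tracker.add_stored_proc_call(call)

    # A statement run inside the procedure carries the call's ID as its
    # snowflake_root_query_id, so its inputs/outputs are folded into the
    # procedure's lineage instead of being emitted as a separate query.
    inner = PreparsedQuery(
        query_id="01ab-child",
        query_text="INSERT INTO forecast SELECT * FROM orders",
        upstreams=[
            "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.public.orders,PROD)"
        ],
        downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.public.forecast,PROD)",
        extra_info={"snowflake_root_query_id": "01ab-root"},
    )
    assert tracker.add_related_query(inner)

    # One merged PreparsedQuery is produced per downstream table, attributed
    # to the CALL text and fingerprinted per (query_text, downstream).
    merged = list(tracker.build_merged_lineage_entries())
    assert len(merged) == 1 and merged[0].downstream == inner.downstream
```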
datahub/ingestion/source/sql/athena.py

@@ -34,6 +34,9 @@ from datahub.ingestion.source.common.subtypes import (
     SourceCapabilityModifier,
 )
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.athena_properties_extractor import (
+    AthenaPropertiesExtractor,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
@@ -47,12 +50,17 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import (
+    ArrayTypeClass,
+    MapTypeClass,
+    RecordTypeClass,
+)
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
 from datahub.utilities.sqlalchemy_type_converter import (
     MapType,
     get_schema_fields_for_sqlalchemy_column,
 )
+from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
 
 try:
     from typing_extensions import override
@@ -284,6 +292,11 @@ class AthenaConfig(SQLCommonConfig):
         description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
     )
 
+    extract_partitions_using_create_statements: bool = pydantic.Field(
+        default=False,
+        description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
+    )
+
     _s3_staging_dir_population = pydantic_renamed_field(
         old_name="s3_staging_dir",
         new_name="query_result_location",
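To opt into the new behavior, the flag goes under the Athena source config in an ingestion recipe. A hypothetical fragment, shown here as a Python dict; the surrounding field names follow the existing Athena config and the values are illustrative:

```python
# Hypothetical ingestion-recipe fragment for the Athena source.
athena_source = {
    "type": "athena",
    "config": {
        "aws_region": "us-east-1",
        "work_group": "primary",
        "query_result_location": "s3://example-athena-results/",
        # New in this release; defaults to False. Needed for Iceberg partition
        # columns; falls back to the $partitions-based extraction if parsing fails.
        "extract_partitions_using_create_statements": True,
    },
}
```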
@@ -496,23 +509,38 @@ class AthenaSource(SQLAlchemySource):
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
     ) -> Optional[List[str]]:
-        if
+        if (
+            not self.config.extract_partitions
+            and not self.config.extract_partitions_using_create_statements
+        ):
             return None
 
         if not self.cursor:
             return None
 
-        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
-            table_name=table, schema_name=schema
-        )
+        if self.config.extract_partitions_using_create_statements:
+            try:
+                partitions = self._get_partitions_create_table(schema, table)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
+                    exc_info=True,
+                )
+
+                # If we can't get create table statement, we fall back to SQLAlchemy
+                partitions = self._get_partitions_sqlalchemy(schema, table)
+        else:
+            partitions = self._get_partitions_sqlalchemy(schema, table)
 
-        partitions = []
-        for key in metadata.partition_keys:
-            if key.name:
-                partitions.append(key.name)
         if not partitions:
             return []
 
+        if (
+            not self.config.profiling.enabled
+            or not self.config.profiling.partition_profiling_enabled
+        ):
+            return partitions
+
         with self.report.report_exc(
             message="Failed to extract partition details",
             context=f"{schema}.{table}",
@@ -538,6 +566,56 @@ class AthenaSource(SQLAlchemySource):
 
         return partitions
 
+    def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        try:
+            res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
+        except Exception as e:
+            # Athena does not support SHOW CREATE TABLE for views
+            # and will throw an error. We need to handle this case
+            # and caller needs to fallback to sqlalchemy's get partitions call.
+            logger.debug(
+                f"Failed to get table properties for {schema}.{table}: {e}",
+                exc_info=True,
+            )
+            raise e
+        rows = res.fetchall()
+
+        # Concatenate all rows into a single string with newlines
+        create_table_statement = "\n".join(row[0] for row in rows)
+
+        try:
+            athena_table_info = AthenaPropertiesExtractor.get_table_properties(
+                create_table_statement
+            )
+        except Exception as e:
+            logger.debug(
+                f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
+                exc_info=True,
+            )
+            raise e
+
+        partitions = []
+        if (
+            athena_table_info.partition_info
+            and athena_table_info.partition_info.simple_columns
+        ):
+            partitions = [
+                ci.name for ci in athena_table_info.partition_info.simple_columns
+            ]
+        return partitions
+
+    def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
+            table_name=table, schema_name=schema
+        )
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        return partitions
+
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
         self,
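For a sense of what `_get_partitions_create_table` hands to the new extractor, here is a hypothetical `SHOW CREATE TABLE` result fed through `AthenaPropertiesExtractor.get_table_properties`. This is a sketch only: the DDL is invented, and nothing beyond the `partition_info.simple_columns` shape used above is assumed about the parsed result:

```python
from datahub.ingestion.source.sql.athena_properties_extractor import (
    AthenaPropertiesExtractor,
)

# Hypothetical DDL as Athena might return it from SHOW CREATE TABLE.
create_table_statement = """\
CREATE TABLE `sales`.`orders` (
  `order_id` bigint,
  `region` string,
  `order_date` date)
PARTITIONED BY (`region`, `order_date`)
LOCATION 's3://example-bucket/orders/'
"""

info = AthenaPropertiesExtractor.get_table_properties(create_table_statement)
if info.partition_info and info.partition_info.simple_columns:
    # Same extraction the source performs: plain (non-transform) partition columns.
    print([c.name for c in info.partition_info.simple_columns])
```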
@@ -563,7 +641,14 @@ class AthenaSource(SQLAlchemySource):
                 partition_keys is not None and column["name"] in partition_keys
             ),
         )
-
+        if isinstance(
+            fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
+        ):
+            return fields
+        else:
+            fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
+                fields[0].fieldPath
+            )
         return fields
 
     def generate_partition_profiler_query(
|