acryl-datahub 1.1.0.5rc4__py3-none-any.whl → 1.1.0.5rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.0.5rc4.dist-info → acryl_datahub-1.1.0.5rc6.dist-info}/METADATA +2495 -2495
- {acryl_datahub-1.1.0.5rc4.dist-info → acryl_datahub-1.1.0.5rc6.dist-info}/RECORD +25 -24
- datahub/_version.py +1 -1
- datahub/cli/check_cli.py +21 -4
- datahub/ingestion/api/decorators.py +14 -3
- datahub/ingestion/graph/client.py +71 -28
- datahub/ingestion/source/aws/glue.py +1 -1
- datahub/ingestion/source/bigquery_v2/queries.py +2 -2
- datahub/ingestion/source/common/subtypes.py +41 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +348 -112
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sql/athena.py +110 -13
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/mssql/source.py +9 -0
- datahub/ingestion/source/sql/sql_common.py +3 -0
- datahub/ingestion/source/sql/teradata.py +4 -1
- datahub/ingestion/source/sql/vertica.py +8 -1
- datahub/ingestion/source/tableau/tableau.py +6 -1
- {acryl_datahub-1.1.0.5rc4.dist-info → acryl_datahub-1.1.0.5rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.0.5rc4.dist-info → acryl_datahub-1.1.0.5rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.0.5rc4.dist-info → acryl_datahub-1.1.0.5rc6.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.0.5rc4.dist-info → acryl_datahub-1.1.0.5rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/athena.py

@@ -29,8 +29,14 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
-from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.athena_properties_extractor import (
+    AthenaPropertiesExtractor,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
@@ -44,12 +50,17 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
-from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
+from datahub.metadata.schema_classes import (
+    ArrayTypeClass,
+    MapTypeClass,
+    RecordTypeClass,
+)
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
 from datahub.utilities.sqlalchemy_type_converter import (
     MapType,
     get_schema_fields_for_sqlalchemy_column,
 )
+from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
 
 try:
     from typing_extensions import override
@@ -281,6 +292,11 @@ class AthenaConfig(SQLCommonConfig):
         description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
     )
 
+    extract_partitions_using_create_statements: bool = pydantic.Field(
+        default=False,
+        description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
+    )
+
     _s3_staging_dir_population = pydantic_renamed_field(
         old_name="s3_staging_dir",
         new_name="query_result_location",
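A minimal sketch of how the new flag could be enabled through DataHub's programmatic Pipeline API; the region, work group, result location, and console sink below are placeholder values for illustration, not part of this diff:

```python
# Hypothetical recipe enabling the experimental create-statement-based
# partition extraction added in this release. All connection values are
# placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "athena",
            "config": {
                "aws_region": "us-east-1",  # placeholder
                "work_group": "primary",  # placeholder
                "query_result_location": "s3://my-bucket/athena-results/",  # placeholder
                "extract_partitions_using_create_statements": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
```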
@@ -321,9 +337,18 @@ class Partitionitem:
 @capability(
     SourceCapability.DATA_PROFILING,
     "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Supported for S3 tables",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Supported for S3 tables",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
 )
-@capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
-@capability(SourceCapability.LINEAGE_FINE, "Supported for S3 tables")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
     """
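The new subtype_modifier argument presumably comes from the datahub/ingestion/api/decorators.py change listed above (+14 -3). As a generic illustration only, not DataHub's actual implementation, a stacking class decorator like @capability can accumulate per-capability metadata on the source class:

```python
# Toy sketch of a stacking capability decorator. All names here (Capability,
# capability, supported_capabilities) are invented for illustration and do
# not mirror DataHub's real decorator internals.
from dataclasses import dataclass
from typing import List, Optional, Type


@dataclass
class Capability:
    name: str
    description: str
    subtype_modifier: Optional[List[str]] = None


def capability(
    name: str, description: str, subtype_modifier: Optional[List[str]] = None
):
    def wrapper(cls: Type) -> Type:
        # Copy the list so subclasses don't mutate the parent's registry.
        caps = list(getattr(cls, "supported_capabilities", []))
        caps.append(Capability(name, description, subtype_modifier))
        cls.supported_capabilities = caps
        return cls

    return wrapper


@capability("LINEAGE_COARSE", "Supported for S3 tables", subtype_modifier=["TABLE"])
class MySource:
    pass


print(MySource.supported_capabilities)
```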
@@ -484,23 +509,38 @@ class AthenaSource(SQLAlchemySource):
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
     ) -> Optional[List[str]]:
-        if not self.config.extract_partitions:
+        if (
+            not self.config.extract_partitions
+            and not self.config.extract_partitions_using_create_statements
+        ):
             return None
 
         if not self.cursor:
             return None
 
-        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
-            table_name=table, schema_name=schema
-        )
+        if self.config.extract_partitions_using_create_statements:
+            try:
+                partitions = self._get_partitions_create_table(schema, table)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
+                    exc_info=True,
+                )
+
+                # If we can't get create table statement, we fall back to SQLAlchemy
+                partitions = self._get_partitions_sqlalchemy(schema, table)
+        else:
+            partitions = self._get_partitions_sqlalchemy(schema, table)
 
-        partitions = []
-        for key in metadata.partition_keys:
-            if key.name:
-                partitions.append(key.name)
         if not partitions:
             return []
 
+        if (
+            not self.config.profiling.enabled
+            or not self.config.profiling.partition_profiling_enabled
+        ):
+            return partitions
+
         with self.report.report_exc(
             message="Failed to extract partition details",
             context=f"{schema}.{table}",
@@ -526,6 +566,56 @@ class AthenaSource(SQLAlchemySource):
 
         return partitions
 
+    def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        try:
+            res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
+        except Exception as e:
+            # Athena does not support SHOW CREATE TABLE for views
+            # and will throw an error. We need to handle this case
+            # and caller needs to fallback to sqlalchemy's get partitions call.
+            logger.debug(
+                f"Failed to get table properties for {schema}.{table}: {e}",
+                exc_info=True,
+            )
+            raise e
+        rows = res.fetchall()
+
+        # Concatenate all rows into a single string with newlines
+        create_table_statement = "\n".join(row[0] for row in rows)
+
+        try:
+            athena_table_info = AthenaPropertiesExtractor.get_table_properties(
+                create_table_statement
+            )
+        except Exception as e:
+            logger.debug(
+                f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
+                exc_info=True,
+            )
+            raise e
+
+        partitions = []
+        if (
+            athena_table_info.partition_info
+            and athena_table_info.partition_info.simple_columns
+        ):
+            partitions = [
+                ci.name for ci in athena_table_info.partition_info.simple_columns
+            ]
+        return partitions
+
+    def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
+            table_name=table, schema_name=schema
+        )
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        return partitions
+
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
         self,
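AthenaPropertiesExtractor itself is a new 777-line module (see the file list above) whose internals are not shown in this section. As a rough, hypothetical stand-in for the part used by _get_partitions_create_table, the sketch below pulls plain partition column names out of a PARTITIONED BY clause; the real extractor presumably also handles Iceberg transform expressions and other DDL details:

```python
import re
from typing import List


# Hypothetical, simplified stand-in for AthenaPropertiesExtractor: it only
# extracts bare column names from a PARTITIONED BY clause and skips
# transform expressions such as day(`ts`) or bucket(16, `id`).
def extract_simple_partition_columns(create_table_statement: str) -> List[str]:
    match = re.search(
        r"PARTITIONED BY\s*\((.*?)\)",
        create_table_statement,
        re.IGNORECASE | re.DOTALL,
    )
    if not match:
        return []
    columns = []
    for part in match.group(1).split(","):
        part = part.strip()
        # Keep bare identifiers (optionally followed by a Hive type);
        # skip anything that looks like a function call.
        if part and "(" not in part:
            columns.append(part.split()[0].strip("`"))
    return columns


ddl = """CREATE TABLE `db`.`events` (
  `id` string,
  `year` string)
PARTITIONED BY (
  `year`)
LOCATION 's3://bucket/events/'"""

print(extract_simple_partition_columns(ddl))  # ['year']
```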
@@ -551,7 +641,14 @@
                 partition_keys is not None and column["name"] in partition_keys
             ),
         )
-
+        if isinstance(
+            fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
+        ):
+            return fields
+        else:
+            fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
+                fields[0].fieldPath
+            )
         return fields
 
     def generate_partition_profiler_query(