acryl-datahub 1.1.0.5rc4__py3-none-any.whl → 1.1.0.5rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.

@@ -29,8 +29,14 @@ from datahub.ingestion.api.decorators import (
  from datahub.ingestion.api.source import StructuredLogLevel
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws.s3_util import make_s3_urn
- from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
+ from datahub.ingestion.source.common.subtypes import (
+     DatasetContainerSubTypes,
+     SourceCapabilityModifier,
+ )
  from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+ from datahub.ingestion.source.sql.athena_properties_extractor import (
+     AthenaPropertiesExtractor,
+ )
  from datahub.ingestion.source.sql.sql_common import (
      SQLAlchemySource,
      register_custom_type,
@@ -44,12 +50,17 @@ from datahub.ingestion.source.sql.sql_utils import (
  )
  from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
- from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
+ from datahub.metadata.schema_classes import (
+     ArrayTypeClass,
+     MapTypeClass,
+     RecordTypeClass,
+ )
  from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
  from datahub.utilities.sqlalchemy_type_converter import (
      MapType,
      get_schema_fields_for_sqlalchemy_column,
  )
+ from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

  try:
      from typing_extensions import override
@@ -281,6 +292,11 @@ class AthenaConfig(SQLCommonConfig):
          description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
      )

+     extract_partitions_using_create_statements: bool = pydantic.Field(
+         default=False,
+         description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
+     )
+
      _s3_staging_dir_population = pydantic_renamed_field(
          old_name="s3_staging_dir",
          new_name="query_result_location",
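For readers wanting to try the new flag, a minimal, hypothetical instantiation follows. Only `extract_partitions_using_create_statements` is new in this release; `aws_region`, `work_group`, and `query_result_location` are assumed from the existing AthenaConfig, and all values are placeholders.

```python
from datahub.ingestion.source.sql.athena import AthenaConfig

config = AthenaConfig.parse_obj(
    {
        "aws_region": "us-east-1",  # assumed existing field; placeholder value
        "work_group": "primary",  # assumed existing field; placeholder value
        "query_result_location": "s3://my-bucket/athena-results/",
        # New in this release: opt in to SHOW CREATE TABLE based partition
        # extraction (needed for Iceberg partitions; experimental).
        "extract_partitions_using_create_statements": True,
    }
)
```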
@@ -321,9 +337,18 @@ class Partitionitem:
  @capability(
      SourceCapability.DATA_PROFILING,
      "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
+     subtype_modifier=[SourceCapabilityModifier.TABLE],
+ )
+ @capability(
+     SourceCapability.LINEAGE_COARSE,
+     "Supported for S3 tables",
+     subtype_modifier=[SourceCapabilityModifier.TABLE],
+ )
+ @capability(
+     SourceCapability.LINEAGE_FINE,
+     "Supported for S3 tables",
+     subtype_modifier=[SourceCapabilityModifier.TABLE],
  )
- @capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
- @capability(SourceCapability.LINEAGE_FINE, "Supported for S3 tables")
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
  class AthenaSource(SQLAlchemySource):
      """
@@ -484,23 +509,38 @@ class AthenaSource(SQLAlchemySource):
      def get_partitions(
          self, inspector: Inspector, schema: str, table: str
      ) -> Optional[List[str]]:
-         if not self.config.extract_partitions:
+         if (
+             not self.config.extract_partitions
+             and not self.config.extract_partitions_using_create_statements
+         ):
              return None

          if not self.cursor:
              return None

-         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
-             table_name=table, schema_name=schema
-         )
+         if self.config.extract_partitions_using_create_statements:
+             try:
+                 partitions = self._get_partitions_create_table(schema, table)
+             except Exception as e:
+                 logger.warning(
+                     f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
+                     exc_info=True,
+                 )
+
+                 # If we can't get create table statement, we fall back to SQLAlchemy
+                 partitions = self._get_partitions_sqlalchemy(schema, table)
+         else:
+             partitions = self._get_partitions_sqlalchemy(schema, table)

-         partitions = []
-         for key in metadata.partition_keys:
-             if key.name:
-                 partitions.append(key.name)
          if not partitions:
              return []

+         if (
+             not self.config.profiling.enabled
+             or not self.config.profiling.partition_profiling_enabled
+         ):
+             return partitions
+
          with self.report.report_exc(
              message="Failed to extract partition details",
              context=f"{schema}.{table}",
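The new control flow boils down to a small pattern: prefer the create-statement parser when the flag is set, fall back to the SQLAlchemy metadata path on any failure, and skip the expensive partition-detail query unless partition profiling needs it. A condensed, standalone sketch of that flow (the callables stand in for the methods in the diff; everything else is illustrative):

```python
from typing import Callable, List, Optional

def resolve_partitions(
    use_create_statements: bool,
    profiling_enabled: bool,
    partition_profiling_enabled: bool,
    from_create_table: Callable[[], List[str]],  # cf. _get_partitions_create_table
    from_sqlalchemy: Callable[[], List[str]],    # cf. _get_partitions_sqlalchemy
    fetch_details: Callable[[List[str]], None],  # the expensive max-partition query
) -> Optional[List[str]]:
    if use_create_statements:
        try:
            partitions = from_create_table()
        except Exception:
            # Any parse or permission failure falls back to the metadata API.
            partitions = from_sqlalchemy()
    else:
        partitions = from_sqlalchemy()

    if not partitions:
        return []
    # Only pay for the costly lookup when partition profiling will use it.
    if profiling_enabled and partition_profiling_enabled:
        fetch_details(partitions)
    return partitions
```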
@@ -526,6 +566,56 @@ class AthenaSource(SQLAlchemySource):

          return partitions

+     def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
+         assert self.cursor
+         try:
+             res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
+         except Exception as e:
+             # Athena does not support SHOW CREATE TABLE for views
+             # and will throw an error. We need to handle this case
+             # and caller needs to fallback to sqlalchemy's get partitions call.
+             logger.debug(
+                 f"Failed to get table properties for {schema}.{table}: {e}",
+                 exc_info=True,
+             )
+             raise e
+         rows = res.fetchall()
+
+         # Concatenate all rows into a single string with newlines
+         create_table_statement = "\n".join(row[0] for row in rows)
+
+         try:
+             athena_table_info = AthenaPropertiesExtractor.get_table_properties(
+                 create_table_statement
+             )
+         except Exception as e:
+             logger.debug(
+                 f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
+                 exc_info=True,
+             )
+             raise e
+
+         partitions = []
+         if (
+             athena_table_info.partition_info
+             and athena_table_info.partition_info.simple_columns
+         ):
+             partitions = [
+                 ci.name for ci in athena_table_info.partition_info.simple_columns
+             ]
+         return partitions
+
+     def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
+         assert self.cursor
+         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
+             table_name=table, schema_name=schema
+         )
+         partitions = []
+         for key in metadata.partition_keys:
+             if key.name:
+                 partitions.append(key.name)
+         return partitions
+
      # Overwrite to modify the creation of schema fields
      def get_schema_fields_for_column(
          self,
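To make the new helper concrete: it feeds the raw `SHOW CREATE TABLE` output into `AthenaPropertiesExtractor` and reads partition columns off the result. A sketch with a made-up DDL string; the module path and attribute access mirror the code above, but the exact parse output is an assumption.

```python
from datahub.ingestion.source.sql.athena_properties_extractor import (
    AthenaPropertiesExtractor,
)

# Illustrative DDL only -- not taken from any real table.
ddl = """\
CREATE TABLE `sales`.`orders` (
  order_id string,
  amount double)
PARTITIONED BY (ds string, region string)
LOCATION 's3://my-bucket/orders/'
"""

info = AthenaPropertiesExtractor.get_table_properties(ddl)
if info.partition_info and info.partition_info.simple_columns:
    # Expected to yield the partition column names, e.g. ["ds", "region"]
    print([c.name for c in info.partition_info.simple_columns])
```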
@@ -551,7 +641,14 @@ class AthenaSource(SQLAlchemySource):
                  partition_keys is not None and column["name"] in partition_keys
              ),
          )
-
+         if isinstance(
+             fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
+         ):
+             return fields
+         else:
+             fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
+                 fields[0].fieldPath
+             )
          return fields

      def generate_partition_profiler_query(
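Finally, the fieldPath rewrite: for flat columns (anything that is not a record, map, or array type) the v2 field path is collapsed to the plain column name via `get_simple_field_path_from_v2_field_path`. A small sketch; the v2 path string is a representative example, not taken from the diff.

```python
from datahub.utilities.urns.field_paths import (
    get_simple_field_path_from_v2_field_path,
)

v2_path = "[version=2.0].[type=string].order_id"  # representative v2 field path
print(get_simple_field_path_from_v2_field_path(v2_path))  # -> "order_id"
```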